markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,760 @@
|
|
|
1
|
+
"""Core document conversion logic.
|
|
2
|
+
|
|
3
|
+
This module provides the unified core conversion flow shared between
|
|
4
|
+
single-file and batch processing modes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import re
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING, Any
|
|
15
|
+
|
|
16
|
+
from loguru import logger
|
|
17
|
+
|
|
18
|
+
from markitai.constants import IMAGE_EXTENSIONS
|
|
19
|
+
from markitai.converter.base import FileFormat, detect_format, get_converter
|
|
20
|
+
from markitai.image import ImageProcessor
|
|
21
|
+
from markitai.security import (
|
|
22
|
+
atomic_write_text,
|
|
23
|
+
check_symlink_safety,
|
|
24
|
+
escape_glob_pattern,
|
|
25
|
+
validate_file_size,
|
|
26
|
+
)
|
|
27
|
+
from markitai.utils.paths import ensure_dir
|
|
28
|
+
from markitai.workflow.helpers import add_basic_frontmatter, merge_llm_usage
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from markitai.config import MarkitaiConfig
|
|
32
|
+
from markitai.converter.base import ConvertResult
|
|
33
|
+
from markitai.llm import LLMProcessor
|
|
34
|
+
from markitai.workflow.single import ImageAnalysisResult
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ConversionContext:
    """Context for a single document conversion.

    Bundles the caller-supplied inputs, mutable intermediate state, and
    LLM accounting for one conversion run. Instances are mutated in place
    by the pipeline steps in this module.
    """

    # --- Required inputs ---
    input_path: Path
    output_dir: Path
    config: MarkitaiConfig

    # --- Optional inputs ---
    actual_file: Path | None = None  # For pre-converted files (batch COM)
    shared_processor: LLMProcessor | None = None
    project_dir: Path | None = None

    # --- Processing flags ---
    use_multiprocess_images: bool = False

    # --- Intermediate state (populated during processing) ---
    converter: Any = None
    conversion_result: ConvertResult | None = None
    output_file: Path | None = None
    embedded_images_count: int = 0
    screenshots_count: int = 0

    # --- LLM tracking ---
    llm_cost: float = 0.0
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)
    image_analysis: ImageAnalysisResult | None = None

    # --- Additional tracking (for caller use) ---
    duration: float = 0.0
    cache_hit: bool = False
    input_base_dir: Path | None = None  # For batch relative path calculation

    # Optional callback invoked with (stage_name, duration) per stage.
    on_stage_complete: Callable[[str, float], None] | None = None

    @property
    def effective_input(self) -> Path:
        """Return the actual file to process (handles pre-conversion)."""
        if self.actual_file:
            return self.actual_file
        return self.input_path

    @property
    def is_preconverted(self) -> bool:
        """Whether a distinct pre-converted file is being processed."""
        if self.actual_file is None:
            return False
        return self.actual_file != self.input_path
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@dataclass
class ConversionStepResult:
    """Outcome of a single pipeline step."""

    # True when the step finished without a fatal error.
    success: bool
    # Human-readable failure message, if any.
    error: str | None = None
    # Non-None (e.g. "exists") when the step decided the file is skipped.
    skip_reason: str | None = None
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class DocumentConversionError(Exception):
    """Error during document conversion.

    Root of the conversion-error hierarchy in this module.
    """
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class UnsupportedFormatError(DocumentConversionError):
    """Raised for an unsupported file format."""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class FileSizeError(DocumentConversionError):
    """Raised when a file's size exceeds the configured limit."""
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
async def run_in_converter_thread(func, *args, **kwargs):
    """Run a converter function in the shared thread pool.

    Delegates to the shared ThreadPoolExecutor in ``markitai.utils.executor``
    so each conversion does not spin up its own executor. The import is done
    lazily to keep module import time low.
    """
    from markitai.utils.executor import run_in_converter_thread as _delegate

    return await _delegate(func, *args, **kwargs)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_and_detect_format(
    ctx: ConversionContext, max_size: int
) -> ConversionStepResult:
    """Validate the input's size, detect its format, and pick a converter.

    On success the chosen converter is stored on ``ctx.converter``.

    Args:
        ctx: Conversion context.
        max_size: Maximum allowed file size in bytes.

    Returns:
        ConversionStepResult indicating success or failure.
    """
    try:
        validate_file_size(ctx.input_path, max_size)
    except ValueError as exc:
        return ConversionStepResult(success=False, error=str(exc))

    source = ctx.effective_input
    detected = detect_format(source)
    if detected == FileFormat.UNKNOWN:
        return ConversionStepResult(
            success=False,
            error=f"Unsupported file format: {ctx.input_path.suffix}",
        )

    ctx.converter = get_converter(source, config=ctx.config)
    if ctx.converter is not None:
        return ConversionStepResult(success=True)

    return ConversionStepResult(
        success=False,
        error=f"No converter available for format: {detected.value}",
    )
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def prepare_output_directory(ctx: ConversionContext) -> ConversionStepResult:
    """Create the output directory after a symlink-safety check.

    Args:
        ctx: Conversion context.

    Returns:
        ConversionStepResult indicating success or failure.
    """
    try:
        # Refuse (or allow, per config) symlinked output locations before
        # creating anything on disk.
        check_symlink_safety(
            ctx.output_dir, allow_symlinks=ctx.config.output.allow_symlinks
        )
        ensure_dir(ctx.output_dir)
    except Exception as exc:
        return ConversionStepResult(success=False, error=str(exc))
    return ConversionStepResult(success=True)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
async def convert_document(ctx: ConversionContext) -> ConversionStepResult:
    """Run the selected converter and store its result on the context.

    The converter is executed off the event loop in the shared converter
    thread pool; the result lands in ``ctx.conversion_result``.

    Args:
        ctx: Conversion context.

    Returns:
        ConversionStepResult indicating success or failure.
    """
    logger.info(f"Converting {ctx.input_path.name}...")
    try:
        ctx.conversion_result = await run_in_converter_thread(
            ctx.converter.convert,
            ctx.effective_input,
            output_dir=ctx.output_dir,
        )
    except Exception as exc:
        return ConversionStepResult(success=False, error=f"Conversion failed: {exc}")
    return ConversionStepResult(success=True)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def resolve_output_file(ctx: ConversionContext) -> ConversionStepResult:
    """Resolve the output file path, honoring the on-conflict policy.

    Args:
        ctx: Conversion context.

    Returns:
        ConversionStepResult; ``skip_reason="exists"`` when the policy says
        an already-existing output should be left alone.
    """
    from markitai.utils.output import resolve_output_path

    candidate = ctx.output_dir / f"{ctx.input_path.name}.md"
    ctx.output_file = resolve_output_path(candidate, ctx.config.output.on_conflict)

    if ctx.output_file is not None:
        return ConversionStepResult(success=True)

    logger.info(f"[SKIP] Output exists: {candidate}")
    return ConversionStepResult(success=True, skip_reason="exists")
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
async def process_embedded_images(ctx: ConversionContext) -> ConversionStepResult:
    """Extract and process embedded images from markdown.

    Decodes base64 images found in the converted markdown, saves them next
    to the output, and rewrites the markdown (and ``extracted_text``
    metadata, when present) to reference the saved files. Also records
    screenshot and embedded-image counts on the context.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None:
        return ConversionStepResult(success=False, error="No conversion result")

    image_processor = ImageProcessor(config=ctx.config.image)
    base64_images = image_processor.extract_base64_images(
        ctx.conversion_result.markdown
    )

    # Count screenshots from page images reported by the converter.
    page_images = ctx.conversion_result.metadata.get("page_images", [])
    ctx.screenshots_count = len(page_images)

    # Count embedded images from two sources:
    # 1. Base64 images in markdown (will be processed below)
    # 2. Images already extracted by converter (e.g., PDF converter saves directly to assets)
    converter_images = len(ctx.conversion_result.images)
    ctx.embedded_images_count = len(base64_images) + converter_images

    if base64_images:
        logger.info(f"Processing {len(base64_images)} embedded images...")

        # Use multiprocess for large batches if enabled; threshold is a
        # project-wide constant, imported lazily.
        from markitai.constants import DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD

        if (
            ctx.use_multiprocess_images
            and len(base64_images) > DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD
        ):
            image_result = await image_processor.process_and_save_multiprocess(
                base64_images,
                output_dir=ctx.output_dir,
                base_name=ctx.input_path.name,
            )
        else:
            image_result = image_processor.process_and_save(
                base64_images,
                output_dir=ctx.output_dir,
                base_name=ctx.input_path.name,
            )

        # Update markdown with image paths using index mapping for correct
        # replacement (some images may have been deduplicated or dropped).
        ctx.conversion_result.markdown = image_processor.replace_base64_with_paths(
            ctx.conversion_result.markdown,
            image_result.saved_images,
            index_mapping=image_result.index_mapping,
        )

        # Also update extracted_text in metadata if present (for PPTX+LLM mode)
        if "extracted_text" in ctx.conversion_result.metadata:
            ctx.conversion_result.metadata["extracted_text"] = (
                image_processor.replace_base64_with_paths(
                    ctx.conversion_result.metadata["extracted_text"],
                    image_result.saved_images,
                    index_mapping=image_result.index_mapping,
                )
            )

        # Refresh count: saved base64 images + converter-extracted images
        # (may differ from the pre-processing count if saves were skipped).
        ctx.embedded_images_count = len(image_result.saved_images) + converter_images

    return ConversionStepResult(success=True)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def write_base_markdown(ctx: ConversionContext) -> ConversionStepResult:
    """Write the base markdown file with basic frontmatter prepended.

    Args:
        ctx: Conversion context.

    Returns:
        ConversionStepResult indicating success or failure.
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(
            success=False, error="Missing conversion result or output file"
        )

    content = add_basic_frontmatter(
        ctx.conversion_result.markdown, ctx.input_path.name
    )
    # Atomic write so a crash never leaves a half-written .md behind.
    atomic_write_text(ctx.output_file, content)
    logger.info(f"Written output: {ctx.output_file}")
    return ConversionStepResult(success=True)
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def get_saved_images(ctx: ConversionContext) -> list[Path]:
    """List this file's saved images from the assets directory.

    Glob metacharacters in the input name are escaped so a name like
    ``report[1].pdf`` does not act as a pattern.

    Args:
        ctx: Conversion context.

    Returns:
        Image file paths (filtered by known image extensions).
    """
    assets_dir = ctx.output_dir / "assets"
    if not assets_dir.exists():
        return []

    pattern = f"{escape_glob_pattern(ctx.input_path.name)}*"
    return [
        candidate
        for candidate in assets_dir.glob(pattern)
        if candidate.suffix.lower() in IMAGE_EXTENSIONS
    ]
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def apply_alt_text_updates(
    llm_file: Path,
    image_analysis: Any,
) -> bool:
    """Apply alt text updates from image analysis to .llm.md file.

    This is called after document processing completes to update alt text
    in the .llm.md file with results from parallel image analysis. Each
    analysis record is expected to provide an ``asset`` path and an ``alt``
    string; matching markdown image references get their alt text replaced
    while the link target is preserved.

    Args:
        llm_file: Path to the .llm.md file
        image_analysis: ImageAnalysisResult with analyzed images

    Returns:
        True if updates were applied, False otherwise
    """
    if not llm_file.exists() or image_analysis is None:
        return False

    try:
        llm_content = llm_file.read_text(encoding="utf-8")
        updated = False

        for asset in image_analysis.assets:
            asset_path = Path(asset.get("asset", ""))
            alt_text = asset.get("alt", "")
            if not alt_text or not asset_path.name:
                continue

            # Match the full image reference but capture the link target so
            # only the alt text changes. (Previously the whole reference was
            # replaced with an empty string, deleting the image from the
            # document and discarding alt_text entirely.)
            old_pattern = rf"!\[[^\]]*\](\([^)]*{re.escape(asset_path.name)}\))"
            new_content = re.sub(
                old_pattern,
                # Callable replacement so backslashes/backrefs inside
                # alt_text are inserted literally, not interpreted by re.
                lambda m, alt=alt_text: f"![{alt}]{m.group(1)}",
                llm_content,
            )
            if new_content != llm_content:
                llm_content = new_content
                updated = True

        if updated:
            atomic_write_text(llm_file, llm_content)
            logger.debug(f"Applied alt text updates to {llm_file}")
            return True

    except Exception as e:
        # Best-effort: alt-text updates are cosmetic, never fail the run.
        logger.warning(f"Failed to apply alt text updates: {e}")

    return False
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
async def process_with_vision_llm(
    ctx: ConversionContext,
) -> ConversionStepResult:
    """Process document with Vision LLM (screenshot mode).

    Feeds the page screenshots plus the extracted text to a vision-capable
    model, then writes the enhanced result as a ``.llm.md`` sibling of the
    base output file. A no-op when the converter produced no page images.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=False, error="Missing conversion result")

    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    page_images = ctx.conversion_result.metadata.get("page_images", [])
    if not page_images:
        # Nothing to do in screenshot mode without page images.
        return ConversionStepResult(success=True)

    logger.info(f"[LLM] {ctx.input_path.name}: Starting Screenshot+LLM processing")

    # Use shared processor (shared semaphore/router/cache) or create a new one.
    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    # Get extracted text (use markdown which has base64 replaced)
    extracted_text = ctx.conversion_result.markdown

    # Enhance with vision; returns cleaned body, frontmatter, and cost/usage.
    (
        cleaned_content,
        frontmatter,
        enhance_cost,
        enhance_usage,
    ) = await workflow.enhance_with_vision(
        extracted_text,
        page_images,
        source=ctx.input_path.name,
    )
    ctx.llm_cost += enhance_cost
    merge_llm_usage(ctx.llm_usage, enhance_usage)

    # Build final content with page image references preserved as HTML
    # comments, sorted by page number, for later reference.
    commented_images_str = ""
    if page_images:
        commented_images = [
            f"<!-- ![Page {img['page']}](screenshots/{img['name']}) -->"
            for img in sorted(page_images, key=lambda x: x.get("page", 0))
        ]
        commented_images_str = "\n\n<!-- Page images for reference -->\n" + "\n".join(
            commented_images
        )

    ctx.conversion_result.markdown = cleaned_content + commented_images_str

    # Strip any hallucinated base64 images the model may have emitted.
    image_processor = ImageProcessor(config=ctx.config.image)
    ctx.conversion_result.markdown = image_processor.strip_base64_images(
        ctx.conversion_result.markdown
    )

    # Drop references to image files that don't actually exist in assets/.
    assets_dir = ctx.output_dir / "assets"
    if assets_dir.exists():
        ctx.conversion_result.markdown = ImageProcessor.remove_nonexistent_images(
            ctx.conversion_result.markdown, assets_dir
        )

    # Write the LLM-enhanced version alongside the base output.
    llm_output = ctx.output_file.with_suffix(".llm.md")
    llm_content = processor.format_llm_output(
        ctx.conversion_result.markdown, frontmatter
    )
    atomic_write_text(llm_output, llm_content)
    logger.info(f"Written LLM version: {llm_output}")

    return ConversionStepResult(success=True)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
async def process_with_standard_llm(
    ctx: ConversionContext,
) -> ConversionStepResult:
    """Process document with standard LLM (no screenshots).

    For standalone image inputs only image analysis runs; for documents,
    document processing and (optionally) image analysis run in parallel,
    then the base ``.md`` is re-written from the pre-LLM markdown.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=False, error="Missing conversion result")

    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    # Check if the input itself is a standalone image file.
    is_standalone_image = ctx.input_path.suffix.lower() in IMAGE_EXTENSIONS
    saved_images = get_saved_images(ctx)

    # Use shared processor (shared semaphore/router/cache) or create a new one.
    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    if is_standalone_image and saved_images:
        # Standalone image: only run image analysis (no document pass).
        logger.info(f"[LLM] {ctx.input_path.name}: Processing standalone image")
        (
            _,
            image_cost,
            image_usage,
            ctx.image_analysis,
        ) = await workflow.analyze_images(
            saved_images,
            ctx.conversion_result.markdown,
            ctx.output_file,
            ctx.input_path,
            concurrency_limit=ctx.config.llm.concurrency,
        )
        ctx.llm_cost += image_cost
        merge_llm_usage(ctx.llm_usage, image_usage)
    else:
        # Standard LLM processing
        logger.info(f"[LLM] {ctx.input_path.name}: Starting standard LLM processing")

        # Save original markdown so the base .md can be restored afterwards.
        original_markdown = ctx.conversion_result.markdown

        # Image analysis runs only if alt/description generation is enabled
        # and there are saved images to analyze.
        should_analyze_images = (
            ctx.config.image.alt_enabled or ctx.config.image.desc_enabled
        ) and saved_images

        # Run document processing and image analysis in parallel.
        # These are independent: doc processing writes .llm.md, image
        # analysis generates descriptions.
        if should_analyze_images:
            doc_task = workflow.process_document_with_llm(
                ctx.conversion_result.markdown,
                ctx.input_path.name,
                ctx.output_file,
            )
            img_task = workflow.analyze_images(
                saved_images,
                ctx.conversion_result.markdown,
                ctx.output_file,
                ctx.input_path,
                concurrency_limit=ctx.config.llm.concurrency,
            )

            # Execute in parallel
            doc_result, img_result = await asyncio.gather(doc_task, img_task)

            # Unpack results directly into context fields.
            ctx.conversion_result.markdown, doc_cost, doc_usage = doc_result
            _, image_cost, image_usage, ctx.image_analysis = img_result

            ctx.llm_cost += doc_cost + image_cost
            merge_llm_usage(ctx.llm_usage, doc_usage)
            merge_llm_usage(ctx.llm_usage, image_usage)

            # Apply alt text updates to .llm.md after document processing
            # completes. This ensures no race condition - .llm.md is
            # guaranteed to exist by now.
            if ctx.config.image.alt_enabled and ctx.image_analysis:
                llm_output = ctx.output_file.with_suffix(".llm.md")
                apply_alt_text_updates(llm_output, ctx.image_analysis)
        else:
            # Only document processing (no image analysis requested).
            (
                ctx.conversion_result.markdown,
                doc_cost,
                doc_usage,
            ) = await workflow.process_document_with_llm(
                ctx.conversion_result.markdown,
                ctx.input_path.name,
                ctx.output_file,
            )
            ctx.llm_cost += doc_cost
            merge_llm_usage(ctx.llm_usage, doc_usage)

        # Re-write base .md with original markdown (without LLM alt text),
        # since LLM output belongs only in the .llm.md variant.
        base_md_content = add_basic_frontmatter(original_markdown, ctx.input_path.name)
        atomic_write_text(ctx.output_file, base_md_content)

    return ConversionStepResult(success=True)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
async def analyze_embedded_images(ctx: ConversionContext) -> ConversionStepResult:
    """Analyze embedded images with LLM after Vision processing.

    Used in screenshot+LLM mode to also analyze embedded document images.
    Page/slide screenshots are excluded; only converter-extracted embedded
    images are sent for analysis. Silently succeeds when there is nothing
    to analyze or analysis is disabled.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=True)

    if not (ctx.config.image.alt_enabled or ctx.config.image.desc_enabled):
        return ConversionStepResult(success=True)

    saved_images = get_saved_images(ctx)
    if not saved_images:
        return ConversionStepResult(success=True)

    # Filter out page/slide screenshots (named like "*.page3.*" or
    # "*.slide2.*"); only analyze embedded images.
    import re

    page_pattern = re.compile(r"\.page\d+\.|\.slide\d+\.", re.IGNORECASE)
    embedded_images = [p for p in saved_images if not page_pattern.search(p.name)]

    if not embedded_images:
        return ConversionStepResult(success=True)

    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    # Use shared processor (shared semaphore/router/cache) or create a new one.
    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    logger.info(
        f"[LLM] {ctx.input_path.name}: Analyzing {len(embedded_images)} embedded images"
    )

    # analyze_images returns (markdown, cost, usage, analysis); the markdown
    # may carry updated alt text, so it replaces the current content.
    (
        ctx.conversion_result.markdown,
        image_cost,
        image_usage,
        ctx.image_analysis,
    ) = await workflow.analyze_images(
        embedded_images,
        ctx.conversion_result.markdown,
        ctx.output_file,
        ctx.input_path,
        concurrency_limit=ctx.config.llm.concurrency,
    )
    ctx.llm_cost += image_cost
    merge_llm_usage(ctx.llm_usage, image_usage)

    return ConversionStepResult(success=True)
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
async def convert_document_core(
    ctx: ConversionContext,
    max_document_size: int,
) -> ConversionStepResult:
    """Core document conversion pipeline.

    This function implements the unified conversion logic shared between
    single-file and batch processing modes.

    The pipeline:
    1. Validate file size and detect format
    2. Prepare output directory
    3. Execute document conversion
    4. Resolve output file path (with conflict handling)
    5. Process embedded images
    6. Write base markdown file
    7. LLM processing (if enabled):
       - Vision mode (with page screenshots)
       - Standard mode (no screenshots)
       - Embedded image analysis

    Args:
        ctx: Conversion context with all inputs and state
        max_document_size: Maximum allowed document size in bytes

    Returns:
        ConversionStepResult indicating overall success or failure
    """
    # Step 1: Validate and detect format
    result = validate_and_detect_format(ctx, max_document_size)
    if not result.success:
        return result

    # Step 2: Prepare output directory
    result = prepare_output_directory(ctx)
    if not result.success:
        return result

    # Step 3: Execute conversion
    result = await convert_document(ctx)
    if not result.success:
        return result

    # Step 4: Resolve output file; a skip_reason short-circuits the pipeline.
    result = resolve_output_file(ctx)
    if not result.success or result.skip_reason:
        return result

    # Step 5: Process embedded images
    result = await process_embedded_images(ctx)
    if not result.success:
        return result

    # Step 6: Write base markdown
    result = write_base_markdown(ctx)
    if not result.success:
        return result

    # Step 7: LLM processing (if enabled)
    if ctx.config.llm.enabled and ctx.conversion_result is not None:
        # Ensure shared processor exists for all LLM operations
        # This is critical for:
        # 1. Sharing semaphore (concurrency control)
        # 2. Sharing Router instances (avoid duplicate creation)
        # 3. Sharing cache connections
        if ctx.shared_processor is None:
            from markitai.workflow.helpers import create_llm_processor

            ctx.shared_processor = create_llm_processor(
                ctx.config, project_dir=ctx.project_dir
            )

        page_images = ctx.conversion_result.metadata.get("page_images", [])
        has_page_images = len(page_images) > 0

        if has_page_images:
            # Vision mode with screenshots - run vision LLM and embedded image
            # analysis in parallel for better performance.
            # NOTE(review): both tasks read and assign
            # ctx.conversion_result.markdown; confirm their interleaving
            # cannot lose one task's update.
            vision_task = asyncio.create_task(process_with_vision_llm(ctx))
            embed_task = asyncio.create_task(analyze_embedded_images(ctx))

            results = await asyncio.gather(
                vision_task, embed_task, return_exceptions=True
            )
            vision_result_raw, embed_result_raw = results

            # Check vision result (critical: failure aborts the pipeline)
            if isinstance(vision_result_raw, BaseException):
                return ConversionStepResult(
                    success=False, error=f"Vision LLM failed: {vision_result_raw}"
                )
            vision_result: ConversionStepResult = vision_result_raw
            if not vision_result.success:
                return vision_result

            # Check embed result (non-critical, log warning and continue)
            if isinstance(embed_result_raw, BaseException):
                logger.warning(f"Embedded image analysis failed: {embed_result_raw}")
            else:
                embed_result: ConversionStepResult = embed_result_raw
                if not embed_result.success:
                    logger.warning(
                        f"Embedded image analysis failed: {embed_result.error}"
                    )
        else:
            # Standard LLM mode (no page screenshots available)
            result = await process_with_standard_llm(ctx)
            if not result.success:
                return result

    return ConversionStepResult(success=True)
|