markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,760 @@
1
+ """Core document conversion logic.
2
+
3
+ This module provides the unified core conversion flow shared between
4
+ single-file and batch processing modes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import re
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING, Any
15
+
16
+ from loguru import logger
17
+
18
+ from markitai.constants import IMAGE_EXTENSIONS
19
+ from markitai.converter.base import FileFormat, detect_format, get_converter
20
+ from markitai.image import ImageProcessor
21
+ from markitai.security import (
22
+ atomic_write_text,
23
+ check_symlink_safety,
24
+ escape_glob_pattern,
25
+ validate_file_size,
26
+ )
27
+ from markitai.utils.paths import ensure_dir
28
+ from markitai.workflow.helpers import add_basic_frontmatter, merge_llm_usage
29
+
30
+ if TYPE_CHECKING:
31
+ from markitai.config import MarkitaiConfig
32
+ from markitai.converter.base import ConvertResult
33
+ from markitai.llm import LLMProcessor
34
+ from markitai.workflow.single import ImageAnalysisResult
35
+
36
+
37
@dataclass
class ConversionContext:
    """Context for a document conversion operation.

    This dataclass holds all the input parameters and intermediate state
    for a single document conversion. Instances are mutated in place by
    the pipeline step functions in this module.
    """

    # Required inputs
    input_path: Path  # Original source document (used for naming outputs)
    output_dir: Path  # Directory where markdown and assets are written
    config: MarkitaiConfig  # Full application configuration

    # Optional inputs
    actual_file: Path | None = None  # For pre-converted files (batch COM)
    shared_processor: LLMProcessor | None = None  # Reused LLM processor (shared concurrency/cache)
    project_dir: Path | None = None  # Project root passed through to LLM helpers

    # Processing flags
    use_multiprocess_images: bool = False  # Allow multiprocess image saving for large batches

    # Intermediate state (set during processing)
    converter: Any = None  # Converter chosen by validate_and_detect_format()
    conversion_result: ConvertResult | None = None  # Set by convert_document()
    output_file: Path | None = None  # Set by resolve_output_file(); None means skipped
    embedded_images_count: int = 0  # Base64 + converter-extracted image count
    screenshots_count: int = 0  # Count of page/slide screenshots from metadata

    # LLM tracking
    llm_cost: float = 0.0  # Accumulated cost across all LLM calls
    llm_usage: dict[str, dict[str, Any]] = field(default_factory=dict)  # Per-model usage, merged via merge_llm_usage()
    image_analysis: ImageAnalysisResult | None = None  # Result of image alt/description analysis

    # Additional tracking (for caller use)
    duration: float = 0.0
    cache_hit: bool = False
    input_base_dir: Path | None = None  # For batch relative path calculation

    # Optional callback for stage completion (stage_name, duration)
    on_stage_complete: Callable[[str, float], None] | None = None

    @property
    def effective_input(self) -> Path:
        """Return actual file to process (handles pre-conversion).

        Falls back to ``input_path`` when no pre-converted file was supplied.
        """
        return self.actual_file if self.actual_file else self.input_path

    @property
    def is_preconverted(self) -> bool:
        """Check if this is a pre-converted file (distinct from the input)."""
        return self.actual_file is not None and self.actual_file != self.input_path
88
+
89
@dataclass
class ConversionStepResult:
    """Result of a conversion step.

    Steps report failure via ``success=False`` plus ``error`` rather than
    raising, so the pipeline driver can short-circuit cleanly.
    """

    success: bool  # True when the step completed (possibly as a no-op)
    error: str | None = None  # Human-readable failure reason when success is False
    skip_reason: str | None = None  # e.g. "exists" when output conflict policy skips the file
96
+
97
+
98
class DocumentConversionError(Exception):
    """Error during document conversion.

    Root of this module's exception hierarchy; subclasses narrow the cause.
    """
    # Body is the docstring alone; a trailing `pass` would be redundant.
102
+
103
+
104
class UnsupportedFormatError(DocumentConversionError):
    """Unsupported file format.

    Raised/propagated when no converter recognizes the input's format.
    """
    # Docstring-only body; the previous `pass` was redundant.
108
+
109
+
110
class FileSizeError(DocumentConversionError):
    """File size exceeds limit.

    Signals that the input document is larger than the configured maximum.
    """
    # Docstring-only body; the previous `pass` was redundant.
114
+
115
+
116
async def run_in_converter_thread(func, *args, **kwargs):
    """Await *func* executed on the shared converter thread pool.

    Delegates to the process-wide ThreadPoolExecutor managed by
    ``markitai.utils.executor`` so each conversion does not spin up a
    fresh executor of its own. Import happens at call time to keep the
    module import graph light.
    """
    from markitai.utils.executor import (
        run_in_converter_thread as _shared_run,
    )

    return await _shared_run(func, *args, **kwargs)
125
+
126
+
127
def validate_and_detect_format(
    ctx: ConversionContext, max_size: int
) -> ConversionStepResult:
    """Validate file size and detect the document format.

    On success, stores the chosen converter on ``ctx.converter``.

    Args:
        ctx: Conversion context
        max_size: Maximum file size in bytes

    Returns:
        ConversionStepResult indicating success or failure
    """
    # Size check first: reject oversized inputs before any format work.
    try:
        validate_file_size(ctx.input_path, max_size)
    except ValueError as exc:
        return ConversionStepResult(success=False, error=str(exc))

    detected = detect_format(ctx.effective_input)
    if detected == FileFormat.UNKNOWN:
        message = f"Unsupported file format: {ctx.input_path.suffix}"
        return ConversionStepResult(success=False, error=message)

    # Pick and cache the converter for subsequent pipeline steps.
    ctx.converter = get_converter(ctx.effective_input, config=ctx.config)
    if ctx.converter is not None:
        return ConversionStepResult(success=True)

    message = f"No converter available for format: {detected.value}"
    return ConversionStepResult(success=False, error=message)
157
+
158
+
159
def prepare_output_directory(ctx: ConversionContext) -> ConversionStepResult:
    """Create the output directory after a symlink safety check.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    try:
        # Refuse to write through symlinks unless explicitly allowed.
        check_symlink_safety(
            ctx.output_dir, allow_symlinks=ctx.config.output.allow_symlinks
        )
        ensure_dir(ctx.output_dir)
    except Exception as exc:
        # Any filesystem/safety failure is reported, not raised.
        return ConversionStepResult(success=False, error=str(exc))
    return ConversionStepResult(success=True)
176
+
177
+
178
async def convert_document(ctx: ConversionContext) -> ConversionStepResult:
    """Run the selected converter and store its result on the context.

    The converter call is dispatched to the shared thread pool since
    converters are synchronous.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    try:
        logger.info(f"Converting {ctx.input_path.name}...")
        ctx.conversion_result = await run_in_converter_thread(
            ctx.converter.convert,
            ctx.effective_input,
            output_dir=ctx.output_dir,
        )
    except Exception as exc:
        return ConversionStepResult(success=False, error=f"Conversion failed: {exc}")
    return ConversionStepResult(success=True)
197
+
198
+
199
def resolve_output_file(ctx: ConversionContext) -> ConversionStepResult:
    """Resolve the output markdown path, honoring the conflict policy.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult - may have skip_reason if file exists
    """
    from markitai.utils.output import resolve_output_path

    candidate = ctx.output_dir / f"{ctx.input_path.name}.md"
    resolved = resolve_output_path(candidate, ctx.config.output.on_conflict)
    ctx.output_file = resolved

    # A None result means the conflict policy chose to keep the existing file.
    if resolved is None:
        logger.info(f"[SKIP] Output exists: {candidate}")
        return ConversionStepResult(success=True, skip_reason="exists")

    return ConversionStepResult(success=True)
220
+
221
+
222
async def process_embedded_images(ctx: ConversionContext) -> ConversionStepResult:
    """Extract and process embedded images from markdown.

    Saves base64-embedded images to the assets directory, rewrites the
    markdown (and, when present, the ``extracted_text`` metadata) to point
    at the saved files, and updates ``ctx.screenshots_count`` /
    ``ctx.embedded_images_count``.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None:
        return ConversionStepResult(success=False, error="No conversion result")

    image_processor = ImageProcessor(config=ctx.config.image)
    base64_images = image_processor.extract_base64_images(
        ctx.conversion_result.markdown
    )

    # Count screenshots from page images
    page_images = ctx.conversion_result.metadata.get("page_images", [])
    ctx.screenshots_count = len(page_images)

    # Count embedded images from two sources:
    # 1. Base64 images in markdown (will be processed below)
    # 2. Images already extracted by converter (e.g., PDF converter saves directly to assets)
    converter_images = len(ctx.conversion_result.images)
    ctx.embedded_images_count = len(base64_images) + converter_images

    if base64_images:
        logger.info(f"Processing {len(base64_images)} embedded images...")

        # Use multiprocess for large batches if enabled
        from markitai.constants import DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD

        if (
            ctx.use_multiprocess_images
            and len(base64_images) > DEFAULT_IMAGE_MULTIPROCESS_THRESHOLD
        ):
            image_result = await image_processor.process_and_save_multiprocess(
                base64_images,
                output_dir=ctx.output_dir,
                base_name=ctx.input_path.name,
            )
        else:
            image_result = image_processor.process_and_save(
                base64_images,
                output_dir=ctx.output_dir,
                base_name=ctx.input_path.name,
            )

        # Update markdown with image paths using index mapping for correct replacement
        # (some base64 payloads may be dropped/merged during save, so positional
        # replacement alone would be wrong).
        ctx.conversion_result.markdown = image_processor.replace_base64_with_paths(
            ctx.conversion_result.markdown,
            image_result.saved_images,
            index_mapping=image_result.index_mapping,
        )

        # Also update extracted_text in metadata if present (for PPTX+LLM mode)
        if "extracted_text" in ctx.conversion_result.metadata:
            ctx.conversion_result.metadata["extracted_text"] = (
                image_processor.replace_base64_with_paths(
                    ctx.conversion_result.metadata["extracted_text"],
                    image_result.saved_images,
                    index_mapping=image_result.index_mapping,
                )
            )

        # Update count: saved base64 images + converter-extracted images
        # (may be fewer than detected if some images failed to save).
        ctx.embedded_images_count = len(image_result.saved_images) + converter_images

    return ConversionStepResult(success=True)
292
+
293
+
294
def write_base_markdown(ctx: ConversionContext) -> ConversionStepResult:
    """Write the base markdown file with basic frontmatter prepended.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    # Both the conversion result and a resolved output path must exist.
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(
            success=False, error="Missing conversion result or output file"
        )

    content = add_basic_frontmatter(
        ctx.conversion_result.markdown, ctx.input_path.name
    )
    # Atomic write avoids leaving a partially-written file on failure.
    atomic_write_text(ctx.output_file, content)
    logger.info(f"Written output: {ctx.output_file}")

    return ConversionStepResult(success=True)
315
+
316
+
317
def get_saved_images(ctx: ConversionContext) -> list[Path]:
    """Collect images saved for this input file under the assets directory.

    Matches files whose name starts with the input file's name (glob
    metacharacters in the name are escaped) and filters to known image
    extensions.

    Args:
        ctx: Conversion context

    Returns:
        List of image file paths
    """
    assets_dir = ctx.output_dir / "assets"
    if not assets_dir.exists():
        return []

    pattern = f"{escape_glob_pattern(ctx.input_path.name)}*"
    return [
        candidate
        for candidate in assets_dir.glob(pattern)
        if candidate.suffix.lower() in IMAGE_EXTENSIONS
    ]
333
+
334
+
335
def apply_alt_text_updates(
    llm_file: Path,
    image_analysis: Any,
) -> bool:
    """Apply alt text updates from image analysis to .llm.md file.

    This is called after document processing completes to update alt text
    in the .llm.md file with results from parallel image analysis.

    Args:
        llm_file: Path to the .llm.md file
        image_analysis: ImageAnalysisResult with analyzed images

    Returns:
        True if updates were applied, False otherwise
    """
    if image_analysis is None or not llm_file.exists():
        return False

    try:
        content = llm_file.read_text(encoding="utf-8")
        changed = False

        for asset in image_analysis.assets:
            name = Path(asset.get("asset", "")).name
            alt = asset.get("alt", "")
            # Skip entries with no usable alt text or filename.
            if not alt or not name:
                continue

            # Rewrite any image reference whose path ends with this filename.
            pattern = rf"!\[[^\]]*\]\([^)]*{re.escape(name)}\)"
            replacement = f"![{alt}](assets/{name})"
            rewritten = re.sub(pattern, replacement, content)
            if rewritten != content:
                content = rewritten
                changed = True

        if changed:
            atomic_write_text(llm_file, content)
            logger.debug(f"Applied alt text updates to {llm_file}")
            return True

    except Exception as e:
        # Best-effort: alt-text failure must not fail the conversion.
        logger.warning(f"Failed to apply alt text updates: {e}")

    return False
381
+
382
+
383
async def process_with_vision_llm(
    ctx: ConversionContext,
) -> ConversionStepResult:
    """Process document with Vision LLM (screenshot mode).

    Enhances the extracted markdown using the page screenshots listed in
    ``conversion_result.metadata["page_images"]``, appends the screenshots
    as HTML comments for reference, sanitizes the result, and writes the
    ``.llm.md`` sibling of the output file. Accumulates cost/usage on ctx.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=False, error="Missing conversion result")

    # Imported here to avoid a circular import with the workflow package.
    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    page_images = ctx.conversion_result.metadata.get("page_images", [])
    if not page_images:
        # Nothing to do in vision mode without screenshots.
        return ConversionStepResult(success=True)

    logger.info(f"[LLM] {ctx.input_path.name}: Starting Screenshot+LLM processing")

    # Use shared processor or create new one
    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    # Get extracted text (use markdown which has base64 replaced)
    extracted_text = ctx.conversion_result.markdown

    # Enhance with vision
    (
        cleaned_content,
        frontmatter,
        enhance_cost,
        enhance_usage,
    ) = await workflow.enhance_with_vision(
        extracted_text,
        page_images,
        source=ctx.input_path.name,
    )
    ctx.llm_cost += enhance_cost
    merge_llm_usage(ctx.llm_usage, enhance_usage)

    # Build final content with page image comments (sorted by page number
    # so references appear in document order).
    commented_images_str = ""
    if page_images:
        commented_images = [
            f"<!-- ![Page {img['page']}](screenshots/{img['name']}) -->"
            for img in sorted(page_images, key=lambda x: x.get("page", 0))
        ]
        commented_images_str = "\n\n<!-- Page images for reference -->\n" + "\n".join(
            commented_images
        )

    ctx.conversion_result.markdown = cleaned_content + commented_images_str

    # Strip any hallucinated base64 images the LLM may have emitted.
    image_processor = ImageProcessor(config=ctx.config.image)
    ctx.conversion_result.markdown = image_processor.strip_base64_images(
        ctx.conversion_result.markdown
    )

    # Validate image references: drop links to files that don't exist on disk.
    assets_dir = ctx.output_dir / "assets"
    if assets_dir.exists():
        ctx.conversion_result.markdown = ImageProcessor.remove_nonexistent_images(
            ctx.conversion_result.markdown, assets_dir
        )

    # Write LLM version
    llm_output = ctx.output_file.with_suffix(".llm.md")
    llm_content = processor.format_llm_output(
        ctx.conversion_result.markdown, frontmatter
    )
    atomic_write_text(llm_output, llm_content)
    logger.info(f"Written LLM version: {llm_output}")

    return ConversionStepResult(success=True)
469
+
470
+
471
async def process_with_standard_llm(
    ctx: ConversionContext,
) -> ConversionStepResult:
    """Process document with standard LLM (no screenshots).

    For a standalone image input, only image analysis runs. Otherwise the
    document is processed with the LLM (writing ``.llm.md``) and, when
    enabled, image analysis runs in parallel; the base ``.md`` is then
    re-written with the original (pre-LLM) markdown. Cost/usage accumulate
    on ctx.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=False, error="Missing conversion result")

    # Imported here to avoid a circular import with the workflow package.
    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    # Check if standalone image
    is_standalone_image = ctx.input_path.suffix.lower() in IMAGE_EXTENSIONS
    saved_images = get_saved_images(ctx)

    # Use shared processor or create new one
    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    if is_standalone_image and saved_images:
        # Standalone image: only run image analysis
        logger.info(f"[LLM] {ctx.input_path.name}: Processing standalone image")
        (
            _,
            image_cost,
            image_usage,
            ctx.image_analysis,
        ) = await workflow.analyze_images(
            saved_images,
            ctx.conversion_result.markdown,
            ctx.output_file,
            ctx.input_path,
            concurrency_limit=ctx.config.llm.concurrency,
        )
        ctx.llm_cost += image_cost
        merge_llm_usage(ctx.llm_usage, image_usage)
    else:
        # Standard LLM processing
        logger.info(f"[LLM] {ctx.input_path.name}: Starting standard LLM processing")

        # Save original markdown for base .md file
        original_markdown = ctx.conversion_result.markdown

        # Check if image analysis should run
        should_analyze_images = (
            ctx.config.image.alt_enabled or ctx.config.image.desc_enabled
        ) and saved_images

        # Run document processing and image analysis in parallel
        # These are independent: doc processing writes .llm.md, image analysis generates descriptions
        if should_analyze_images:
            doc_task = workflow.process_document_with_llm(
                ctx.conversion_result.markdown,
                ctx.input_path.name,
                ctx.output_file,
            )
            img_task = workflow.analyze_images(
                saved_images,
                ctx.conversion_result.markdown,
                ctx.output_file,
                ctx.input_path,
                concurrency_limit=ctx.config.llm.concurrency,
            )

            # Execute in parallel
            doc_result, img_result = await asyncio.gather(doc_task, img_task)

            # Unpack results
            ctx.conversion_result.markdown, doc_cost, doc_usage = doc_result
            _, image_cost, image_usage, ctx.image_analysis = img_result

            ctx.llm_cost += doc_cost + image_cost
            merge_llm_usage(ctx.llm_usage, doc_usage)
            merge_llm_usage(ctx.llm_usage, image_usage)

            # Apply alt text updates to .llm.md after document processing completes
            # This ensures no race condition - .llm.md is guaranteed to exist
            if ctx.config.image.alt_enabled and ctx.image_analysis:
                llm_output = ctx.output_file.with_suffix(".llm.md")
                apply_alt_text_updates(llm_output, ctx.image_analysis)
        else:
            # Only document processing
            (
                ctx.conversion_result.markdown,
                doc_cost,
                doc_usage,
            ) = await workflow.process_document_with_llm(
                ctx.conversion_result.markdown,
                ctx.input_path.name,
                ctx.output_file,
            )
            ctx.llm_cost += doc_cost
            merge_llm_usage(ctx.llm_usage, doc_usage)

        # Re-write base .md with original markdown (without LLM alt text)
        base_md_content = add_basic_frontmatter(original_markdown, ctx.input_path.name)
        atomic_write_text(ctx.output_file, base_md_content)

    return ConversionStepResult(success=True)
583
+
584
+
585
async def analyze_embedded_images(ctx: ConversionContext) -> ConversionStepResult:
    """Analyze embedded images with LLM after Vision processing.

    Used in screenshot+LLM mode to also analyze embedded document images.
    Page/slide screenshots are excluded; only converter-extracted embedded
    images are analyzed. Cost/usage accumulate on ctx. All early exits
    return success since this step is optional.

    Args:
        ctx: Conversion context

    Returns:
        ConversionStepResult indicating success or failure
    """
    if ctx.conversion_result is None or ctx.output_file is None:
        return ConversionStepResult(success=True)

    if not (ctx.config.image.alt_enabled or ctx.config.image.desc_enabled):
        return ConversionStepResult(success=True)

    saved_images = get_saved_images(ctx)
    if not saved_images:
        return ConversionStepResult(success=True)

    # Filter out page/slide screenshots, only analyze embedded images.
    # ``re`` is already imported at module level, so no local import is needed.
    page_pattern = re.compile(r"\.page\d+\.|\.slide\d+\.", re.IGNORECASE)
    embedded_images = [p for p in saved_images if not page_pattern.search(p.name)]

    if not embedded_images:
        return ConversionStepResult(success=True)

    # Imported here to avoid a circular import with the workflow package.
    from markitai.workflow.helpers import create_llm_processor
    from markitai.workflow.single import SingleFileWorkflow

    processor = ctx.shared_processor
    if processor is None:
        processor = create_llm_processor(ctx.config, project_dir=ctx.project_dir)

    workflow = SingleFileWorkflow(
        ctx.config,
        processor=processor,
        project_dir=ctx.project_dir,
    )

    logger.info(
        f"[LLM] {ctx.input_path.name}: Analyzing {len(embedded_images)} embedded images"
    )

    (
        ctx.conversion_result.markdown,
        image_cost,
        image_usage,
        ctx.image_analysis,
    ) = await workflow.analyze_images(
        embedded_images,
        ctx.conversion_result.markdown,
        ctx.output_file,
        ctx.input_path,
        concurrency_limit=ctx.config.llm.concurrency,
    )
    ctx.llm_cost += image_cost
    merge_llm_usage(ctx.llm_usage, image_usage)

    return ConversionStepResult(success=True)
648
+
649
+
650
async def convert_document_core(
    ctx: ConversionContext,
    max_document_size: int,
) -> ConversionStepResult:
    """Core document conversion pipeline.

    This function implements the unified conversion logic shared between
    single-file and batch processing modes.

    The pipeline:
    1. Validate file size and detect format
    2. Prepare output directory
    3. Execute document conversion
    4. Resolve output file path (with conflict handling)
    5. Process embedded images
    6. Write base markdown file
    7. LLM processing (if enabled):
       - Vision mode (with page screenshots)
       - Standard mode (no screenshots)
       - Embedded image analysis

    Args:
        ctx: Conversion context with all inputs and state
        max_document_size: Maximum allowed document size in bytes

    Returns:
        ConversionStepResult indicating overall success or failure
    """
    # Step 1: Validate and detect format
    result = validate_and_detect_format(ctx, max_document_size)
    if not result.success:
        return result

    # Step 2: Prepare output directory
    result = prepare_output_directory(ctx)
    if not result.success:
        return result

    # Step 3: Execute conversion
    result = await convert_document(ctx)
    if not result.success:
        return result

    # Step 4: Resolve output file (skip_reason short-circuits the pipeline too)
    result = resolve_output_file(ctx)
    if not result.success or result.skip_reason:
        return result

    # Step 5: Process embedded images
    result = await process_embedded_images(ctx)
    if not result.success:
        return result

    # Step 6: Write base markdown
    result = write_base_markdown(ctx)
    if not result.success:
        return result

    # Step 7: LLM processing (if enabled)
    if ctx.config.llm.enabled and ctx.conversion_result is not None:
        # Ensure shared processor exists for all LLM operations
        # This is critical for:
        # 1. Sharing semaphore (concurrency control)
        # 2. Sharing Router instances (avoid duplicate creation)
        # 3. Sharing cache connections
        if ctx.shared_processor is None:
            from markitai.workflow.helpers import create_llm_processor

            ctx.shared_processor = create_llm_processor(
                ctx.config, project_dir=ctx.project_dir
            )

        page_images = ctx.conversion_result.metadata.get("page_images", [])
        has_page_images = len(page_images) > 0

        if has_page_images:
            # Vision mode with screenshots - run vision LLM and embedded image
            # analysis in parallel for better performance
            # NOTE(review): both tasks assign ctx.conversion_result.markdown at
            # their await boundaries - confirm last-writer-wins is intended here.
            vision_task = asyncio.create_task(process_with_vision_llm(ctx))
            embed_task = asyncio.create_task(analyze_embedded_images(ctx))

            results = await asyncio.gather(
                vision_task, embed_task, return_exceptions=True
            )
            vision_result_raw, embed_result_raw = results

            # Check vision result (critical)
            if isinstance(vision_result_raw, BaseException):
                return ConversionStepResult(
                    success=False, error=f"Vision LLM failed: {vision_result_raw}"
                )
            vision_result: ConversionStepResult = vision_result_raw
            if not vision_result.success:
                return vision_result

            # Check embed result (non-critical, log warning)
            if isinstance(embed_result_raw, BaseException):
                logger.warning(f"Embedded image analysis failed: {embed_result_raw}")
            else:
                embed_result: ConversionStepResult = embed_result_raw
                if not embed_result.success:
                    logger.warning(
                        f"Embedded image analysis failed: {embed_result.error}"
                    )
        else:
            # Standard LLM mode
            result = await process_with_standard_llm(ctx)
            if not result.success:
                return result

    return ConversionStepResult(success=True)