markitai 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,679 @@
1
+ """PDF document converter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import re
7
+ import shutil
8
+ import tempfile
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Any, cast
12
+
13
+ import pymupdf4llm
14
+ from loguru import logger
15
+
16
+ from markitai.constants import DEFAULT_RENDER_DPI
17
+ from markitai.converter.base import (
18
+ BaseConverter,
19
+ ConvertResult,
20
+ ExtractedImage,
21
+ FileFormat,
22
+ register_converter,
23
+ )
24
+ from markitai.image import ImageProcessor
25
+ from markitai.utils.mime import get_mime_type
26
+ from markitai.utils.paths import ensure_assets_dir, ensure_screenshots_dir
27
+
28
+ if TYPE_CHECKING:
29
+ from markitai.config import MarkitaiConfig
30
+
31
+
32
+ @register_converter(FileFormat.PDF)
33
+ class PdfConverter(BaseConverter):
34
+ """Converter for PDF documents using pymupdf4llm.
35
+
36
+ Supports OCR mode for scanned PDFs when --ocr flag is enabled.
37
+ """
38
+
39
+ supported_formats = [FileFormat.PDF]
40
+
41
def __init__(self, config: MarkitaiConfig | None = None) -> None:
    """Create a PDF converter.

    Args:
        config: Optional Markitai configuration. It is handed unchanged to
            the base-class initialiser; ``None`` means "no configuration".
    """
    super().__init__(config)
43
+
44
def convert(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Convert a PDF document to Markdown.

    When OCR is enabled in the configuration the work is delegated:
    OCR together with LLM goes to ``_render_pages_for_llm``, OCR alone goes
    to ``_convert_with_ocr``. Otherwise the text is extracted with
    ``pymupdf4llm`` page by page, every page is prefixed with a
    ``<!-- Page number: N -->`` marker, embedded images are collected (and
    optionally compressed in place) and, if screenshots are enabled and an
    output directory is given, every page is rendered to an image.

    Args:
        input_path: Path to the input PDF file.
        output_dir: Optional output directory. Extracted images are written
            to ``output_dir / "assets"``. Without it a temporary directory
            is used and removed again before returning.

    Returns:
        ConvertResult containing the markdown, the extracted images and a
        metadata dictionary.
    """
    # Local import: only this method needs it.
    from glob import escape as glob_escape

    input_path = Path(input_path)
    images: list[ExtractedImage] = []

    # Check if OCR mode is enabled
    use_ocr = self.config and self.config.ocr.enabled
    use_llm = self.config and self.config.llm.enabled

    if use_ocr:
        if use_llm:
            # --ocr --llm: Render pages as images for LLM Vision analysis
            return self._render_pages_for_llm(input_path, output_dir)
        # --ocr only: Use RapidOCR for text extraction
        return self._convert_with_ocr(input_path, output_dir)

    # Determine image output path
    temp_dir: Path | None = None
    if output_dir:
        image_path = output_dir / "assets"
        image_path.mkdir(parents=True, exist_ok=True)
    else:
        # Use temp directory if no output dir specified
        temp_dir = Path(tempfile.mkdtemp())
        image_path = temp_dir

    try:
        # Get image format from config
        image_format = "png"
        if self.config:
            image_format = self.config.image.format
            if image_format == "jpeg":
                image_format = "jpg"

        # Convert using pymupdf4llm with page_chunks=True for page-level
        # splitting. This allows proper text-to-screenshot alignment in
        # batched LLM processing.
        page_results = pymupdf4llm.to_markdown(
            str(input_path),
            write_images=True,
            image_path=str(image_path),
            image_format=image_format,
            dpi=DEFAULT_RENDER_DPI,
            force_text=True,
            page_chunks=True,  # Return list of page chunks instead of single string
        )

        # Merge page chunks and add page markers for proper splitting.
        # Format: <!-- Page number: N --> followed by a blank line.
        markdown_parts = []
        for page_number, chunk in enumerate(page_results, start=1):
            page_text = (
                chunk.get("text", "") if isinstance(chunk, dict) else str(chunk)
            )
            markdown_parts.append(
                f"<!-- Page number: {page_number} -->\n\n{page_text}"
            )
        markdown = "\n\n".join(markdown_parts)

        # Rewrite full image paths to relative ones (assets/xxx.jpg)
        markdown = self._fix_image_paths(markdown, image_path)

        # Collect extracted images (only for current file)
        if image_path.exists():
            image_processor = ImageProcessor(
                self.config.image if self.config else None
            )
            pil_formats = {
                "jpg": "JPEG",
                "jpeg": "JPEG",
                "png": "PNG",
                "webp": "WEBP",
            }
            # The file name is used as a literal prefix, so glob
            # metacharacters in it ("[", "]", "*", "?") must be escaped.
            file_glob = f"{glob_escape(input_path.name)}*.{image_format}"
            for idx, img_file in enumerate(sorted(image_path.glob(file_glob))):
                suffix = img_file.suffix.lower().lstrip(".")
                width = 0
                height = 0

                # Optionally compress and overwrite to keep sizes consistent
                if self.config and self.config.image.compress:
                    try:
                        from PIL import Image

                        with Image.open(img_file) as img:
                            compressed_img, compressed_data = (
                                image_processor.compress(
                                    img.copy(),
                                    quality=self.config.image.quality,
                                    max_size=(
                                        self.config.image.max_width,
                                        self.config.image.max_height,
                                    ),
                                    output_format=pil_formats.get(suffix, "PNG"),
                                )
                            )
                        img_file.write_bytes(compressed_data)
                        width, height = compressed_img.size
                    except Exception as exc:
                        # Best effort: keep the uncompressed file, but do
                        # not hide the failure.
                        logger.warning(
                            f"Failed to compress image {img_file.name}: {exc}"
                        )

                # No compression (or it failed): read the size from disk
                if width == 0 or height == 0:
                    try:
                        from PIL import Image

                        with Image.open(img_file) as img:
                            width, height = img.size
                    except Exception as exc:
                        logger.debug(
                            f"Could not read size of {img_file.name}: {exc}"
                        )
                        width, height = 0, 0

                images.append(
                    ExtractedImage(
                        path=img_file,
                        index=idx + 1,
                        original_name=img_file.name,
                        mime_type=get_mime_type(suffix, default="image/png"),
                        width=width,
                        height=height,
                    )
                )

        metadata: dict[str, Any] = {
            "source": str(input_path),
            "format": "PDF",
            "images": len(images),
        }

        # Render page screenshots if enabled (independent of OCR)
        enable_screenshot = self.config and self.config.screenshot.enabled
        if enable_screenshot and output_dir:
            page_images: list[dict] = []
            screenshots_dir = ensure_screenshots_dir(output_dir)

            import pymupdf

            # Create ImageProcessor for compression
            img_processor = ImageProcessor(
                self.config.image if self.config else None
            )

            # Screenshots are never written as PNG; fall back to JPG
            screenshot_format = image_format if image_format != "png" else "jpg"
            # PDF user space is 72 units per inch, hence the dpi / 72 zoom
            zoom = DEFAULT_RENDER_DPI / 72
            mat = pymupdf.Matrix(zoom, zoom)

            with pymupdf.open(input_path) as doc:
                for page_num in range(len(doc)):
                    pix = doc[page_num].get_pixmap(matrix=mat)

                    image_name = (
                        f"{input_path.name}.page{page_num + 1:04d}"
                        f".{screenshot_format}"
                    )
                    screenshot_path = screenshots_dir / image_name
                    img_processor.save_screenshot(
                        pix.samples, pix.width, pix.height, screenshot_path
                    )

                    page_images.append(
                        {
                            "page": page_num + 1,
                            "path": str(screenshot_path),
                            "name": image_name,
                        }
                    )

            if page_images:
                logger.debug(f"Rendered {len(page_images)} page screenshots")

            metadata["page_images"] = page_images
            metadata["pages"] = len(page_images)
            metadata["extracted_text"] = markdown

        return ConvertResult(
            markdown=markdown,
            images=images,
            metadata=metadata,
        )
    finally:
        # Remove the temporary directory on success *and* on failure.
        # NOTE(review): in the temp-dir case the returned ExtractedImage
        # paths point into the removed directory (unchanged behaviour) -
        # confirm callers do not read them.
        if temp_dir and temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)
244
+
245
+ def _fix_image_paths(self, markdown: str, image_path: Path) -> str:
246
+ """Fix image paths to be relative to output directory.
247
+
248
+ pymupdf4llm generates paths like: ![](full/path/to/assets/image.jpg)
249
+ We need: ![](assets/image.jpg)
250
+ """
251
+ # Escape special regex characters in the path
252
+ escaped_path = re.escape(str(image_path))
253
+ # Match image references with the full path and replace with assets/filename
254
+ # Preserve alt text if present
255
+ pattern = rf"!\[([^\]]*)\]\({escaped_path}/([^)]+)\)"
256
+ replacement = r"![\1](assets/\2)"
257
+ return re.sub(pattern, replacement, markdown)
258
+
259
+ def _collect_embedded_images(
260
+ self, assets_dir: Path, input_name: str
261
+ ) -> list[ExtractedImage]:
262
+ """Collect embedded images extracted by pymupdf4llm.
263
+
264
+ pymupdf4llm extracts embedded images with names like: filename.pdf-0-0.png
265
+ (page index - image index on that page)
266
+
267
+ Args:
268
+ assets_dir: Directory where images were extracted
269
+ input_name: Original PDF filename
270
+
271
+ Returns:
272
+ List of ExtractedImage for embedded images
273
+ """
274
+ embedded_images: list[ExtractedImage] = []
275
+ # Pattern: filename.pdf-{page}-{index}.{ext}
276
+ pattern = re.compile(rf"^{re.escape(input_name)}-(\d+)-(\d+)\.(png|jpg|jpeg)$")
277
+
278
+ for image_file in assets_dir.iterdir():
279
+ match = pattern.match(image_file.name)
280
+ if match:
281
+ page_idx = int(match.group(1))
282
+ img_idx = int(match.group(2))
283
+ ext = match.group(3)
284
+
285
+ # Get image dimensions
286
+ try:
287
+ import pymupdf
288
+
289
+ pix = pymupdf.Pixmap(str(image_file))
290
+ width, height = pix.width, pix.height
291
+ except Exception:
292
+ width, height = 0, 0
293
+
294
+ embedded_images.append(
295
+ ExtractedImage(
296
+ path=image_file,
297
+ index=page_idx * 100 + img_idx, # Unique index
298
+ original_name=image_file.name,
299
+ mime_type=f"image/{'jpeg' if ext in ('jpg', 'jpeg') else ext}",
300
+ width=width,
301
+ height=height,
302
+ )
303
+ )
304
+
305
+ return embedded_images
306
+
307
def _convert_with_ocr(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Convert a PDF using OCR, intended for scanned documents.

    Every page is processed in a thread pool. When screenshots are enabled
    the page is rendered once, saved as an image and the rendered pixels
    are passed to OCR; the page image is referenced from the markdown
    inside an HTML comment. When screenshots are disabled only OCR is run.
    A page that fails is replaced by a placeholder text instead of aborting
    the whole conversion.

    Args:
        input_path: Path to the PDF file.
        output_dir: Optional output directory for page images. Without it,
            page images (if enabled) go to a temporary directory.

    Returns:
        ConvertResult containing the OCR text (with a ``# <file stem>``
        heading), the page images and a metadata dictionary.

    Raises:
        ImportError: If PyMuPDF is not installed.
    """
    try:
        import pymupdf
    except ImportError as e:
        raise ImportError(
            "PyMuPDF is not installed. Install with: pip install pymupdf"
        ) from e

    from markitai.ocr import OCRProcessor

    ocr = OCRProcessor(self.config.ocr if self.config else None)

    logger.info(f"Converting PDF with OCR: {input_path.name}")

    # Check if screenshot is enabled
    enable_screenshot = self.config and self.config.screenshot.enabled

    # With an output directory the screenshots directory is always ensured
    # (as before). A temporary directory is only created further down, and
    # only if page images are actually written.
    output_screenshots_dir = (
        ensure_screenshots_dir(output_dir) if output_dir else None
    )

    # Get image format from config
    image_format = "jpg"
    if self.config:
        fmt = self.config.image.format
        image_format = "jpg" if fmt == "jpeg" else fmt
    # "image/jpg" is not a registered MIME type; JPEG files are "image/jpeg"
    page_mime_type = (
        "image/jpeg"
        if image_format in ("jpg", "jpeg")
        else f"image/{image_format}"
    )

    images: list[ExtractedImage] = []
    page_images: list[dict] = []
    markdown_parts = []
    dpi = DEFAULT_RENDER_DPI

    with pymupdf.open(input_path) as doc:
        total_pages = len(doc)

    # Determine worker count based on file size and CPU count.
    # Each worker opens its own PDF copy, so memory usage scales with
    # workers x file_size.
    file_size_mb = input_path.stat().st_size / (1024 * 1024)
    cpu_count = os.cpu_count() or 4

    # Adaptive worker count:
    # - Small files (<10MB): use up to cpu_count/2 workers (at most 6)
    # - Medium files (10-50MB): use up to 4 workers
    # - Large files (>50MB): use up to 2 workers to limit memory
    if file_size_mb < 10:
        max_workers = min(cpu_count // 2 or 2, total_pages, 6)
    elif file_size_mb < 50:
        max_workers = min(4, total_pages)
    else:
        max_workers = min(2, total_pages)

    # Ensure at least 1 worker (ThreadPoolExecutor rejects 0)
    max_workers = max(1, max_workers)

    def ocr_text(run_ocr: Any, page_num: int) -> str:
        """Run ``run_ocr()`` and return its text or a placeholder."""
        try:
            text = run_ocr().text.strip()
        except Exception as e:
            logger.warning(f"OCR failed for page {page_num + 1}: {e}")
            return f"*(OCR failed: {e})*"
        return text if text else "*(No text detected)*"

    def run_pages(worker: Any, on_failure: Any) -> dict[int, dict]:
        """Run ``worker(page_num)`` for every page; map page -> result."""
        outcomes: dict[int, dict] = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(worker, i): i for i in range(total_pages)
            }
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    outcomes[page_num] = future.result()
                    logger.debug(
                        f"OCR processed page {page_num + 1}/{total_pages}"
                    )
                except Exception as e:
                    logger.error(f"Failed to process page {page_num + 1}: {e}")
                    outcomes[page_num] = on_failure(page_num, e)
        return outcomes

    if enable_screenshot:
        screenshots_dir = (
            output_screenshots_dir
            if output_screenshots_dir is not None
            else Path(tempfile.mkdtemp())
        )
        screenshots_dir.mkdir(parents=True, exist_ok=True)

        def process_page_with_screenshot(page_num: int) -> dict:
            """Process a single page: render, save and OCR."""
            # Each thread opens its own document (PyMuPDF not thread-safe)
            thread_doc = pymupdf.open(input_path)
            img_processor = ImageProcessor(
                self.config.image if self.config else None
            )
            try:
                page = thread_doc[page_num]

                # Render page to image
                mat = pymupdf.Matrix(dpi / 72, dpi / 72)
                pix = page.get_pixmap(matrix=mat)

                # Save page image with compression
                image_name = (
                    f"{input_path.name}.page{page_num + 1:04d}.{image_format}"
                )
                image_path = screenshots_dir / image_name
                final_size = img_processor.save_screenshot(
                    pix.samples, pix.width, pix.height, image_path
                )

                # OCR - reuse already rendered pixmap to avoid re-rendering
                text_content = ocr_text(
                    lambda: ocr.recognize_pixmap(
                        pix.samples, pix.width, pix.height, pix.n
                    ),
                    page_num,
                )

                page_content = f"{text_content}\n\n<!-- ![Page {page_num + 1}](screenshots/{image_name}) -->"

                return {
                    "page_num": page_num,
                    "image": ExtractedImage(
                        path=image_path,
                        index=page_num + 1,
                        original_name=image_name,
                        mime_type=page_mime_type,
                        width=final_size[0],
                        height=final_size[1],
                    ),
                    "page_image": {
                        "page": page_num + 1,
                        "path": str(image_path),
                        "name": image_name,
                    },
                    "markdown": page_content,
                }
            finally:
                thread_doc.close()

        results = run_pages(
            process_page_with_screenshot,
            lambda page_num, e: {
                "page_num": page_num,
                "image": None,
                "page_image": None,
                "markdown": f"*(Page processing failed: {e})*",
            },
        )
    else:

        def process_page_ocr_only(page_num: int) -> dict:
            """Process a single page: OCR only."""
            text_content = ocr_text(
                lambda: ocr.recognize_pdf_page(input_path, page_num, dpi=dpi),
                page_num,
            )
            return {"page_num": page_num, "markdown": text_content}

        results = run_pages(
            process_page_ocr_only,
            lambda page_num, e: {
                "page_num": page_num,
                "markdown": f"*(OCR failed: {e})*",
            },
        )

    # Collect results in page order (completion order is arbitrary)
    for i in range(total_pages):
        page_result = results[i]
        if page_result.get("image"):
            images.append(page_result["image"])
        if page_result.get("page_image"):
            page_images.append(page_result["page_image"])
        markdown_parts.append(page_result["markdown"])

    extracted_text = f"# {input_path.stem}\n\n" + "\n\n".join(markdown_parts)

    return ConvertResult(
        markdown=extracted_text,
        images=images,
        metadata={
            "source": str(input_path),
            "format": "PDF",
            "ocr_used": True,
            "pages": len(markdown_parts),
            "extracted_text": extracted_text,
            "page_images": page_images,
        },
    )
526
+
527
def _render_pages_for_llm(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Extract text and render pages for LLM Vision analysis.

    This method:
    1. Extracts text and embedded images using pymupdf4llm.
    2. Renders each page as an image in a thread pool, but only if
       screenshots are enabled in the configuration.

    Args:
        input_path: Path to the PDF file.
        output_dir: Optional output directory. Without it, extracted images
            (and page images, if enabled) go to temporary directories.

    Returns:
        ConvertResult with the extracted text, the embedded images followed
        by the page images, and a metadata dictionary.

    Raises:
        ImportError: If PyMuPDF is not installed.
        Exception: Any error raised while rendering a page is logged and
            re-raised.
    """
    try:
        import pymupdf
    except ImportError as e:
        raise ImportError(
            "PyMuPDF is not installed. Install with: pip install pymupdf"
        ) from e

    logger.info(f"Extracting text and rendering pages for LLM: {input_path.name}")

    # Determine output paths. A temporary screenshots directory is only
    # created further down, and only if page images are actually written.
    if output_dir:
        assets_dir = ensure_assets_dir(output_dir)
        output_screenshots_dir = ensure_screenshots_dir(output_dir)
    else:
        # Not removed here: the returned images point into this directory
        assets_dir = Path(tempfile.mkdtemp())
        output_screenshots_dir = None

    # Get image format from config
    image_format = "jpg"
    if self.config:
        fmt = self.config.image.format
        image_format = "jpg" if fmt == "jpeg" else fmt
    # "image/jpg" is not a registered MIME type; JPEG files are "image/jpeg"
    page_mime_type = (
        "image/jpeg"
        if image_format in ("jpg", "jpeg")
        else f"image/{image_format}"
    )

    # Step 1: Extract text using pymupdf4llm
    logger.debug("Extracting text with pymupdf4llm...")
    extracted_text = cast(
        str,
        pymupdf4llm.to_markdown(
            str(input_path),
            write_images=True,
            image_path=str(assets_dir),
            image_format=image_format,
            dpi=DEFAULT_RENDER_DPI,
            force_text=True,
        ),
    )
    extracted_text = self._fix_image_paths(extracted_text, assets_dir)

    # Collect embedded images extracted by pymupdf4llm
    images: list[ExtractedImage] = list(
        self._collect_embedded_images(assets_dir, input_path.name)
    )
    page_images: list[dict] = []

    # Step 2: Render page screenshots if enabled
    enable_screenshot = self.config and self.config.screenshot.enabled
    if enable_screenshot:
        screenshots_dir = (
            output_screenshots_dir
            if output_screenshots_dir is not None
            else Path(tempfile.mkdtemp())
        )
        screenshots_dir.mkdir(parents=True, exist_ok=True)
        # Create ImageProcessor for compression
        img_processor = ImageProcessor(self.config.image if self.config else None)

        with pymupdf.open(input_path) as doc:
            total_pages = len(doc)

        dpi = DEFAULT_RENDER_DPI

        def render_page(page_num: int) -> tuple[ExtractedImage, dict]:
            """Render a single page and save it as an image.

            Each call opens its own document, because PyMuPDF is not
            thread-safe when sharing document objects.
            """
            thread_doc = pymupdf.open(input_path)
            try:
                page = thread_doc[page_num]

                # Render page to image
                mat = pymupdf.Matrix(dpi / 72, dpi / 72)
                pix = page.get_pixmap(matrix=mat)

                # Save page image with compression
                image_name = (
                    f"{input_path.name}.page{page_num + 1:04d}.{image_format}"
                )
                image_path = screenshots_dir / image_name
                final_size = img_processor.save_screenshot(
                    pix.samples, pix.width, pix.height, image_path
                )

                extracted_img = ExtractedImage(
                    path=image_path,
                    index=page_num + 1,
                    original_name=image_name,
                    mime_type=page_mime_type,
                    width=final_size[0],
                    height=final_size[1],
                )
                page_info = {
                    "page": page_num + 1,
                    "path": str(image_path),
                    "name": image_name,
                }
                return (extracted_img, page_info)
            finally:
                thread_doc.close()

        # Render pages in parallel; at most 4 workers, at least 1
        # (ThreadPoolExecutor rejects 0).
        max_workers = min(4, total_pages) if total_pages > 0 else 1
        rendered: dict[int, tuple[ExtractedImage, dict]] = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(render_page, i): i for i in range(total_pages)
            }
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    rendered[page_num] = future.result()
                except Exception as e:
                    logger.error(f"Failed to render page {page_num + 1}: {e}")
                    raise

        # Completion order is arbitrary; emit the pages in page order
        for page_num in sorted(rendered):
            extracted_img, page_info = rendered[page_num]
            images.append(extracted_img)
            page_images.append(page_info)

        if page_images:
            logger.debug(f"Rendered {len(page_images)} page screenshots")

    return ConvertResult(
        markdown=extracted_text,
        images=images,
        metadata={
            "source": str(input_path),
            "format": "PDF",
            # Number of rendered page images; 0 when screenshots are off
            "pages": len(page_images),
            "extracted_text": extracted_text,
            "page_images": page_images,
        },
    )