markitai 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,679 @@
|
|
|
1
|
+
"""PDF document converter."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
import shutil
|
|
8
|
+
import tempfile
|
|
9
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING, Any, cast
|
|
12
|
+
|
|
13
|
+
import pymupdf4llm
|
|
14
|
+
from loguru import logger
|
|
15
|
+
|
|
16
|
+
from markitai.constants import DEFAULT_RENDER_DPI
|
|
17
|
+
from markitai.converter.base import (
|
|
18
|
+
BaseConverter,
|
|
19
|
+
ConvertResult,
|
|
20
|
+
ExtractedImage,
|
|
21
|
+
FileFormat,
|
|
22
|
+
register_converter,
|
|
23
|
+
)
|
|
24
|
+
from markitai.image import ImageProcessor
|
|
25
|
+
from markitai.utils.mime import get_mime_type
|
|
26
|
+
from markitai.utils.paths import ensure_assets_dir, ensure_screenshots_dir
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from markitai.config import MarkitaiConfig
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@register_converter(FileFormat.PDF)
|
|
33
|
+
class PdfConverter(BaseConverter):
|
|
34
|
+
"""Converter for PDF documents using pymupdf4llm.
|
|
35
|
+
|
|
36
|
+
Supports OCR mode for scanned PDFs when --ocr flag is enabled.
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
supported_formats = [FileFormat.PDF]
|
|
40
|
+
|
|
41
|
+
def __init__(self, config: MarkitaiConfig | None = None) -> None:
    """Create a PDF converter.

    Args:
        config: Optional Markitai configuration. It is handed to the base
            converter unchanged; the methods of this class later read
            ``self.config`` to decide on OCR, LLM, image and screenshot
            behaviour.
    """
    super().__init__(config)
|
|
43
|
+
|
|
44
|
+
def convert(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Convert a PDF document to Markdown.

    Dispatch, based on ``self.config``:

    * OCR and LLM enabled -> ``_render_pages_for_llm``
    * OCR only            -> ``_convert_with_ocr``
    * otherwise           -> text extraction with pymupdf4llm, one chunk per
      page, each prefixed with a ``<!-- Page number: N -->`` marker.

    Args:
        input_path: Path to the input file.
        output_dir: Optional output directory. Extracted images go to
            ``output_dir / "assets"``. Without it a temporary directory is
            used and removed before returning.

    Returns:
        ConvertResult containing markdown, extracted images and metadata.
    """
    input_path = Path(input_path)

    use_ocr = self.config and self.config.ocr.enabled
    use_llm = self.config and self.config.llm.enabled
    if use_ocr:
        if use_llm:
            # --ocr --llm: Render pages as images for LLM Vision analysis
            return self._render_pages_for_llm(input_path, output_dir)
        # --ocr only: Use RapidOCR for text extraction
        return self._convert_with_ocr(input_path, output_dir)

    # Decide where pymupdf4llm writes embedded images.
    temp_dir: Path | None = None
    if output_dir:
        image_path = output_dir / "assets"
        image_path.mkdir(parents=True, exist_ok=True)
    else:
        temp_dir = Path(tempfile.mkdtemp())
        image_path = temp_dir

    image_format = "png"
    if self.config:
        image_format = self.config.image.format
        if image_format == "jpeg":
            image_format = "jpg"

    # The try/finally guarantees the temporary directory is removed even when
    # extraction or rendering raises.
    try:
        # page_chunks=True returns one chunk per page, which allows proper
        # text-to-screenshot alignment in batched LLM processing.
        page_results = pymupdf4llm.to_markdown(
            str(input_path),
            write_images=True,
            image_path=str(image_path),
            image_format=image_format,
            dpi=DEFAULT_RENDER_DPI,
            force_text=True,
            page_chunks=True,
        )

        # Marker format is consistent with the "Slide number" format; the
        # blank line after the marker keeps the markdown well-formed.
        markdown_parts = []
        for page_num, chunk in enumerate(page_results, start=1):
            page_text = (
                chunk.get("text", "") if isinstance(chunk, dict) else str(chunk)
            )
            markdown_parts.append(
                f"<!-- Page number: {page_num} -->\n\n{page_text}"
            )
        markdown = "\n\n".join(markdown_parts)

        # pymupdf4llm writes full paths; rewrite them relative to the output.
        markdown = self._fix_image_paths(markdown, image_path)

        images = self._gather_extracted_images(
            image_path, input_path.name, image_format
        )

        metadata: dict[str, Any] = {
            "source": str(input_path),
            "format": "PDF",
            "images": len(images),
        }

        # Render page screenshots if enabled (independent of OCR)
        enable_screenshot = self.config and self.config.screenshot.enabled
        if enable_screenshot and output_dir:
            page_images = self._render_page_screenshots(
                input_path, output_dir, image_format
            )
            if page_images:
                logger.debug(f"Rendered {len(page_images)} page screenshots")

            metadata["page_images"] = page_images
            metadata["pages"] = len(page_images)
            metadata["extracted_text"] = markdown
    finally:
        if temp_dir is not None and temp_dir.exists():
            shutil.rmtree(temp_dir, ignore_errors=True)

    return ConvertResult(
        markdown=markdown,
        images=images,
        metadata=metadata,
    )


def _gather_extracted_images(
    self, image_path: Path, file_prefix: str, image_format: str
) -> list[ExtractedImage]:
    """Collect (and optionally compress) the images written for one PDF.

    Args:
        image_path: Directory pymupdf4llm wrote the images into.
        file_prefix: Input file name; only files starting with it are taken,
            so images from other documents in the same directory are ignored.
        image_format: File extension (without dot) of the written images.

    Returns:
        One ExtractedImage per matching file, indexed from 1 in sorted
        file-name order.
    """
    if not image_path.exists():
        return []

    from glob import escape as glob_escape

    image_processor = ImageProcessor(self.config.image if self.config else None)
    compress = bool(self.config and self.config.image.compress)
    pil_formats = {"jpg": "JPEG", "jpeg": "JPEG", "png": "PNG", "webp": "WEBP"}

    # Escape the prefix so file names containing "[", "*" or "?" still match.
    pattern = f"{glob_escape(file_prefix)}*.{image_format}"

    collected: list[ExtractedImage] = []
    for idx, img_file in enumerate(sorted(image_path.glob(pattern)), start=1):
        suffix = img_file.suffix.lower().lstrip(".")
        width = 0
        height = 0

        # Optionally compress and overwrite to keep sizes consistent
        if compress:
            try:
                from PIL import Image

                with Image.open(img_file) as img:
                    compressed_img, compressed_data = image_processor.compress(
                        img.copy(),
                        quality=self.config.image.quality,
                        max_size=(
                            self.config.image.max_width,
                            self.config.image.max_height,
                        ),
                        output_format=pil_formats.get(suffix, "PNG"),
                    )
                    img_file.write_bytes(compressed_data)
                    width, height = compressed_img.size
            except Exception as exc:
                # Best effort: keep the uncompressed file, but say so.
                logger.warning(
                    f"Image compression failed for {img_file.name}: {exc}"
                )

        # Fall back to reading the size from the file on disk.
        if width == 0 or height == 0:
            try:
                from PIL import Image

                with Image.open(img_file) as img:
                    width, height = img.size
            except Exception:
                width, height = 0, 0

        collected.append(
            ExtractedImage(
                path=img_file,
                index=idx,
                original_name=img_file.name,
                mime_type=get_mime_type(suffix, default="image/png"),
                width=width,
                height=height,
            )
        )

    return collected


def _render_page_screenshots(
    self, input_path: Path, output_dir: Path, image_format: str
) -> list[dict]:
    """Render every page of the PDF to a screenshot file.

    Args:
        input_path: Path to the PDF file.
        output_dir: Output directory; the screenshots directory is obtained
            from ``ensure_screenshots_dir(output_dir)``.
        image_format: Configured image format; "png" is replaced by "jpg"
            for screenshots.

    Returns:
        One dict per page with the keys "page" (1-based), "path" and "name".
    """
    import pymupdf

    screenshots_dir = ensure_screenshots_dir(output_dir)
    img_processor = ImageProcessor(self.config.image if self.config else None)
    screenshot_format = image_format if image_format != "png" else "jpg"

    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = DEFAULT_RENDER_DPI / 72
    mat = pymupdf.Matrix(zoom, zoom)

    page_images: list[dict] = []
    with pymupdf.open(input_path) as doc:
        for page_index in range(len(doc)):
            pix = doc[page_index].get_pixmap(matrix=mat)

            # Save page image with compression (ensures < 5MB for LLM)
            image_name = (
                f"{input_path.name}.page{page_index + 1:04d}.{screenshot_format}"
            )
            screenshot_path = screenshots_dir / image_name
            img_processor.save_screenshot(
                pix.samples, pix.width, pix.height, screenshot_path
            )

            page_images.append(
                {
                    "page": page_index + 1,
                    "path": str(screenshot_path),
                    "name": image_name,
                }
            )

    return page_images
|
|
244
|
+
|
|
245
|
+
def _fix_image_paths(self, markdown: str, image_path: Path) -> str:
|
|
246
|
+
"""Fix image paths to be relative to output directory.
|
|
247
|
+
|
|
248
|
+
pymupdf4llm generates paths like: 
|
|
249
|
+
We need: 
|
|
250
|
+
"""
|
|
251
|
+
# Escape special regex characters in the path
|
|
252
|
+
escaped_path = re.escape(str(image_path))
|
|
253
|
+
# Match image references with the full path and replace with assets/filename
|
|
254
|
+
# Preserve alt text if present
|
|
255
|
+
pattern = rf"!\[([^\]]*)\]\({escaped_path}/([^)]+)\)"
|
|
256
|
+
replacement = r""
|
|
257
|
+
return re.sub(pattern, replacement, markdown)
|
|
258
|
+
|
|
259
|
+
def _collect_embedded_images(
|
|
260
|
+
self, assets_dir: Path, input_name: str
|
|
261
|
+
) -> list[ExtractedImage]:
|
|
262
|
+
"""Collect embedded images extracted by pymupdf4llm.
|
|
263
|
+
|
|
264
|
+
pymupdf4llm extracts embedded images with names like: filename.pdf-0-0.png
|
|
265
|
+
(page index - image index on that page)
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
assets_dir: Directory where images were extracted
|
|
269
|
+
input_name: Original PDF filename
|
|
270
|
+
|
|
271
|
+
Returns:
|
|
272
|
+
List of ExtractedImage for embedded images
|
|
273
|
+
"""
|
|
274
|
+
embedded_images: list[ExtractedImage] = []
|
|
275
|
+
# Pattern: filename.pdf-{page}-{index}.{ext}
|
|
276
|
+
pattern = re.compile(rf"^{re.escape(input_name)}-(\d+)-(\d+)\.(png|jpg|jpeg)$")
|
|
277
|
+
|
|
278
|
+
for image_file in assets_dir.iterdir():
|
|
279
|
+
match = pattern.match(image_file.name)
|
|
280
|
+
if match:
|
|
281
|
+
page_idx = int(match.group(1))
|
|
282
|
+
img_idx = int(match.group(2))
|
|
283
|
+
ext = match.group(3)
|
|
284
|
+
|
|
285
|
+
# Get image dimensions
|
|
286
|
+
try:
|
|
287
|
+
import pymupdf
|
|
288
|
+
|
|
289
|
+
pix = pymupdf.Pixmap(str(image_file))
|
|
290
|
+
width, height = pix.width, pix.height
|
|
291
|
+
except Exception:
|
|
292
|
+
width, height = 0, 0
|
|
293
|
+
|
|
294
|
+
embedded_images.append(
|
|
295
|
+
ExtractedImage(
|
|
296
|
+
path=image_file,
|
|
297
|
+
index=page_idx * 100 + img_idx, # Unique index
|
|
298
|
+
original_name=image_file.name,
|
|
299
|
+
mime_type=f"image/{'jpeg' if ext in ('jpg', 'jpeg') else ext}",
|
|
300
|
+
width=width,
|
|
301
|
+
height=height,
|
|
302
|
+
)
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
return embedded_images
|
|
306
|
+
|
|
307
|
+
def _convert_with_ocr(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Convert PDF using OCR for scanned documents.

    Pages are processed in parallel. When screenshots are enabled, each page
    is also rendered to an image file and referenced from the markdown inside
    an HTML comment.

    Args:
        input_path: Path to the PDF file.
        output_dir: Optional output directory for page screenshots. Without
            it, screenshots (if enabled) go to a temporary directory.

    Returns:
        ConvertResult containing OCR-extracted markdown with commented page
        images.

    Raises:
        ImportError: If PyMuPDF is not installed.
    """
    try:
        import pymupdf
    except ImportError as e:
        raise ImportError(
            "PyMuPDF is not installed. Install with: pip install pymupdf"
        ) from e

    from markitai.ocr import OCRProcessor

    ocr = OCRProcessor(self.config.ocr if self.config else None)

    logger.info(f"Converting PDF with OCR: {input_path.name}")

    enable_screenshot = bool(self.config and self.config.screenshot.enabled)

    # A temporary directory is created only when screenshots are actually
    # rendered, so nothing is left behind when they are disabled.
    screenshots_dir: Path | None = (
        ensure_screenshots_dir(output_dir) if output_dir else None
    )

    image_format = "jpg"
    if self.config:
        fmt = self.config.image.format
        image_format = "jpg" if fmt == "jpeg" else fmt

    dpi = DEFAULT_RENDER_DPI

    with pymupdf.open(input_path) as doc:
        total_pages = len(doc)

    # Each worker opens its own PDF copy, so memory usage scales with
    # workers x file size:
    # - Small files (<10MB): up to cpu_count/2 workers (max 6)
    # - Medium files (10-50MB): up to 4 workers
    # - Large files (>50MB): up to 2 workers to limit memory
    file_size_mb = input_path.stat().st_size / (1024 * 1024)
    cpu_count = os.cpu_count() or 4
    if file_size_mb < 10:
        max_workers = min(cpu_count // 2 or 2, total_pages, 6)
    elif file_size_mb < 50:
        max_workers = min(4, total_pages)
    else:
        max_workers = min(2, total_pages)
    max_workers = max(1, max_workers)

    def ocr_text(page_num: int, recognize: Any) -> str:
        """Run one OCR call and turn its outcome into page text."""
        try:
            text = recognize().text.strip()
        except Exception as e:
            logger.warning(f"OCR failed for page {page_num + 1}: {e}")
            return f"*(OCR failed: {e})*"
        return text if text else "*(No text detected)*"

    def run_pages(worker: Any, failure_label: str) -> list[dict]:
        """Run ``worker`` for every page in parallel; return results in page order."""
        by_page: dict[int, dict] = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(worker, i): i for i in range(total_pages)}
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    by_page[page_num] = future.result()
                    logger.debug(f"OCR processed page {page_num + 1}/{total_pages}")
                except Exception as e:
                    logger.error(f"Failed to process page {page_num + 1}: {e}")
                    by_page[page_num] = {
                        "page_num": page_num,
                        "image": None,
                        "page_image": None,
                        "markdown": f"*({failure_label}: {e})*",
                    }
        return [by_page[i] for i in range(total_pages)]

    if enable_screenshot:
        shots_dir = screenshots_dir or Path(tempfile.mkdtemp())
        shots_dir.mkdir(parents=True, exist_ok=True)
        # "jpg" is a file extension, not a MIME subtype, so use the shared
        # helper; presumably it maps "jpg" to "image/jpeg" (verify).
        mime_type = get_mime_type(image_format, default=f"image/{image_format}")

        def process_page_with_screenshot(page_num: int) -> dict:
            """Process a single page: render + OCR (thread-safe)."""
            img_processor = ImageProcessor(
                self.config.image if self.config else None
            )
            # Each thread opens its own document (PyMuPDF not thread-safe)
            with pymupdf.open(input_path) as thread_doc:
                mat = pymupdf.Matrix(dpi / 72, dpi / 72)
                pix = thread_doc[page_num].get_pixmap(matrix=mat)

                image_name = (
                    f"{input_path.name}.page{page_num + 1:04d}.{image_format}"
                )
                image_path = shots_dir / image_name
                final_size = img_processor.save_screenshot(
                    pix.samples, pix.width, pix.height, image_path
                )

                # Reuse the rendered pixmap to avoid rendering the page twice.
                text_content = ocr_text(
                    page_num,
                    lambda: ocr.recognize_pixmap(
                        pix.samples, pix.width, pix.height, pix.n
                    ),
                )

            # NOTE(review): the commented image reference was lost in the
            # reviewed text; "screenshots/" is an assumed relative location.
            page_content = (
                f"{text_content}\n\n"
                f"<!-- ![Page {page_num + 1}](screenshots/{image_name}) -->"
            )

            return {
                "page_num": page_num,
                "image": ExtractedImage(
                    path=image_path,
                    index=page_num + 1,
                    original_name=image_name,
                    mime_type=mime_type,
                    width=final_size[0],
                    height=final_size[1],
                ),
                "page_image": {
                    "page": page_num + 1,
                    "path": str(image_path),
                    "name": image_name,
                },
                "markdown": page_content,
            }

        page_results = run_pages(
            process_page_with_screenshot, "Page processing failed"
        )
    else:

        def process_page_ocr_only(page_num: int) -> dict:
            """Process a single page: OCR only (thread-safe)."""
            text_content = ocr_text(
                page_num,
                lambda: ocr.recognize_pdf_page(input_path, page_num, dpi=dpi),
            )
            return {
                "page_num": page_num,
                "image": None,
                "page_image": None,
                "markdown": text_content,
            }

        page_results = run_pages(process_page_ocr_only, "OCR failed")

    # Collect results in page order.
    images: list[ExtractedImage] = []
    page_images: list[dict] = []
    markdown_parts = []
    for entry in page_results:
        if entry["image"]:
            images.append(entry["image"])
        if entry["page_image"]:
            page_images.append(entry["page_image"])
        markdown_parts.append(entry["markdown"])

    extracted_text = f"# {input_path.stem}\n\n" + "\n\n".join(markdown_parts)

    return ConvertResult(
        markdown=extracted_text,
        images=images,
        metadata={
            "source": str(input_path),
            "format": "PDF",
            "ocr_used": True,
            "pages": len(markdown_parts),
            "extracted_text": extracted_text,
            "page_images": page_images,
        },
    )
|
|
526
|
+
|
|
527
|
+
def _render_pages_for_llm(
    self, input_path: Path, output_dir: Path | None = None
) -> ConvertResult:
    """Extract text and render pages for LLM Vision analysis.

    This method:
    1. Extracts text using pymupdf4llm (fast, preserves links/tables)
    2. Renders each page as an image (if screenshot enabled)

    Args:
        input_path: Path to the PDF file.
        output_dir: Optional output directory. Without it, temporary
            directories are used for assets and, when enabled, screenshots.

    Returns:
        ConvertResult with extracted text, embedded images followed by page
        screenshots, and metadata. "pages" counts the rendered screenshots,
        so it is 0 when screenshots are disabled.

    Raises:
        ImportError: If PyMuPDF is not installed.
        Exception: Any error raised while rendering a page is logged and
            re-raised.
    """
    try:
        import pymupdf
    except ImportError as e:
        raise ImportError(
            "PyMuPDF is not installed. Install with: pip install pymupdf"
        ) from e

    logger.info(f"Extracting text and rendering pages for LLM: {input_path.name}")

    enable_screenshot = bool(self.config and self.config.screenshot.enabled)

    # The screenshots temp directory is created only when it will be used.
    screenshots_dir: Path | None
    if output_dir:
        assets_dir = ensure_assets_dir(output_dir)
        screenshots_dir = ensure_screenshots_dir(output_dir)
    else:
        assets_dir = Path(tempfile.mkdtemp())
        screenshots_dir = Path(tempfile.mkdtemp()) if enable_screenshot else None

    image_format = "jpg"
    if self.config:
        fmt = self.config.image.format
        image_format = "jpg" if fmt == "jpeg" else fmt

    # Step 1: Extract text using pymupdf4llm (fast, preserves structure)
    logger.debug("Extracting text with pymupdf4llm...")
    extracted_text = cast(
        str,
        pymupdf4llm.to_markdown(
            str(input_path),
            write_images=True,
            image_path=str(assets_dir),
            image_format=image_format,
            dpi=DEFAULT_RENDER_DPI,
            force_text=True,
        ),
    )
    extracted_text = self._fix_image_paths(extracted_text, assets_dir)

    # Embedded images come first; page screenshots are appended after them.
    images: list[ExtractedImage] = list(
        self._collect_embedded_images(assets_dir, input_path.name)
    )
    page_images: list[dict] = []

    if enable_screenshot and screenshots_dir is not None:
        shots_dir = screenshots_dir
        shots_dir.mkdir(parents=True, exist_ok=True)
        img_processor = ImageProcessor(self.config.image if self.config else None)
        # "jpg" is a file extension, not a MIME subtype, so use the shared
        # helper; presumably it maps "jpg" to "image/jpeg" (verify).
        mime_type = get_mime_type(image_format, default=f"image/{image_format}")

        with pymupdf.open(input_path) as doc:
            total_pages = len(doc)

        dpi = DEFAULT_RENDER_DPI

        def render_page(page_num: int) -> tuple[ExtractedImage, dict]:
            """Render a single page (thread-safe).

            Each thread opens its own document copy because PyMuPDF is not
            thread-safe when sharing document objects.
            """
            with pymupdf.open(input_path) as thread_doc:
                mat = pymupdf.Matrix(dpi / 72, dpi / 72)
                pix = thread_doc[page_num].get_pixmap(matrix=mat)

                # Save page image with compression (ensures < 5MB for LLM)
                image_name = (
                    f"{input_path.name}.page{page_num + 1:04d}.{image_format}"
                )
                image_path = shots_dir / image_name
                final_size = img_processor.save_screenshot(
                    pix.samples, pix.width, pix.height, image_path
                )

            extracted_img = ExtractedImage(
                path=image_path,
                index=page_num + 1,
                original_name=image_name,
                mime_type=mime_type,
                width=final_size[0],
                height=final_size[1],
            )
            page_info = {
                "page": page_num + 1,
                "path": str(image_path),
                "name": image_name,
            }
            return (extracted_img, page_info)

        # At most 4 workers, to balance parallelism and resource usage.
        max_workers = min(4, total_pages) if total_pages > 0 else 1
        rendered: dict[int, tuple[ExtractedImage, dict]] = {}
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(render_page, i): i for i in range(total_pages)
            }
            for future in as_completed(futures):
                page_num = futures[future]
                try:
                    rendered[page_num] = future.result()
                except Exception as e:
                    logger.error(f"Failed to render page {page_num + 1}: {e}")
                    raise

        # Futures complete in arbitrary order; emit results in page order.
        for page_num in sorted(rendered):
            extracted_img, page_info = rendered[page_num]
            images.append(extracted_img)
            page_images.append(page_info)

        if page_images:
            logger.debug(f"Rendered {len(page_images)} page screenshots")

    return ConvertResult(
        markdown=extracted_text,
        images=images,
        metadata={
            "source": str(input_path),
            "format": "PDF",
            "pages": len(page_images),
            "extracted_text": extracted_text,
            "page_images": page_images,
        },
    )
|