markitai-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markitai/__init__.py +3 -0
- markitai/batch.py +1316 -0
- markitai/cli.py +3979 -0
- markitai/config.py +602 -0
- markitai/config.schema.json +748 -0
- markitai/constants.py +222 -0
- markitai/converter/__init__.py +49 -0
- markitai/converter/_patches.py +98 -0
- markitai/converter/base.py +164 -0
- markitai/converter/image.py +181 -0
- markitai/converter/legacy.py +606 -0
- markitai/converter/office.py +526 -0
- markitai/converter/pdf.py +679 -0
- markitai/converter/text.py +63 -0
- markitai/fetch.py +1725 -0
- markitai/image.py +1335 -0
- markitai/json_order.py +550 -0
- markitai/llm.py +4339 -0
- markitai/ocr.py +347 -0
- markitai/prompts/__init__.py +159 -0
- markitai/prompts/cleaner.md +93 -0
- markitai/prompts/document_enhance.md +77 -0
- markitai/prompts/document_enhance_complete.md +65 -0
- markitai/prompts/document_process.md +60 -0
- markitai/prompts/frontmatter.md +28 -0
- markitai/prompts/image_analysis.md +21 -0
- markitai/prompts/image_caption.md +8 -0
- markitai/prompts/image_description.md +13 -0
- markitai/prompts/page_content.md +17 -0
- markitai/prompts/url_enhance.md +78 -0
- markitai/security.py +286 -0
- markitai/types.py +30 -0
- markitai/urls.py +187 -0
- markitai/utils/__init__.py +33 -0
- markitai/utils/executor.py +69 -0
- markitai/utils/mime.py +85 -0
- markitai/utils/office.py +262 -0
- markitai/utils/output.py +53 -0
- markitai/utils/paths.py +81 -0
- markitai/utils/text.py +359 -0
- markitai/workflow/__init__.py +37 -0
- markitai/workflow/core.py +760 -0
- markitai/workflow/helpers.py +509 -0
- markitai/workflow/single.py +369 -0
- markitai-0.3.0.dist-info/METADATA +159 -0
- markitai-0.3.0.dist-info/RECORD +48 -0
- markitai-0.3.0.dist-info/WHEEL +4 -0
- markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/image.py
ADDED
@@ -0,0 +1,1335 @@
"""Image processing module for extraction, compression, and filtering."""

from __future__ import annotations

import asyncio
import base64
import hashlib
import io
import os
import re
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any
from urllib.parse import urljoin, urlparse

import httpx
from loguru import logger
from PIL import Image

from markitai.constants import (
    DEFAULT_IMAGE_IO_CONCURRENCY,
    DEFAULT_IMAGE_MAX_HEIGHT,
    DEFAULT_IMAGE_MAX_WIDTH,
    DEFAULT_IMAGE_QUALITY,
    DEFAULT_SCREENSHOT_MAX_BYTES,
)
from markitai.utils.mime import get_extension_from_mime
from markitai.utils.paths import ensure_assets_dir

if TYPE_CHECKING:
    from markitai.config import ImageConfig
    from markitai.converter.base import ExtractedImage


# Module-level function for multiprocessing (must be picklable)
def _compress_image_worker(
    image_data: bytes,
    quality: int,
    max_size: tuple[int, int],
    output_format: str,
    min_width: int,
    min_height: int,
    min_area: int,
) -> tuple[bytes, int, int] | None:
    """Compress a single image in a worker process.

    Args:
        image_data: Raw image bytes
        quality: JPEG quality (1-100)
        max_size: Maximum dimensions (width, height)
        output_format: Output format (JPEG, PNG, WEBP)
        min_width: Minimum width filter
        min_height: Minimum height filter
        min_area: Minimum area filter

    Returns:
        Tuple of (compressed_data, final_width, final_height) or None if filtered
    """
    try:
        with io.BytesIO(image_data) as buffer:
            img = Image.open(buffer)
            img.load()
        width, height = img.size

        # Apply filter
        if width < min_width or height < min_height or width * height < min_area:
            return None

        # Resize if needed
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # Convert to RGB for JPEG
        if output_format.upper() == "JPEG" and img.mode in ("RGBA", "P", "LA"):
            background = Image.new("RGB", img.size, (255, 255, 255))
            if img.mode == "P":
                img = img.convert("RGBA")
            if img.mode in ("RGBA", "LA"):
                background.paste(img, mask=img.split()[-1])
            else:
                background.paste(img)
            img = background

        # Compress to bytes
        out_buffer = io.BytesIO()
        save_kwargs: dict[str, Any] = {"format": output_format}
        if output_format.upper() in ("JPEG", "WEBP"):
            save_kwargs["quality"] = quality
        if output_format.upper() == "PNG":
            save_kwargs["optimize"] = True

        img.save(out_buffer, **save_kwargs)
        return out_buffer.getvalue(), img.size[0], img.size[1]
    except Exception:
        return None


@dataclass
class ProcessedImage:
    """Result of processing a single image.

    Tracks the original position and processing outcome for each image,
    enabling correct mapping during base64 replacement.
    """

    original_index: int  # 1-indexed position in original markdown
    saved_path: Path | None  # None if filtered/deduplicated
    skip_reason: str | None  # "duplicate" | "filtered" | None


@dataclass
class ImageProcessResult:
    """Result of image processing."""

    saved_images: list[ExtractedImage]
    filtered_count: int
    deduplicated_count: int
    # Mapping from original 1-indexed position to processing result
    # This enables correct base64 replacement even when images are filtered
    index_mapping: dict[int, ProcessedImage] | None = None


class ImageProcessor:
    """Processor for image extraction, compression, and filtering."""

    # Regex pattern to match base64 data URIs in markdown
    # Support MIME types like png, jpeg, x-emf, x-wmf (with hyphens)
    DATA_URI_PATTERN = re.compile(
        r"!\[([^\]]*)\]\(data:image/([\w+.-]+);base64,([A-Za-z0-9+/=]+)\)"
    )

    def __init__(self, config: ImageConfig | None = None) -> None:
        """Initialize with optional image configuration."""
        self.config = config
        self._seen_hashes: set[str] = set()

    def _convert_to_png(self, image_data: bytes, original_fmt: str) -> bytes:
        """Convert unsupported image formats (EMF/WMF) to PNG.

        On Windows, uses Pillow which has native EMF/WMF support.
        On other platforms, falls back to LibreOffice if available.
        """
        import platform

        # Normalize format name
        fmt_lower = original_fmt.lower().replace("x-", "")  # x-emf -> emf

        # On Windows, Pillow can natively read EMF/WMF files
        if platform.system() == "Windows" and fmt_lower in ("emf", "wmf"):
            try:
                with io.BytesIO(image_data) as buffer:
                    img = Image.open(buffer)
                    # Load at higher DPI for better quality
                    # WmfImagePlugin.load() accepts dpi parameter
                    img.load(dpi=150)  # type: ignore[call-arg]

                # Convert to RGB if necessary (EMF/WMF loads as RGB)
                if img.mode not in ("RGB", "RGBA"):
                    img = img.convert("RGB")

                # Save as PNG
                out_buffer = io.BytesIO()
                img.save(out_buffer, format="PNG")
                return out_buffer.getvalue()
            except Exception:
                # Fall through to LibreOffice fallback
                pass

        # Fallback to LibreOffice (for non-Windows or if Pillow fails)
        import subprocess
        import tempfile
        import uuid

        from markitai.utils.office import find_libreoffice

        soffice = find_libreoffice()
        if not soffice:
            return image_data

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                # Ensure extension doesn't have special chars
                ext = re.sub(r"[^a-zA-Z0-9]", "", original_fmt)
                temp_in = temp_path / f"temp_{uuid.uuid4().hex[:8]}.{ext}"
                temp_in.write_bytes(image_data)

                # Create isolated user profile for concurrent LibreOffice execution
                profile_path = temp_path / "lo_profile"
                profile_path.mkdir()
                profile_url = profile_path.as_uri()

                cmd = [
                    soffice,
                    "--headless",
                    f"-env:UserInstallation={profile_url}",
                    "--convert-to",
                    "png",
                    "--outdir",
                    str(temp_path),
                    str(temp_in),
                ]

                subprocess.run(cmd, capture_output=True, timeout=30)

                # LibreOffice output filename depends on input filename
                temp_out = temp_path / f"{temp_in.stem}.png"
                if temp_out.exists():
                    return temp_out.read_bytes()
        except Exception:
            pass

        return image_data

    def extract_base64_images(self, markdown: str) -> list[tuple[str, str, bytes]]:
        """
        Extract base64-encoded images from markdown content.

        Args:
            markdown: Markdown content containing data URIs

        Returns:
            List of (alt_text, mime_type, image_data) tuples
        """
        images = []
        for match in self.DATA_URI_PATTERN.finditer(markdown):
            alt_text = match.group(1)
            image_type = match.group(2)
            base64_data = match.group(3)

            try:
                image_data = base64.b64decode(base64_data)

                # Handle EMF/WMF conversion
                if image_type.lower() in ("x-emf", "emf", "x-wmf", "wmf"):
                    image_data = self._convert_to_png(image_data, image_type)
                    image_type = "png"

                mime_type = f"image/{image_type}"
                images.append((alt_text, mime_type, image_data))
            except Exception:
                # Skip invalid base64 data
                continue

        return images

    def replace_base64_with_paths(
        self,
        markdown: str,
        images: list[ExtractedImage],
        assets_path: str = "assets",
        index_mapping: dict[int, ProcessedImage] | None = None,
    ) -> str:
        """
        Replace base64 data URIs with file paths in markdown.

        When index_mapping is provided, uses position-based replacement to ensure
        each base64 image is replaced with the correct saved image, even when
        some images were filtered or deduplicated.

        Args:
            markdown: Original markdown with data URIs
            images: List of saved images with paths
            assets_path: Relative path to assets directory
            index_mapping: Optional mapping from original index to ProcessedImage

        Returns:
            Markdown with data URIs replaced by file paths (filtered images removed)
        """
        if index_mapping:
            # Use position-based replacement for correct mapping
            current_index = 0

            def replace_match_indexed(match: re.Match) -> str:
                nonlocal current_index
                current_index += 1  # 1-indexed
                processed = index_mapping.get(current_index)
                if processed is None:
                    # No mapping for this index, keep original
                    return match.group(0)
                if processed.saved_path is None:
                    # Image was filtered/deduplicated, remove from output
                    return ""
                return f"![{match.group(1)}]({assets_path}/{processed.saved_path.name})"

            return self.DATA_URI_PATTERN.sub(replace_match_indexed, markdown)

        # Legacy: sequential iteration (for backward compatibility)
        image_iter = iter(images)

        def replace_match(match: re.Match) -> str:
            try:
                img = next(image_iter)
                return f"![{match.group(1)}]({assets_path}/{img.path.name})"
            except StopIteration:
                return match.group(0)

        return self.DATA_URI_PATTERN.sub(replace_match, markdown)

    def strip_base64_images(
        self,
        markdown: str,
        replacement_path: str | None = None,
    ) -> str:
        """
        Remove all base64 data URIs from markdown.

        Args:
            markdown: Markdown content with data URIs
            replacement_path: If provided, replace with this path; otherwise remove

        Returns:
            Markdown with base64 images removed or replaced
        """

        def replace_match(match: re.Match) -> str:
            alt_text = match.group(1)
            if replacement_path:
                return f"![{alt_text}]({replacement_path})"
            return ""  # Remove the image entirely

        return self.DATA_URI_PATTERN.sub(replace_match, markdown)

    @staticmethod
    def remove_nonexistent_images(
        markdown: str,
        assets_dir: Path,
    ) -> str:
        """
        Remove image references that don't exist in assets directory.

        LLM may hallucinate non-existent image references. This method
        validates each assets/ image reference and removes those that
        don't exist on disk.

        Args:
            markdown: Markdown content with image references
            assets_dir: Path to the assets directory

        Returns:
            Markdown with non-existent image references removed
        """
        # Pattern to match image references: ![alt](assets/x.png) or ![alt](assets\x.png)
        # Support both forward slash and backslash for Windows compatibility
        img_pattern = re.compile(r"!\[[^\]]*\]\(assets[/\\]([^)]+)\)")

        # Invalid filename patterns that indicate placeholders or hallucinations
        invalid_patterns = {"...", "..", ".", "placeholder", "image", "filename"}

        def validate_image(match: re.Match) -> str:
            filename = match.group(1)
            # Check for placeholder patterns
            if filename.strip() in invalid_patterns or filename.strip() == "":
                return ""
            image_path = assets_dir / filename
            if image_path.exists():
                return match.group(0)  # Keep existing image
            # Remove non-existent image reference
            return ""

        result = img_pattern.sub(validate_image, markdown)

        # Clean up any resulting double spaces or empty lines
        result = re.sub(r" +", " ", result)  # Multiple spaces to single
        result = re.sub(r"\n{3,}", "\n\n", result)  # 3+ newlines to 2

        return result

    @staticmethod
    def remove_hallucinated_images(
        llm_output: str,
        original_content: str,
    ) -> str:
        """Remove hallucinated image URLs from LLM output.

        LLM may hallucinate image URLs that don't exist in the original content.
        This method compares image URLs in the LLM output against the original
        and removes any that weren't present originally.

        Args:
            llm_output: LLM processed markdown content
            original_content: Original markdown before LLM processing

        Returns:
            LLM output with hallucinated image references removed
        """
        # Extract all image URLs from original content
        img_pattern = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
        original_urls = set(img_pattern.findall(original_content))

        # Also extract URLs without markdown syntax (bare URLs in original)
        url_pattern = re.compile(r"https?://[^\s\)\"'>]+")
        original_urls.update(url_pattern.findall(original_content))

        def validate_image(match: re.Match) -> str:
            full_match = match.group(0)
            url = match.group(1)

            # Keep local asset references (handled by remove_nonexistent_images)
            if url.startswith("assets/") or url.startswith("assets\\"):
                return full_match

            # Keep relative URLs (likely internal links)
            if not url.startswith("http://") and not url.startswith("https://"):
                return full_match

            # Check if this URL existed in original
            if url in original_urls:
                return full_match

            # URL is hallucinated - remove it
            logger.debug(f"Removing hallucinated image URL: {url}")
            return ""

        result = img_pattern.sub(validate_image, llm_output)

        # Clean up any resulting empty lines
        result = re.sub(r"\n{3,}", "\n\n", result)

        return result

    def compress(
        self,
        image: Image.Image,
        quality: int = DEFAULT_IMAGE_QUALITY,
        max_size: tuple[int, int] = (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT),
        output_format: str = "JPEG",
    ) -> tuple[Image.Image, bytes]:
        """
        Compress an image.

        Args:
            image: PIL Image to compress
            quality: JPEG quality (1-100)
            max_size: Maximum dimensions (width, height)
            output_format: Output format (JPEG, PNG, WEBP)

        Returns:
            Tuple of (compressed image, compressed data)
        """
        # Resize if needed
        image.thumbnail(max_size, Image.Resampling.LANCZOS)

        # Convert to RGB for JPEG (no alpha channel)
        if output_format.upper() == "JPEG" and image.mode in ("RGBA", "P", "LA"):
            # Create white background
            background = Image.new("RGB", image.size, (255, 255, 255))
            if image.mode == "P":
                image = image.convert("RGBA")
            background.paste(
                image, mask=image.split()[-1] if image.mode == "RGBA" else None
            )
            image = background

        # Compress to bytes
        buffer = io.BytesIO()
        save_kwargs: dict[str, Any] = {"format": output_format}
        if output_format.upper() in ("JPEG", "WEBP"):
            save_kwargs["quality"] = quality
        if output_format.upper() == "PNG":
            save_kwargs["optimize"] = True

        image.save(buffer, **save_kwargs)
        compressed_data = buffer.getvalue()

        return image, compressed_data

    def save_screenshot(
        self,
        pix_samples: bytes,
        width: int,
        height: int,
        output_path: Path,
        max_bytes: int = DEFAULT_SCREENSHOT_MAX_BYTES,
    ) -> tuple[int, int]:
        """
        Save a screenshot with compression to ensure it's under the size limit.

        Converts raw pixel data to PIL Image, compresses using config quality,
        and progressively reduces quality if needed to stay under max_bytes.

        Args:
            pix_samples: Raw RGB pixel data from pymupdf pixmap.samples
            width: Image width
            height: Image height
            output_path: Path to save the image
            max_bytes: Maximum file size in bytes (default 5MB for LLM providers)

        Returns:
            Tuple of (final_width, final_height) after any resizing
        """
        from loguru import logger

        # Convert raw samples to PIL Image
        image = Image.frombytes("RGB", (width, height), pix_samples)

        # Get quality from config or use default
        quality = self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
        max_width = self.config.max_width if self.config else DEFAULT_IMAGE_MAX_WIDTH
        max_height = self.config.max_height if self.config else DEFAULT_IMAGE_MAX_HEIGHT
        output_format = (self.config.format if self.config else "jpeg").upper()
        if output_format == "JPG":
            output_format = "JPEG"

        # Resize to configured max dimensions
        image.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)

        # Convert to RGB for JPEG
        if output_format == "JPEG" and image.mode in ("RGBA", "P", "LA"):
            background = Image.new("RGB", image.size, (255, 255, 255))
            if image.mode == "P":
                image = image.convert("RGBA")
            if image.mode in ("RGBA", "LA"):
                background.paste(image, mask=image.split()[-1])
            else:
                background.paste(image)
            image = background

        # Try compressing with configured quality first
        for q in [quality, 70, 55, 40, 25]:
            buffer = io.BytesIO()
            save_kwargs: dict[str, Any] = {"format": output_format}
            if output_format in ("JPEG", "WEBP"):
                save_kwargs["quality"] = q
                save_kwargs["optimize"] = True
            elif output_format == "PNG":
                save_kwargs["optimize"] = True

            image.save(buffer, **save_kwargs)
            data = buffer.getvalue()

            if len(data) <= max_bytes:
                output_path.write_bytes(data)
                if q < quality:
                    logger.debug(
                        f"Screenshot compressed: quality {quality}->{q}, "
                        f"size {len(data) / 1024:.1f}KB"
                    )
                return image.size

        # Last resort: aggressive resize
        image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=20, optimize=True)
        data = buffer.getvalue()
        output_path.write_bytes(data)
        logger.warning(f"Screenshot aggressively compressed: {len(data) / 1024:.1f}KB")
        return image.size

    def should_filter(self, width: int, height: int) -> bool:
        """
        Check if an image should be filtered out based on size.

        Args:
            width: Image width in pixels
            height: Image height in pixels

        Returns:
            True if image should be filtered out
        """
        if not self.config:
            return False

        filter_config = self.config.filter

        if width < filter_config.min_width:
            return True
        if height < filter_config.min_height:
            return True
        if width * height < filter_config.min_area:
            return True

        return False

    def is_duplicate(self, image_data: bytes) -> bool:
        """
        Check if image is a duplicate based on hash.

        Args:
            image_data: Raw image data

        Returns:
            True if image is a duplicate
        """
        if not self.config or not self.config.filter.deduplicate:
            return False

        image_hash = hashlib.md5(image_data).hexdigest()
        if image_hash in self._seen_hashes:
            return True

        self._seen_hashes.add(image_hash)
        return False

    def process_and_save(
        self,
        images: list[tuple[str, str, bytes]],
        output_dir: Path,
        base_name: str,
    ) -> ImageProcessResult:
        """
        Process and save a list of images.

        Args:
            images: List of (alt_text, mime_type, image_data) tuples
            output_dir: Directory to save images
            base_name: Base name for image files

        Returns:
            ImageProcessResult with saved images, statistics, and index mapping
        """
        # Delayed import to avoid circular import
        from markitai.converter.base import ExtractedImage

        # Create assets directory
        assets_dir = ensure_assets_dir(output_dir)

        saved_images: list[ExtractedImage] = []
        filtered_count = 0
        deduplicated_count = 0
        index_mapping: dict[int, ProcessedImage] = {}

        # Determine output format
        output_format = "JPEG"
        extension = "jpg"
        if self.config:
            format_map = {
                "jpeg": ("JPEG", "jpg"),
                "png": ("PNG", "png"),
                "webp": ("WEBP", "webp"),
            }
            output_format, extension = format_map.get(
                self.config.format, ("JPEG", "jpg")
            )

        for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
            # Check for duplicates
            if self.is_duplicate(image_data):
                deduplicated_count += 1
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="duplicate"
                )
                continue

            # Load image
            try:
                # Use BytesIO as context manager to ensure buffer is released
                img_buffer = io.BytesIO(image_data)
                try:
                    img = Image.open(img_buffer)
                    # Load image data immediately so we can release the buffer
                    img.load()

                    width, height = img.size

                    # Check filter
                    if self.should_filter(width, height):
                        filtered_count += 1
                        index_mapping[idx] = ProcessedImage(
                            original_index=idx, saved_path=None, skip_reason="filtered"
                        )
                        img.close()
                        continue

                    # Compress
                    quality = (
                        self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
                    )
                    max_size = (
                        (self.config.max_width, self.config.max_height)
                        if self.config
                        else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
                    )

                    if self.config and self.config.compress:
                        # No need for img.copy() - compress can modify the image
                        # since we don't need the original after this
                        compressed_img, compressed_data = self.compress(
                            img,
                            quality=quality,
                            max_size=max_size,
                            output_format=output_format,
                        )
                        final_width, final_height = compressed_img.size
                        # Release the compressed image
                        compressed_img.close()
                    else:
                        compressed_data = image_data
                        final_width, final_height = width, height

                    # Close original image to release memory
                    img.close()

                    # Generate filename
                    filename = f"{base_name}.{idx:04d}.{extension}"
                    output_path = assets_dir / filename

                    # Save
                    output_path.write_bytes(compressed_data)

                    # Release compressed data reference
                    del compressed_data

                    extracted = ExtractedImage(
                        path=output_path,
                        index=idx,
                        original_name=filename,
                        mime_type=f"image/{extension}",
                        width=final_width,
                        height=final_height,
                    )
                    saved_images.append(extracted)
                    index_mapping[idx] = ProcessedImage(
                        original_index=idx, saved_path=output_path, skip_reason=None
                    )
                finally:
                    img_buffer.close()

            except Exception:
                # Skip invalid images - record as filtered
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="error"
                )
                continue

        return ImageProcessResult(
            saved_images=saved_images,
            filtered_count=filtered_count,
            deduplicated_count=deduplicated_count,
            index_mapping=index_mapping,
        )

    def reset_dedup_cache(self) -> None:
        """Reset the deduplication hash cache."""
        self._seen_hashes.clear()

    async def process_and_save_async(
        self,
        images: list[tuple[str, str, bytes]],
        output_dir: Path,
        base_name: str,
        max_concurrency: int = DEFAULT_IMAGE_IO_CONCURRENCY,
    ) -> ImageProcessResult:
        """Process and save a list of images with async I/O.

        This is an optimized version that uses asyncio for concurrent I/O
        operations while keeping CPU-bound image processing sequential.

        Args:
            images: List of (alt_text, mime_type, image_data) tuples
            output_dir: Directory to save images
            base_name: Base name for image files
            max_concurrency: Maximum concurrent I/O operations

        Returns:
            ImageProcessResult with saved images, statistics, and index mapping
        """
        import asyncio

        # Delayed imports to avoid circular import
        from markitai.converter.base import ExtractedImage
        from markitai.security import write_bytes_async

        # Create assets directory
        assets_dir = ensure_assets_dir(output_dir)

        saved_images: list[ExtractedImage] = []
        filtered_count = 0
        deduplicated_count = 0
        index_mapping: dict[int, ProcessedImage] = {}

        # Determine output format
        output_format = "JPEG"
        extension = "jpg"
        if self.config:
            format_map = {
                "jpeg": ("JPEG", "jpg"),
                "png": ("PNG", "png"),
                "webp": ("WEBP", "webp"),
            }
            output_format, extension = format_map.get(
                self.config.format, ("JPEG", "jpg")
            )

        # First pass: process images (CPU-bound, sequential)
        processed_images: list[tuple[int, bytes, int, int]] = []
        for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
            # Check for duplicates
            if self.is_duplicate(image_data):
                deduplicated_count += 1
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="duplicate"
                )
                continue

            # Load and process image
            try:
                with Image.open(io.BytesIO(image_data)) as img:
                    width, height = img.size

                    # Check filter
                    if self.should_filter(width, height):
                        filtered_count += 1
                        index_mapping[idx] = ProcessedImage(
                            original_index=idx, saved_path=None, skip_reason="filtered"
                        )
                        continue

                    # Compress
                    quality = (
                        self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
                    )
                    max_size = (
                        (self.config.max_width, self.config.max_height)
                        if self.config
                        else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
                    )

                    if self.config and self.config.compress:
                        compressed_img, compressed_data = self.compress(
                            img.copy(),
                            quality=quality,
                            max_size=max_size,
                            output_format=output_format,
                        )
                        final_width, final_height = compressed_img.size
                    else:
                        compressed_data = image_data
                        final_width, final_height = width, height

                    processed_images.append(
                        (idx, compressed_data, final_width, final_height)
                    )

            except Exception:
                # Skip invalid images
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="error"
                )
                continue

        # Second pass: save images concurrently (I/O-bound)
        semaphore = asyncio.Semaphore(max_concurrency)

        async def save_image(
            idx: int, data: bytes, width: int, height: int
        ) -> tuple[int, ExtractedImage | None, Path | None]:
            filename = f"{base_name}.{idx:04d}.{extension}"
            output_path = assets_dir / filename

            async with semaphore:
                try:
                    await write_bytes_async(output_path, data)
                    return (
                        idx,
                        ExtractedImage(
                            path=output_path,
                            index=idx,
                            original_name=filename,
                            mime_type=f"image/{extension}",
                            width=width,
                            height=height,
                        ),
                        output_path,
                    )
                except Exception:
                    return idx, None, None

        # Run all saves concurrently
        tasks = [
            save_image(idx, data, width, height)
            for idx, data, width, height in processed_images
        ]
        results = await asyncio.gather(*tasks)

        # Collect successful saves and build index mapping
        for idx, extracted, output_path in results:
            if extracted is not None:
                saved_images.append(extracted)
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=output_path, skip_reason=None
                )
            else:
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="error"
                )

        # Sort by index to maintain order
        saved_images.sort(key=lambda x: x.index)

        return ImageProcessResult(
            saved_images=saved_images,
            filtered_count=filtered_count,
            deduplicated_count=deduplicated_count,
            index_mapping=index_mapping,
        )

    async def process_and_save_multiprocess(
        self,
        images: list[tuple[str, str, bytes]],
        output_dir: Path,
        base_name: str,
        max_workers: int | None = None,
        max_io_concurrency: int = DEFAULT_IMAGE_IO_CONCURRENCY,
    ) -> ImageProcessResult:
        """Process and save images using multiprocessing for CPU-bound compression.

        This version uses ProcessPoolExecutor to parallelize image compression
        across multiple CPU cores, bypassing the GIL limitation.

        Args:
            images: List of (alt_text, mime_type, image_data) tuples
            output_dir: Directory to save images
            base_name: Base name for image files
            max_workers: Max worker processes (default: cpu_count // 2)
            max_io_concurrency: Maximum concurrent I/O operations

        Returns:
            ImageProcessResult with saved images, statistics, and index mapping
        """
        import asyncio

        from markitai.converter.base import ExtractedImage
        from markitai.security import write_bytes_async

        if not images:
            return ImageProcessResult(
                saved_images=[],
                filtered_count=0,
                deduplicated_count=0,
                index_mapping={},
            )

        # Create assets directory
        assets_dir = ensure_assets_dir(output_dir)

        # Determine output format
        output_format = "JPEG"
        extension = "jpg"
        if self.config:
            format_map = {
                "jpeg": ("JPEG", "jpg"),
                "png": ("PNG", "png"),
                "webp": ("WEBP", "webp"),
            }
            output_format, extension = format_map.get(
                self.config.format, ("JPEG", "jpg")
            )

        # Get compression parameters
        quality = self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
        max_size = (
            (self.config.max_width, self.config.max_height)
            if self.config
            else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
        )
        compress_enabled = self.config.compress if self.config else True

        # Get filter parameters
        min_width = self.config.filter.min_width if self.config else 50
        min_height = self.config.filter.min_height if self.config else 50
        min_area = self.config.filter.min_area if self.config else 5000

        # Prepare work items (filter duplicates first)
        work_items: list[tuple[int, bytes]] = []
        deduplicated_count = 0
        index_mapping: dict[int, ProcessedImage] = {}
        for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
            if self.is_duplicate(image_data):
                deduplicated_count += 1
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="duplicate"
                )
                continue
            work_items.append((idx, image_data))

        if not work_items:
            return ImageProcessResult(
                saved_images=[],
                filtered_count=0,
                deduplicated_count=deduplicated_count,
                index_mapping=index_mapping,
            )

        # Determine worker count (use half of CPUs to avoid system overload)
        if max_workers is None:
            max_workers = max(1, (os.cpu_count() or 4) // 2)

        # Process images in parallel using ProcessPoolExecutor
        loop = asyncio.get_running_loop()
        processed_results: list[tuple[int, bytes, int, int]] = []
        filtered_count = 0

        # Use ProcessPoolExecutor for CPU-bound compression
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for idx, image_data in work_items:
                if compress_enabled:
                    future = loop.run_in_executor(
                        executor,
                        _compress_image_worker,
                        image_data,
                        quality,
                        max_size,
                        output_format,
                        min_width,
                        min_height,
                        min_area,
                    )
                    futures.append((idx, future))
                else:
                    # No compression, just validate size
                    try:
                        with io.BytesIO(image_data) as buffer:
                            img = Image.open(buffer)
                            w, h = img.size
                        if w >= min_width and h >= min_height and w * h >= min_area:
                            processed_results.append((idx, image_data, w, h))
                        else:
                            filtered_count += 1
                            index_mapping[idx] = ProcessedImage(
                                original_index=idx,
                                saved_path=None,
                                skip_reason="filtered",
                            )
                    except Exception:
                        index_mapping[idx] = ProcessedImage(
                            original_index=idx, saved_path=None, skip_reason="error"
                        )

            # Gather results from workers
            for idx, future in futures:
                try:
                    result = await future
                    if result is None:
                        filtered_count += 1
                        index_mapping[idx] = ProcessedImage(
                            original_index=idx, saved_path=None, skip_reason="filtered"
                        )
                    else:
                        compressed_data, final_w, final_h = result
                        processed_results.append(
                            (idx, compressed_data, final_w, final_h)
                        )
                except Exception:
                    filtered_count += 1
                    index_mapping[idx] = ProcessedImage(
                        original_index=idx, saved_path=None, skip_reason="error"
                    )

        # Second pass: save images concurrently (I/O-bound)
        semaphore = asyncio.Semaphore(max_io_concurrency)
        saved_images: list[ExtractedImage] = []

        async def save_image(
            idx: int, data: bytes, width: int, height: int
        ) -> tuple[int, ExtractedImage | None, Path | None]:
            filename = f"{base_name}.{idx:04d}.{extension}"
            output_path = assets_dir / filename

            async with semaphore:
                try:
                    await write_bytes_async(output_path, data)
                    return (
                        idx,
                        ExtractedImage(
                            path=output_path,
                            index=idx,
                            original_name=filename,
                            mime_type=f"image/{extension}",
                            width=width,
                            height=height,
                        ),
                        output_path,
                    )
                except Exception:
                    return idx, None, None

        # Run all saves concurrently
        tasks = [
            save_image(idx, data, width, height)
            for idx, data, width, height in processed_results
        ]
        results = await asyncio.gather(*tasks)

        # Collect successful saves and build index mapping
        for idx, extracted, output_path in results:
            if extracted is not None:
                saved_images.append(extracted)
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=output_path, skip_reason=None
                )
            else:
                index_mapping[idx] = ProcessedImage(
                    original_index=idx, saved_path=None, skip_reason="error"
                )

        # Sort by index to maintain order
        saved_images.sort(key=lambda x: x.index)

        return ImageProcessResult(
            saved_images=saved_images,
            filtered_count=filtered_count,
            deduplicated_count=deduplicated_count,
            index_mapping=index_mapping,
        )


# =============================================================================
# URL Image Download
# =============================================================================

# Pattern to match markdown images: ![alt](url)
# Excludes data: URIs (base64 encoded images)
_URL_IMAGE_PATTERN = re.compile(
    r"!\[([^\]]*)\]\((?!data:)([^)]+)\)",
    re.IGNORECASE,
)

# Common image extensions
_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".ico"}


@dataclass
class UrlImageDownloadResult:
    """Result of downloading images from URLs."""

    updated_markdown: str
    downloaded_paths: list[Path]
    failed_urls: list[str]
    url_to_path: dict[str, Path] = field(
        default_factory=dict
    )  # URL -> local path mapping


def _get_extension_from_content_type(content_type: str) -> str:
    """Get file extension from content-type header."""
    return get_extension_from_mime(content_type)


def _get_extension_from_url(url: str) -> str | None:
    """Extract image extension from URL path."""
    parsed = urlparse(url)
    path = parsed.path.lower()
    # Remove query params from path
    path = path.split("?")[0]
    for ext in _IMAGE_EXTENSIONS:
        if path.endswith(ext):
            return ext
    return None


def _sanitize_image_filename(name: str, max_length: int = 100) -> str:
    """Sanitize filename for cross-platform compatibility."""
    # Remove or replace invalid characters
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        name = name.replace(char, "_")
    # Remove control characters
    name = "".join(c for c in name if ord(c) >= 32)
    # Limit length
    if len(name) > max_length:
        name = name[:max_length]
    return name.strip() or "image"


async def download_url_images(
    markdown: str,
    output_dir: Path,
    base_url: str,
    config: ImageConfig,
    source_name: str = "url",
    concurrency: int = 5,
    timeout: int = 30,
) -> UrlImageDownloadResult:
    """Download images from URLs in markdown and save to assets directory.

    This function:
    1. Finds all image URLs in markdown (excluding data: URIs)
    2. Downloads images concurrently with rate limiting
    3. Saves to assets directory with proper naming
    4. Replaces URLs with local paths in markdown
    5. Skips failed downloads (keeps original URL, logs warning)

    Args:
        markdown: Markdown content with image URLs
        output_dir: Output directory (assets will be created inside)
        base_url: Base URL for resolving relative image paths
        config: Image configuration (format, quality, etc.)
        source_name: Source identifier for naming images
        concurrency: Max concurrent downloads (default 5)
        timeout: HTTP request timeout in seconds (default 30)

    Returns:
        UrlImageDownloadResult with:
        - updated_markdown: Markdown with local paths for downloaded images
        - downloaded_paths: List of successfully downloaded image paths
        - failed_urls: List of URLs that failed to download
    """
    # Find all image URLs
    matches = list(_URL_IMAGE_PATTERN.finditer(markdown))
    if not matches:
        return UrlImageDownloadResult(
            updated_markdown=markdown,
            downloaded_paths=[],
            failed_urls=[],
        )

    # Create assets directory
    assets_dir = ensure_assets_dir(output_dir)

    # Prepare download tasks
    semaphore = asyncio.Semaphore(concurrency)
    downloaded_paths: list[Path] = []
    failed_urls: list[str] = []
    replacements: dict[str, str] = {}  # original_match -> replacement
    url_to_path: dict[str, Path] = {}  # image_url -> local_path mapping

    # Sanitize source name for filenames
    safe_source = _sanitize_image_filename(source_name, max_length=50)

    async def download_single(
        client: httpx.AsyncClient,
        match: re.Match,
        index: int,
    ) -> None:
        """Download a single image."""
        alt_text = match.group(1)
        image_url = match.group(2).strip()
        original_match = match.group(0)

        # Resolve relative URLs
        if not image_url.startswith(("http://", "https://", "//")):
            image_url = urljoin(base_url, image_url)
        elif image_url.startswith("//"):
            # Protocol-relative URL
            parsed_base = urlparse(base_url)
            image_url = f"{parsed_base.scheme}:{image_url}"

        async with semaphore:
            try:
                response = await client.get(
                    image_url,
                    follow_redirects=True,
                    timeout=timeout,
                )
                response.raise_for_status()

                # Determine file extension
                content_type = response.headers.get("content-type", "")
                ext = _get_extension_from_url(image_url)
                if not ext:
                    ext = _get_extension_from_content_type(content_type)

                # Generate filename: source_name.NNNN.ext (1-indexed, 4 digits)
                filename = f"{safe_source}.{index + 1:04d}{ext}"
                output_path = assets_dir / filename

                # Process image (apply quality settings if configured)
                image_data = response.content
                if ext.lower() in (".jpg", ".jpeg", ".png", ".webp"):
                    try:
                        processed = _compress_image_worker(
                            image_data,
                            quality=config.quality,
                            max_size=(config.max_width, config.max_height),
                            output_format=config.format.upper(),
                            min_width=config.filter.min_width,
                            min_height=config.filter.min_height,
                            min_area=config.filter.min_area,
                        )
                        if processed:
                            image_data, _, _ = processed
                            # Update extension if format changed
                            if config.format.lower() != ext[1:].lower():
                                ext = f".{config.format.lower()}"
                                filename = f"{safe_source}.{index + 1:04d}{ext}"
                                output_path = assets_dir / filename
                        else:
                            # Image was filtered out (too small)
                            logger.debug(
                                f"Image filtered (too small): {image_url[:60]}..."
                            )
                            return
                    except Exception as e:
                        logger.debug(f"Image processing failed, saving original: {e}")

                # Save to file
                output_path.write_bytes(image_data)
                downloaded_paths.append(output_path)

                # Prepare replacement with local path
                local_path = f"assets/{filename}"
                replacements[original_match] = f"![{alt_text}]({local_path})"

                # Track URL to path mapping for post-processing
                url_to_path[image_url] = output_path

                logger.debug(f"Downloaded: {image_url[:60]}... -> {output_path}")

            except httpx.TimeoutException:
                logger.warning(f"Timeout downloading image: {image_url[:80]}...")
                failed_urls.append(image_url)
            except httpx.HTTPStatusError as e:
                logger.warning(
                    f"HTTP {e.response.status_code} downloading: {image_url[:80]}..."
                )
                failed_urls.append(image_url)
            except Exception as e:
                logger.warning(f"Failed to download image: {image_url[:80]}... - {e}")
                failed_urls.append(image_url)

    # Download all images concurrently
    async with httpx.AsyncClient(
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; markitai/0.3.0; +https://github.com/Ynewtime/markitai)"
        },
        follow_redirects=True,
    ) as client:
        tasks = [
            download_single(client, match, idx) for idx, match in enumerate(matches)
        ]
        await asyncio.gather(*tasks)

    # Apply replacements to markdown
    updated_markdown = markdown
    for original, replacement in replacements.items():
        updated_markdown = updated_markdown.replace(original, replacement)

    return UrlImageDownloadResult(
        updated_markdown=updated_markdown,
        downloaded_paths=downloaded_paths,
        failed_urls=failed_urls,
        url_to_path=url_to_path,
    )