markitai-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. markitai/__init__.py +3 -0
  2. markitai/batch.py +1316 -0
  3. markitai/cli.py +3979 -0
  4. markitai/config.py +602 -0
  5. markitai/config.schema.json +748 -0
  6. markitai/constants.py +222 -0
  7. markitai/converter/__init__.py +49 -0
  8. markitai/converter/_patches.py +98 -0
  9. markitai/converter/base.py +164 -0
  10. markitai/converter/image.py +181 -0
  11. markitai/converter/legacy.py +606 -0
  12. markitai/converter/office.py +526 -0
  13. markitai/converter/pdf.py +679 -0
  14. markitai/converter/text.py +63 -0
  15. markitai/fetch.py +1725 -0
  16. markitai/image.py +1335 -0
  17. markitai/json_order.py +550 -0
  18. markitai/llm.py +4339 -0
  19. markitai/ocr.py +347 -0
  20. markitai/prompts/__init__.py +159 -0
  21. markitai/prompts/cleaner.md +93 -0
  22. markitai/prompts/document_enhance.md +77 -0
  23. markitai/prompts/document_enhance_complete.md +65 -0
  24. markitai/prompts/document_process.md +60 -0
  25. markitai/prompts/frontmatter.md +28 -0
  26. markitai/prompts/image_analysis.md +21 -0
  27. markitai/prompts/image_caption.md +8 -0
  28. markitai/prompts/image_description.md +13 -0
  29. markitai/prompts/page_content.md +17 -0
  30. markitai/prompts/url_enhance.md +78 -0
  31. markitai/security.py +286 -0
  32. markitai/types.py +30 -0
  33. markitai/urls.py +187 -0
  34. markitai/utils/__init__.py +33 -0
  35. markitai/utils/executor.py +69 -0
  36. markitai/utils/mime.py +85 -0
  37. markitai/utils/office.py +262 -0
  38. markitai/utils/output.py +53 -0
  39. markitai/utils/paths.py +81 -0
  40. markitai/utils/text.py +359 -0
  41. markitai/workflow/__init__.py +37 -0
  42. markitai/workflow/core.py +760 -0
  43. markitai/workflow/helpers.py +509 -0
  44. markitai/workflow/single.py +369 -0
  45. markitai-0.3.0.dist-info/METADATA +159 -0
  46. markitai-0.3.0.dist-info/RECORD +48 -0
  47. markitai-0.3.0.dist-info/WHEEL +4 -0
  48. markitai-0.3.0.dist-info/entry_points.txt +2 -0
markitai/image.py ADDED
@@ -0,0 +1,1335 @@
+ """Image processing module for extraction, compression, and filtering."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import base64
+ import hashlib
+ import io
+ import os
+ import re
+ from concurrent.futures import ProcessPoolExecutor
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+ from urllib.parse import urljoin, urlparse
+
+ import httpx
+ from loguru import logger
+ from PIL import Image
+
+ from markitai.constants import (
+     DEFAULT_IMAGE_IO_CONCURRENCY,
+     DEFAULT_IMAGE_MAX_HEIGHT,
+     DEFAULT_IMAGE_MAX_WIDTH,
+     DEFAULT_IMAGE_QUALITY,
+     DEFAULT_SCREENSHOT_MAX_BYTES,
+ )
+ from markitai.utils.mime import get_extension_from_mime
+ from markitai.utils.paths import ensure_assets_dir
+
+ if TYPE_CHECKING:
+     from markitai.config import ImageConfig
+     from markitai.converter.base import ExtractedImage
+
+
+ # Module-level function for multiprocessing (must be picklable)
+ def _compress_image_worker(
+     image_data: bytes,
+     quality: int,
+     max_size: tuple[int, int],
+     output_format: str,
+     min_width: int,
+     min_height: int,
+     min_area: int,
+ ) -> tuple[bytes, int, int] | None:
+     """Compress a single image in a worker process.
+
+     Args:
+         image_data: Raw image bytes
+         quality: JPEG quality (1-100)
+         max_size: Maximum dimensions (width, height)
+         output_format: Output format (JPEG, PNG, WEBP)
+         min_width: Minimum width filter
+         min_height: Minimum height filter
+         min_area: Minimum area filter
+
+     Returns:
+         Tuple of (compressed_data, final_width, final_height) or None if filtered
+     """
+     try:
+         with io.BytesIO(image_data) as buffer:
+             img = Image.open(buffer)
+             img.load()
+             width, height = img.size
+
+             # Apply filter
+             if width < min_width or height < min_height or width * height < min_area:
+                 return None
+
+             # Resize if needed
+             img.thumbnail(max_size, Image.Resampling.LANCZOS)
+
+             # Convert to RGB for JPEG
+             if output_format.upper() == "JPEG" and img.mode in ("RGBA", "P", "LA"):
+                 background = Image.new("RGB", img.size, (255, 255, 255))
+                 if img.mode == "P":
+                     img = img.convert("RGBA")
+                 if img.mode in ("RGBA", "LA"):
+                     background.paste(img, mask=img.split()[-1])
+                 else:
+                     background.paste(img)
+                 img = background
+
+             # Compress to bytes
+             out_buffer = io.BytesIO()
+             save_kwargs: dict[str, Any] = {"format": output_format}
+             if output_format.upper() in ("JPEG", "WEBP"):
+                 save_kwargs["quality"] = quality
+             if output_format.upper() == "PNG":
+                 save_kwargs["optimize"] = True
+
+             img.save(out_buffer, **save_kwargs)
+             return out_buffer.getvalue(), img.size[0], img.size[1]
+     except Exception:
+         return None
+
+
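A minimal sketch of calling the worker directly, assuming `data` holds the raw bytes of some image; the thresholds shown here are illustrative, not the package defaults:

    result = _compress_image_worker(
        data,
        quality=85,
        max_size=(1920, 1080),
        output_format="JPEG",
        min_width=50,
        min_height=50,
        min_area=5000,
    )
    if result is None:
        print("filtered out or unreadable")
    else:
        jpeg_bytes, w, h = result

Because the function takes only picklable arguments and returns plain bytes, it can be shipped to a ProcessPoolExecutor worker unchanged, which is how process_and_save_multiprocess uses it further down.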
+ @dataclass
+ class ProcessedImage:
+     """Result of processing a single image.
+
+     Tracks the original position and processing outcome for each image,
+     enabling correct mapping during base64 replacement.
+     """
+
+     original_index: int  # 1-indexed position in original markdown
+     saved_path: Path | None  # None if filtered/deduplicated
+     skip_reason: str | None  # "duplicate" | "filtered" | "error" | None
+
+
+ @dataclass
+ class ImageProcessResult:
+     """Result of image processing."""
+
+     saved_images: list[ExtractedImage]
+     filtered_count: int
+     deduplicated_count: int
+     # Mapping from original 1-indexed position to processing result.
+     # This enables correct base64 replacement even when images are filtered.
+     index_mapping: dict[int, ProcessedImage] | None = None
+
+
+ class ImageProcessor:
+     """Processor for image extraction, compression, and filtering."""
+
+     # Regex pattern to match base64 data URIs in markdown
+     # Support MIME types like png, jpeg, x-emf, x-wmf (with hyphens)
+     DATA_URI_PATTERN = re.compile(
+         r"!\[([^\]]*)\]\(data:image/([\w+.-]+);base64,([A-Za-z0-9+/=]+)\)"
+     )
+
+     def __init__(self, config: ImageConfig | None = None) -> None:
+         """Initialize with optional image configuration."""
+         self.config = config
+         self._seen_hashes: set[str] = set()
+
+     def _convert_to_png(self, image_data: bytes, original_fmt: str) -> bytes:
+         """Convert unsupported image formats (EMF/WMF) to PNG.
+
+         On Windows, uses Pillow, which has native EMF/WMF support.
+         On other platforms, falls back to LibreOffice if available.
+         """
+         import platform
+
+         # Normalize format name
+         fmt_lower = original_fmt.lower().replace("x-", "")  # x-emf -> emf
+
+         # On Windows, Pillow can natively read EMF/WMF files
+         if platform.system() == "Windows" and fmt_lower in ("emf", "wmf"):
+             try:
+                 with io.BytesIO(image_data) as buffer:
+                     img = Image.open(buffer)
+                     # Load at higher DPI for better quality
+                     # WmfImagePlugin.load() accepts a dpi parameter
+                     img.load(dpi=150)  # type: ignore[call-arg]
+
+                     # Convert to RGB if necessary (EMF/WMF loads as RGB)
+                     if img.mode not in ("RGB", "RGBA"):
+                         img = img.convert("RGB")
+
+                     # Save as PNG
+                     out_buffer = io.BytesIO()
+                     img.save(out_buffer, format="PNG")
+                     return out_buffer.getvalue()
+             except Exception:
+                 # Fall through to LibreOffice fallback
+                 pass
+
+         # Fallback to LibreOffice (for non-Windows or if Pillow fails)
+         import subprocess
+         import tempfile
+         import uuid
+
+         from markitai.utils.office import find_libreoffice
+
+         soffice = find_libreoffice()
+         if not soffice:
+             return image_data
+
+         try:
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 temp_path = Path(temp_dir)
+                 # Ensure extension doesn't have special chars
+                 ext = re.sub(r"[^a-zA-Z0-9]", "", original_fmt)
+                 temp_in = temp_path / f"temp_{uuid.uuid4().hex[:8]}.{ext}"
+                 temp_in.write_bytes(image_data)
+
+                 # Create isolated user profile for concurrent LibreOffice execution
+                 profile_path = temp_path / "lo_profile"
+                 profile_path.mkdir()
+                 profile_url = profile_path.as_uri()
+
+                 cmd = [
+                     soffice,
+                     "--headless",
+                     f"-env:UserInstallation={profile_url}",
+                     "--convert-to",
+                     "png",
+                     "--outdir",
+                     str(temp_path),
+                     str(temp_in),
+                 ]
+
+                 subprocess.run(cmd, capture_output=True, timeout=30)
+
+                 # LibreOffice output filename depends on input filename
+                 temp_out = temp_path / f"{temp_in.stem}.png"
+                 if temp_out.exists():
+                     return temp_out.read_bytes()
+         except Exception:
+             pass
+
+         return image_data
+
+     def extract_base64_images(self, markdown: str) -> list[tuple[str, str, bytes]]:
+         """
+         Extract base64-encoded images from markdown content.
+
+         Args:
+             markdown: Markdown content containing data URIs
+
+         Returns:
+             List of (alt_text, mime_type, image_data) tuples
+         """
+         images = []
+         for match in self.DATA_URI_PATTERN.finditer(markdown):
+             alt_text = match.group(1)
+             image_type = match.group(2)
+             base64_data = match.group(3)
+
+             try:
+                 image_data = base64.b64decode(base64_data)
+
+                 # Handle EMF/WMF conversion
+                 if image_type.lower() in ("x-emf", "emf", "x-wmf", "wmf"):
+                     image_data = self._convert_to_png(image_data, image_type)
+                     image_type = "png"
+
+                 mime_type = f"image/{image_type}"
+                 images.append((alt_text, mime_type, image_data))
+             except Exception:
+                 # Skip invalid base64 data
+                 continue
+
+         return images
+
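A short usage sketch, with a truncated stand-in payload; each match comes back as an (alt text, MIME type, decoded bytes) tuple:

    processor = ImageProcessor()
    md = "![logo](data:image/png;base64,iVBORw0KGgo=)"
    for alt, mime, data in processor.extract_base64_images(md):
        print(alt, mime, len(data))  # logo image/png 8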
+     def replace_base64_with_paths(
+         self,
+         markdown: str,
+         images: list[ExtractedImage],
+         assets_path: str = "assets",
+         index_mapping: dict[int, ProcessedImage] | None = None,
+     ) -> str:
+         """
+         Replace base64 data URIs with file paths in markdown.
+
+         When index_mapping is provided, uses position-based replacement to ensure
+         each base64 image is replaced with the correct saved image, even when
+         some images were filtered or deduplicated.
+
+         Args:
+             markdown: Original markdown with data URIs
+             images: List of saved images with paths
+             assets_path: Relative path to assets directory
+             index_mapping: Optional mapping from original index to ProcessedImage
+
+         Returns:
+             Markdown with data URIs replaced by file paths (filtered images removed)
+         """
+         if index_mapping:
+             # Use position-based replacement for correct mapping
+             current_index = 0
+
+             def replace_match_indexed(match: re.Match) -> str:
+                 nonlocal current_index
+                 current_index += 1  # 1-indexed
+                 processed = index_mapping.get(current_index)
+                 if processed is None:
+                     # No mapping for this index, keep original
+                     return match.group(0)
+                 if processed.saved_path is None:
+                     # Image was filtered/deduplicated, remove from output
+                     return ""
+                 return f"![{match.group(1)}]({assets_path}/{processed.saved_path.name})"
+
+             return self.DATA_URI_PATTERN.sub(replace_match_indexed, markdown)
+
+         # Legacy: sequential iteration (for backward compatibility)
+         image_iter = iter(images)
+
+         def replace_match(match: re.Match) -> str:
+             try:
+                 img = next(image_iter)
+                 return f"![{match.group(1)}]({assets_path}/{img.path.name})"
+             except StopIteration:
+                 return match.group(0)
+
+         return self.DATA_URI_PATTERN.sub(replace_match, markdown)
+
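A sketch of the position-based path: with two data URIs in `md` where the first was deduplicated, the mapping drops the first image and rewrites the second (the saved path below is hypothetical):

    mapping = {
        1: ProcessedImage(original_index=1, saved_path=None, skip_reason="duplicate"),
        2: ProcessedImage(
            original_index=2,
            saved_path=Path("out/assets/doc.0002.jpg"),
            skip_reason=None,
        ),
    }
    updated = processor.replace_base64_with_paths(md, images=[], index_mapping=mapping)
    # first data URI removed, second becomes ![...](assets/doc.0002.jpg)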
+     def strip_base64_images(
+         self,
+         markdown: str,
+         replacement_path: str | None = None,
+     ) -> str:
+         """
+         Remove all base64 data URIs from markdown.
+
+         Args:
+             markdown: Markdown content with data URIs
+             replacement_path: If provided, replace with this path; otherwise remove
+
+         Returns:
+             Markdown with base64 images removed or replaced
+         """
+
+         def replace_match(match: re.Match) -> str:
+             alt_text = match.group(1)
+             if replacement_path:
+                 return f"![{alt_text}]({replacement_path})"
+             return ""  # Remove the image entirely
+
+         return self.DATA_URI_PATTERN.sub(replace_match, markdown)
+
+     @staticmethod
+     def remove_nonexistent_images(
+         markdown: str,
+         assets_dir: Path,
+     ) -> str:
+         """
+         Remove image references that don't exist in the assets directory.
+
+         The LLM may hallucinate non-existent image references. This method
+         validates each assets/ image reference and removes those that
+         don't exist on disk.
+
+         Args:
+             markdown: Markdown content with image references
+             assets_dir: Path to the assets directory
+
+         Returns:
+             Markdown with non-existent image references removed
+         """
+         # Pattern to match image references: ![alt](assets/filename) or ![alt](assets\filename)
+         # Support both forward slash and backslash for Windows compatibility
+         img_pattern = re.compile(r"!\[[^\]]*\]\(assets[/\\]([^)]+)\)")
+
+         # Invalid filename patterns that indicate placeholders or hallucinations
+         invalid_patterns = {"...", "..", ".", "placeholder", "image", "filename"}
+
+         def validate_image(match: re.Match) -> str:
+             filename = match.group(1)
+             # Check for placeholder patterns
+             if filename.strip() in invalid_patterns or filename.strip() == "":
+                 return ""
+             image_path = assets_dir / filename
+             if image_path.exists():
+                 return match.group(0)  # Keep existing image
+             # Remove non-existent image reference
+             return ""
+
+         result = img_pattern.sub(validate_image, markdown)
+
+         # Clean up any resulting double spaces or empty lines
+         result = re.sub(r" +", " ", result)  # Multiple spaces to single
+         result = re.sub(r"\n{3,}", "\n\n", result)  # 3+ newlines to 2
+
+         return result
+
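For illustration, assuming an `out/assets` directory on disk:

    md = "![ok](assets/report.0001.jpg) ![bad](assets/placeholder)"
    cleaned = ImageProcessor.remove_nonexistent_images(md, Path("out/assets"))
    # "placeholder" is in the invalid-name set and is always dropped;
    # report.0001.jpg survives only if out/assets/report.0001.jpg exists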
+     @staticmethod
+     def remove_hallucinated_images(
+         llm_output: str,
+         original_content: str,
+     ) -> str:
+         """Remove hallucinated image URLs from LLM output.
+
+         The LLM may hallucinate image URLs that don't exist in the original
+         content. This method compares image URLs in the LLM output against the
+         original and removes any that weren't present originally.
+
+         Args:
+             llm_output: LLM-processed markdown content
+             original_content: Original markdown before LLM processing
+
+         Returns:
+             LLM output with hallucinated image references removed
+         """
+         # Extract all image URLs from original content
+         img_pattern = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
+         original_urls = set(img_pattern.findall(original_content))
+
+         # Also extract URLs without markdown syntax (bare URLs in original)
+         url_pattern = re.compile(r"https?://[^\s\)\"'>]+")
+         original_urls.update(url_pattern.findall(original_content))
+
+         def validate_image(match: re.Match) -> str:
+             full_match = match.group(0)
+             url = match.group(1)
+
+             # Keep local asset references (handled by remove_nonexistent_images)
+             if url.startswith("assets/") or url.startswith("assets\\"):
+                 return full_match
+
+             # Keep relative URLs (likely internal links)
+             if not url.startswith("http://") and not url.startswith("https://"):
+                 return full_match
+
+             # Check if this URL existed in the original
+             if url in original_urls:
+                 return full_match
+
+             # URL is hallucinated - remove it
+             logger.debug(f"Removing hallucinated image URL: {url}")
+             return ""
+
+         result = img_pattern.sub(validate_image, llm_output)
+
+         # Clean up any resulting empty lines
+         result = re.sub(r"\n{3,}", "\n\n", result)
+
+         return result
+
+     def compress(
+         self,
+         image: Image.Image,
+         quality: int = DEFAULT_IMAGE_QUALITY,
+         max_size: tuple[int, int] = (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT),
+         output_format: str = "JPEG",
+     ) -> tuple[Image.Image, bytes]:
+         """
+         Compress an image.
+
+         Note: the input image is resized in place via thumbnail().
+
+         Args:
+             image: PIL Image to compress
+             quality: JPEG quality (1-100)
+             max_size: Maximum dimensions (width, height)
+             output_format: Output format (JPEG, PNG, WEBP)
+
+         Returns:
+             Tuple of (compressed image, compressed data)
+         """
+         # Resize if needed
+         image.thumbnail(max_size, Image.Resampling.LANCZOS)
+
+         # Convert to RGB for JPEG (no alpha channel)
+         if output_format.upper() == "JPEG" and image.mode in ("RGBA", "P", "LA"):
+             # Create white background
+             background = Image.new("RGB", image.size, (255, 255, 255))
+             if image.mode == "P":
+                 image = image.convert("RGBA")
+             background.paste(
+                 image, mask=image.split()[-1] if image.mode == "RGBA" else None
+             )
+             image = background
+
+         # Compress to bytes
+         buffer = io.BytesIO()
+         save_kwargs: dict[str, Any] = {"format": output_format}
+         if output_format.upper() in ("JPEG", "WEBP"):
+             save_kwargs["quality"] = quality
+         if output_format.upper() == "PNG":
+             save_kwargs["optimize"] = True
+
+         image.save(buffer, **save_kwargs)
+         compressed_data = buffer.getvalue()
+
+         return image, compressed_data
+
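A sketch with a hypothetical input file; note that compress resizes the passed image in place:

    img = Image.open("photo.png")
    img, data = processor.compress(
        img, quality=80, max_size=(1280, 1280), output_format="JPEG"
    )
    Path("photo.jpg").write_bytes(data)  # alpha/palette input is flattened onto white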
+     def save_screenshot(
+         self,
+         pix_samples: bytes,
+         width: int,
+         height: int,
+         output_path: Path,
+         max_bytes: int = DEFAULT_SCREENSHOT_MAX_BYTES,
+     ) -> tuple[int, int]:
+         """
+         Save a screenshot with compression to ensure it's under the size limit.
+
+         Converts raw pixel data to a PIL Image, compresses using the configured
+         quality, and progressively reduces quality if needed to stay under
+         max_bytes.
+
+         Args:
+             pix_samples: Raw RGB pixel data from pymupdf pixmap.samples
+             width: Image width
+             height: Image height
+             output_path: Path to save the image
+             max_bytes: Maximum file size in bytes (default 5MB for LLM providers)
+
+         Returns:
+             Tuple of (final_width, final_height) after any resizing
+         """
+         # Convert raw samples to PIL Image
+         image = Image.frombytes("RGB", (width, height), pix_samples)
+
+         # Get quality from config or use default
+         quality = self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
+         max_width = self.config.max_width if self.config else DEFAULT_IMAGE_MAX_WIDTH
+         max_height = self.config.max_height if self.config else DEFAULT_IMAGE_MAX_HEIGHT
+         output_format = (self.config.format if self.config else "jpeg").upper()
+         if output_format == "JPG":
+             output_format = "JPEG"
+
+         # Resize to configured max dimensions
+         image.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)
+
+         # Convert to RGB for JPEG
+         if output_format == "JPEG" and image.mode in ("RGBA", "P", "LA"):
+             background = Image.new("RGB", image.size, (255, 255, 255))
+             if image.mode == "P":
+                 image = image.convert("RGBA")
+             if image.mode in ("RGBA", "LA"):
+                 background.paste(image, mask=image.split()[-1])
+             else:
+                 background.paste(image)
+             image = background
+
+         # Try compressing with the configured quality first
+         for q in [quality, 70, 55, 40, 25]:
+             buffer = io.BytesIO()
+             save_kwargs: dict[str, Any] = {"format": output_format}
+             if output_format in ("JPEG", "WEBP"):
+                 save_kwargs["quality"] = q
+                 save_kwargs["optimize"] = True
+             elif output_format == "PNG":
+                 save_kwargs["optimize"] = True
+
+             image.save(buffer, **save_kwargs)
+             data = buffer.getvalue()
+
+             if len(data) <= max_bytes:
+                 output_path.write_bytes(data)
+                 if q < quality:
+                     logger.debug(
+                         f"Screenshot compressed: quality {quality}->{q}, "
+                         f"size {len(data) / 1024:.1f}KB"
+                     )
+                 return image.size
+
+         # Last resort: aggressive resize
+         image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
+         buffer = io.BytesIO()
+         image.save(buffer, format="JPEG", quality=20, optimize=True)
+         data = buffer.getvalue()
+         output_path.write_bytes(data)
+         logger.warning(f"Screenshot aggressively compressed: {len(data) / 1024:.1f}KB")
+         return image.size
+
+     def should_filter(self, width: int, height: int) -> bool:
+         """
+         Check if an image should be filtered out based on size.
+
+         Args:
+             width: Image width in pixels
+             height: Image height in pixels
+
+         Returns:
+             True if image should be filtered out
+         """
+         if not self.config:
+             return False
+
+         filter_config = self.config.filter
+
+         if width < filter_config.min_width:
+             return True
+         if height < filter_config.min_height:
+             return True
+         if width * height < filter_config.min_area:
+             return True
+
+         return False
+
+     def is_duplicate(self, image_data: bytes) -> bool:
+         """
+         Check if image is a duplicate based on hash.
+
+         Args:
+             image_data: Raw image data
+
+         Returns:
+             True if image is a duplicate
+         """
+         if not self.config or not self.config.filter.deduplicate:
+             return False
+
+         image_hash = hashlib.md5(image_data).hexdigest()
+         if image_hash in self._seen_hashes:
+             return True
+
+         self._seen_hashes.add(image_hash)
+         return False
+
+     def process_and_save(
+         self,
+         images: list[tuple[str, str, bytes]],
+         output_dir: Path,
+         base_name: str,
+     ) -> ImageProcessResult:
+         """
+         Process and save a list of images.
+
+         Args:
+             images: List of (alt_text, mime_type, image_data) tuples
+             output_dir: Directory to save images
+             base_name: Base name for image files
+
+         Returns:
+             ImageProcessResult with saved images, statistics, and index mapping
+         """
+         # Delayed import to avoid circular import
+         from markitai.converter.base import ExtractedImage
+
+         # Create assets directory
+         assets_dir = ensure_assets_dir(output_dir)
+
+         saved_images: list[ExtractedImage] = []
+         filtered_count = 0
+         deduplicated_count = 0
+         index_mapping: dict[int, ProcessedImage] = {}
+
+         # Determine output format
+         output_format = "JPEG"
+         extension = "jpg"
+         if self.config:
+             format_map = {
+                 "jpeg": ("JPEG", "jpg"),
+                 "png": ("PNG", "png"),
+                 "webp": ("WEBP", "webp"),
+             }
+             output_format, extension = format_map.get(
+                 self.config.format, ("JPEG", "jpg")
+             )
+
+         for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
+             # Check for duplicates
+             if self.is_duplicate(image_data):
+                 deduplicated_count += 1
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="duplicate"
+                 )
+                 continue
+
+             # Load image
+             try:
+                 # The buffer is released in the finally block below
+                 img_buffer = io.BytesIO(image_data)
+                 try:
+                     img = Image.open(img_buffer)
+                     # Load image data immediately so we can release the buffer
+                     img.load()
+
+                     width, height = img.size
+
+                     # Check filter
+                     if self.should_filter(width, height):
+                         filtered_count += 1
+                         index_mapping[idx] = ProcessedImage(
+                             original_index=idx, saved_path=None, skip_reason="filtered"
+                         )
+                         img.close()
+                         continue
+
+                     # Compress
+                     quality = (
+                         self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
+                     )
+                     max_size = (
+                         (self.config.max_width, self.config.max_height)
+                         if self.config
+                         else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
+                     )
+
+                     if self.config and self.config.compress:
+                         # No need for img.copy() - compress may modify the image
+                         # since we don't need the original after this
+                         compressed_img, compressed_data = self.compress(
+                             img,
+                             quality=quality,
+                             max_size=max_size,
+                             output_format=output_format,
+                         )
+                         final_width, final_height = compressed_img.size
+                         # Release the compressed image
+                         compressed_img.close()
+                     else:
+                         compressed_data = image_data
+                         final_width, final_height = width, height
+
+                     # Close original image to release memory
+                     img.close()
+
+                     # Generate filename
+                     filename = f"{base_name}.{idx:04d}.{extension}"
+                     output_path = assets_dir / filename
+
+                     # Save
+                     output_path.write_bytes(compressed_data)
+
+                     # Release compressed data reference
+                     del compressed_data
+
+                     extracted = ExtractedImage(
+                         path=output_path,
+                         index=idx,
+                         original_name=filename,
+                         mime_type=f"image/{extension}",
+                         width=final_width,
+                         height=final_height,
+                     )
+                     saved_images.append(extracted)
+                     index_mapping[idx] = ProcessedImage(
+                         original_index=idx, saved_path=output_path, skip_reason=None
+                     )
+                 finally:
+                     img_buffer.close()
+
+             except Exception:
+                 # Skip invalid images - record as an error
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="error"
+                 )
+                 continue
+
+         return ImageProcessResult(
+             saved_images=saved_images,
+             filtered_count=filtered_count,
+             deduplicated_count=deduplicated_count,
+             index_mapping=index_mapping,
+         )
+
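Taken together, the synchronous pipeline reads as follows (a sketch; `markdown` is assumed to hold converter output with embedded data URIs):

    processor = ImageProcessor()  # or ImageProcessor(config=some_image_config)
    images = processor.extract_base64_images(markdown)
    result = processor.process_and_save(images, output_dir=Path("out"), base_name="doc")
    markdown = processor.replace_base64_with_paths(
        markdown, result.saved_images, index_mapping=result.index_mapping
    )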
+     def reset_dedup_cache(self) -> None:
+         """Reset the deduplication hash cache."""
+         self._seen_hashes.clear()
+
+     async def process_and_save_async(
+         self,
+         images: list[tuple[str, str, bytes]],
+         output_dir: Path,
+         base_name: str,
+         max_concurrency: int = DEFAULT_IMAGE_IO_CONCURRENCY,
+     ) -> ImageProcessResult:
+         """Process and save a list of images with async I/O.
+
+         This is an optimized version that uses asyncio for concurrent I/O
+         operations while keeping CPU-bound image processing sequential.
+
+         Args:
+             images: List of (alt_text, mime_type, image_data) tuples
+             output_dir: Directory to save images
+             base_name: Base name for image files
+             max_concurrency: Maximum concurrent I/O operations
+
+         Returns:
+             ImageProcessResult with saved images, statistics, and index mapping
+         """
+         # Delayed imports to avoid circular import
+         from markitai.converter.base import ExtractedImage
+         from markitai.security import write_bytes_async
+
+         # Create assets directory
+         assets_dir = ensure_assets_dir(output_dir)
+
+         saved_images: list[ExtractedImage] = []
+         filtered_count = 0
+         deduplicated_count = 0
+         index_mapping: dict[int, ProcessedImage] = {}
+
+         # Determine output format
+         output_format = "JPEG"
+         extension = "jpg"
+         if self.config:
+             format_map = {
+                 "jpeg": ("JPEG", "jpg"),
+                 "png": ("PNG", "png"),
+                 "webp": ("WEBP", "webp"),
+             }
+             output_format, extension = format_map.get(
+                 self.config.format, ("JPEG", "jpg")
+             )
+
+         # First pass: process images (CPU-bound, sequential)
+         processed_images: list[tuple[int, bytes, int, int]] = []
+         for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
+             # Check for duplicates
+             if self.is_duplicate(image_data):
+                 deduplicated_count += 1
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="duplicate"
+                 )
+                 continue
+
+             # Load and process image
+             try:
+                 with Image.open(io.BytesIO(image_data)) as img:
+                     width, height = img.size
+
+                     # Check filter
+                     if self.should_filter(width, height):
+                         filtered_count += 1
+                         index_mapping[idx] = ProcessedImage(
+                             original_index=idx, saved_path=None, skip_reason="filtered"
+                         )
+                         continue
+
+                     # Compress
+                     quality = (
+                         self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
+                     )
+                     max_size = (
+                         (self.config.max_width, self.config.max_height)
+                         if self.config
+                         else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
+                     )
+
+                     if self.config and self.config.compress:
+                         compressed_img, compressed_data = self.compress(
+                             img.copy(),
+                             quality=quality,
+                             max_size=max_size,
+                             output_format=output_format,
+                         )
+                         final_width, final_height = compressed_img.size
+                     else:
+                         compressed_data = image_data
+                         final_width, final_height = width, height
+
+                     processed_images.append(
+                         (idx, compressed_data, final_width, final_height)
+                     )
+
+             except Exception:
+                 # Skip invalid images
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="error"
+                 )
+                 continue
+
+         # Second pass: save images concurrently (I/O-bound)
+         semaphore = asyncio.Semaphore(max_concurrency)
+
+         async def save_image(
+             idx: int, data: bytes, width: int, height: int
+         ) -> tuple[int, ExtractedImage | None, Path | None]:
+             filename = f"{base_name}.{idx:04d}.{extension}"
+             output_path = assets_dir / filename
+
+             async with semaphore:
+                 try:
+                     await write_bytes_async(output_path, data)
+                     return (
+                         idx,
+                         ExtractedImage(
+                             path=output_path,
+                             index=idx,
+                             original_name=filename,
+                             mime_type=f"image/{extension}",
+                             width=width,
+                             height=height,
+                         ),
+                         output_path,
+                     )
+                 except Exception:
+                     return idx, None, None
+
+         # Run all saves concurrently
+         tasks = [
+             save_image(idx, data, width, height)
+             for idx, data, width, height in processed_images
+         ]
+         results = await asyncio.gather(*tasks)
+
+         # Collect successful saves and build index mapping
+         for idx, extracted, output_path in results:
+             if extracted is not None:
+                 saved_images.append(extracted)
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=output_path, skip_reason=None
+                 )
+             else:
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="error"
+                 )
+
+         # Sort by index to maintain order
+         saved_images.sort(key=lambda x: x.index)
+
+         return ImageProcessResult(
+             saved_images=saved_images,
+             filtered_count=filtered_count,
+             deduplicated_count=deduplicated_count,
+             index_mapping=index_mapping,
+         )
+
+     async def process_and_save_multiprocess(
+         self,
+         images: list[tuple[str, str, bytes]],
+         output_dir: Path,
+         base_name: str,
+         max_workers: int | None = None,
+         max_io_concurrency: int = DEFAULT_IMAGE_IO_CONCURRENCY,
+     ) -> ImageProcessResult:
+         """Process and save images using multiprocessing for CPU-bound compression.
+
+         This version uses ProcessPoolExecutor to parallelize image compression
+         across multiple CPU cores, bypassing the GIL limitation.
+
+         Args:
+             images: List of (alt_text, mime_type, image_data) tuples
+             output_dir: Directory to save images
+             base_name: Base name for image files
+             max_workers: Max worker processes (default: cpu_count // 2)
+             max_io_concurrency: Maximum concurrent I/O operations
+
+         Returns:
+             ImageProcessResult with saved images, statistics, and index mapping
+         """
+         # Delayed imports to avoid circular import
+         from markitai.converter.base import ExtractedImage
+         from markitai.security import write_bytes_async
+
+         if not images:
+             return ImageProcessResult(
+                 saved_images=[],
+                 filtered_count=0,
+                 deduplicated_count=0,
+                 index_mapping={},
+             )
+
+         # Create assets directory
+         assets_dir = ensure_assets_dir(output_dir)
+
+         # Determine output format
+         output_format = "JPEG"
+         extension = "jpg"
+         if self.config:
+             format_map = {
+                 "jpeg": ("JPEG", "jpg"),
+                 "png": ("PNG", "png"),
+                 "webp": ("WEBP", "webp"),
+             }
+             output_format, extension = format_map.get(
+                 self.config.format, ("JPEG", "jpg")
+             )
+
+         # Get compression parameters
+         quality = self.config.quality if self.config else DEFAULT_IMAGE_QUALITY
+         max_size = (
+             (self.config.max_width, self.config.max_height)
+             if self.config
+             else (DEFAULT_IMAGE_MAX_WIDTH, DEFAULT_IMAGE_MAX_HEIGHT)
+         )
+         compress_enabled = self.config.compress if self.config else True
+
+         # Get filter parameters
+         min_width = self.config.filter.min_width if self.config else 50
+         min_height = self.config.filter.min_height if self.config else 50
+         min_area = self.config.filter.min_area if self.config else 5000
+
+         # Prepare work items (filter duplicates first)
+         work_items: list[tuple[int, bytes]] = []
+         deduplicated_count = 0
+         index_mapping: dict[int, ProcessedImage] = {}
+         for idx, (_alt_text, _mime_type, image_data) in enumerate(images, start=1):
+             if self.is_duplicate(image_data):
+                 deduplicated_count += 1
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="duplicate"
+                 )
+                 continue
+             work_items.append((idx, image_data))
+
+         if not work_items:
+             return ImageProcessResult(
+                 saved_images=[],
+                 filtered_count=0,
+                 deduplicated_count=deduplicated_count,
+                 index_mapping=index_mapping,
+             )
+
+         # Determine worker count (use half of CPUs to avoid system overload)
+         if max_workers is None:
+             max_workers = max(1, (os.cpu_count() or 4) // 2)
+
+         # Process images in parallel using ProcessPoolExecutor
+         loop = asyncio.get_running_loop()
+         processed_results: list[tuple[int, bytes, int, int]] = []
+         filtered_count = 0
+
+         # Use ProcessPoolExecutor for CPU-bound compression
+         with ProcessPoolExecutor(max_workers=max_workers) as executor:
+             futures = []
+             for idx, image_data in work_items:
+                 if compress_enabled:
+                     future = loop.run_in_executor(
+                         executor,
+                         _compress_image_worker,
+                         image_data,
+                         quality,
+                         max_size,
+                         output_format,
+                         min_width,
+                         min_height,
+                         min_area,
+                     )
+                     futures.append((idx, future))
+                 else:
+                     # No compression, just validate size
+                     try:
+                         with io.BytesIO(image_data) as buffer:
+                             img = Image.open(buffer)
+                             w, h = img.size
+                             if w >= min_width and h >= min_height and w * h >= min_area:
+                                 processed_results.append((idx, image_data, w, h))
+                             else:
+                                 filtered_count += 1
+                                 index_mapping[idx] = ProcessedImage(
+                                     original_index=idx,
+                                     saved_path=None,
+                                     skip_reason="filtered",
+                                 )
+                     except Exception:
+                         index_mapping[idx] = ProcessedImage(
+                             original_index=idx, saved_path=None, skip_reason="error"
+                         )
+
+             # Gather results from workers
+             for idx, future in futures:
+                 try:
+                     result = await future
+                     if result is None:
+                         filtered_count += 1
+                         index_mapping[idx] = ProcessedImage(
+                             original_index=idx, saved_path=None, skip_reason="filtered"
+                         )
+                     else:
+                         compressed_data, final_w, final_h = result
+                         processed_results.append(
+                             (idx, compressed_data, final_w, final_h)
+                         )
+                 except Exception:
+                     filtered_count += 1
+                     index_mapping[idx] = ProcessedImage(
+                         original_index=idx, saved_path=None, skip_reason="error"
+                     )
+
+         # Second pass: save images concurrently (I/O-bound)
+         semaphore = asyncio.Semaphore(max_io_concurrency)
+         saved_images: list[ExtractedImage] = []
+
+         async def save_image(
+             idx: int, data: bytes, width: int, height: int
+         ) -> tuple[int, ExtractedImage | None, Path | None]:
+             filename = f"{base_name}.{idx:04d}.{extension}"
+             output_path = assets_dir / filename
+
+             async with semaphore:
+                 try:
+                     await write_bytes_async(output_path, data)
+                     return (
+                         idx,
+                         ExtractedImage(
+                             path=output_path,
+                             index=idx,
+                             original_name=filename,
+                             mime_type=f"image/{extension}",
+                             width=width,
+                             height=height,
+                         ),
+                         output_path,
+                     )
+                 except Exception:
+                     return idx, None, None
+
+         # Run all saves concurrently
+         tasks = [
+             save_image(idx, data, width, height)
+             for idx, data, width, height in processed_results
+         ]
+         results = await asyncio.gather(*tasks)
+
+         # Collect successful saves and build index mapping
+         for idx, extracted, output_path in results:
+             if extracted is not None:
+                 saved_images.append(extracted)
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=output_path, skip_reason=None
+                 )
+             else:
+                 index_mapping[idx] = ProcessedImage(
+                     original_index=idx, saved_path=None, skip_reason="error"
+                 )
+
+         # Sort by index to maintain order
+         saved_images.sort(key=lambda x: x.index)
+
+         return ImageProcessResult(
+             saved_images=saved_images,
+             filtered_count=filtered_count,
+             deduplicated_count=deduplicated_count,
+             index_mapping=index_mapping,
+         )
+
+
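A usage sketch under the same assumptions as the earlier pipeline example, driven from an event loop (the method must be awaited because the final writes go through the async I/O path):

    async def convert(markdown: str) -> str:
        processor = ImageProcessor()
        images = processor.extract_base64_images(markdown)
        result = await processor.process_and_save_multiprocess(
            images, output_dir=Path("out"), base_name="doc", max_workers=4
        )
        return processor.replace_base64_with_paths(
            markdown, result.saved_images, index_mapping=result.index_mapping
        )

    # asyncio.run(convert(markdown_text))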
+ # =============================================================================
+ # URL Image Download
+ # =============================================================================
+
+ # Pattern to match markdown images: ![alt](url)
+ # Excludes data: URIs (base64-encoded images)
+ _URL_IMAGE_PATTERN = re.compile(
+     r"!\[([^\]]*)\]\((?!data:)([^)]+)\)",
+     re.IGNORECASE,
+ )
+
+ # Common image extensions
+ _IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg", ".bmp", ".ico"}
+
+
+ @dataclass
+ class UrlImageDownloadResult:
+     """Result of downloading images from URLs."""
+
+     updated_markdown: str
+     downloaded_paths: list[Path]
+     failed_urls: list[str]
+     url_to_path: dict[str, Path] = field(
+         default_factory=dict
+     )  # URL -> local path mapping
+
+
+ def _get_extension_from_content_type(content_type: str) -> str:
+     """Get file extension from the content-type header."""
+     return get_extension_from_mime(content_type)
+
+
+ def _get_extension_from_url(url: str) -> str | None:
+     """Extract image extension from URL path."""
+     parsed = urlparse(url)
+     path = parsed.path.lower()
+     # Defensive: strip any query string left embedded in the path
+     path = path.split("?")[0]
+     for ext in _IMAGE_EXTENSIONS:
+         if path.endswith(ext):
+             return ext
+     return None
+
+
+ def _sanitize_image_filename(name: str, max_length: int = 100) -> str:
+     """Sanitize filename for cross-platform compatibility."""
+     # Remove or replace invalid characters
+     invalid_chars = '<>:"/\\|?*'
+     for char in invalid_chars:
+         name = name.replace(char, "_")
+     # Remove control characters
+     name = "".join(c for c in name if ord(c) >= 32)
+     # Limit length
+     if len(name) > max_length:
+         name = name[:max_length]
+     return name.strip() or "image"
+
+
+ async def download_url_images(
+     markdown: str,
+     output_dir: Path,
+     base_url: str,
+     config: ImageConfig,
+     source_name: str = "url",
+     concurrency: int = 5,
+     timeout: int = 30,
+ ) -> UrlImageDownloadResult:
+     """Download images from URLs in markdown and save them to the assets directory.
+
+     This function:
+     1. Finds all image URLs in markdown (excluding data: URIs)
+     2. Downloads images concurrently with rate limiting
+     3. Saves them to the assets directory with proper naming
+     4. Replaces URLs with local paths in markdown
+     5. Skips failed downloads (keeps the original URL, logs a warning)
+
+     Args:
+         markdown: Markdown content with image URLs
+         output_dir: Output directory (assets will be created inside)
+         base_url: Base URL for resolving relative image paths
+         config: Image configuration (format, quality, etc.)
+         source_name: Source identifier for naming images
+         concurrency: Max concurrent downloads (default 5)
+         timeout: HTTP request timeout in seconds (default 30)
+
+     Returns:
+         UrlImageDownloadResult with:
+         - updated_markdown: Markdown with local paths for downloaded images
+         - downloaded_paths: List of successfully downloaded image paths
+         - failed_urls: List of URLs that failed to download
+         - url_to_path: Mapping from image URL to saved local path
+     """
+     # Find all image URLs
+     matches = list(_URL_IMAGE_PATTERN.finditer(markdown))
+     if not matches:
+         return UrlImageDownloadResult(
+             updated_markdown=markdown,
+             downloaded_paths=[],
+             failed_urls=[],
+         )
+
+     # Create assets directory
+     assets_dir = ensure_assets_dir(output_dir)
+
+     # Prepare download tasks
+     semaphore = asyncio.Semaphore(concurrency)
+     downloaded_paths: list[Path] = []
+     failed_urls: list[str] = []
+     replacements: dict[str, str] = {}  # original_match -> replacement
+     url_to_path: dict[str, Path] = {}  # image_url -> local_path mapping
+
+     # Sanitize source name for filenames
+     safe_source = _sanitize_image_filename(source_name, max_length=50)
+
+     async def download_single(
+         client: httpx.AsyncClient,
+         match: re.Match,
+         index: int,
+     ) -> None:
+         """Download a single image."""
+         alt_text = match.group(1)
+         image_url = match.group(2).strip()
+         original_match = match.group(0)
+
+         # Resolve relative URLs
+         if not image_url.startswith(("http://", "https://", "//")):
+             image_url = urljoin(base_url, image_url)
+         elif image_url.startswith("//"):
+             # Protocol-relative URL
+             parsed_base = urlparse(base_url)
+             image_url = f"{parsed_base.scheme}:{image_url}"
+
+         async with semaphore:
+             try:
+                 response = await client.get(
+                     image_url,
+                     follow_redirects=True,
+                     timeout=timeout,
+                 )
+                 response.raise_for_status()
+
+                 # Determine file extension
+                 content_type = response.headers.get("content-type", "")
+                 ext = _get_extension_from_url(image_url)
+                 if not ext:
+                     ext = _get_extension_from_content_type(content_type)
+
+                 # Generate filename: source_name.NNNN.ext (1-indexed, 4 digits)
+                 filename = f"{safe_source}.{index + 1:04d}{ext}"
+                 output_path = assets_dir / filename
+
+                 # Process image (apply quality settings if configured)
+                 image_data = response.content
+                 if ext.lower() in (".jpg", ".jpeg", ".png", ".webp"):
+                     try:
+                         processed = _compress_image_worker(
+                             image_data,
+                             quality=config.quality,
+                             max_size=(config.max_width, config.max_height),
+                             output_format=config.format.upper(),
+                             min_width=config.filter.min_width,
+                             min_height=config.filter.min_height,
+                             min_area=config.filter.min_area,
+                         )
+                         if processed:
+                             image_data, _, _ = processed
+                             # Update extension if format changed
+                             if config.format.lower() != ext[1:].lower():
+                                 ext = f".{config.format.lower()}"
+                                 filename = f"{safe_source}.{index + 1:04d}{ext}"
+                                 output_path = assets_dir / filename
+                         else:
+                             # Image was filtered out (too small)
+                             logger.debug(
+                                 f"Image filtered (too small): {image_url[:60]}..."
+                             )
+                             return
+                     except Exception as e:
+                         logger.debug(f"Image processing failed, saving original: {e}")
+
+                 # Save to file
+                 output_path.write_bytes(image_data)
+                 downloaded_paths.append(output_path)
+
+                 # Prepare replacement with local path
+                 local_path = f"assets/{filename}"
+                 replacements[original_match] = f"![{alt_text}]({local_path})"
+
+                 # Track URL-to-path mapping for post-processing
+                 url_to_path[image_url] = output_path
+
+                 logger.debug(f"Downloaded: {image_url[:60]}... -> {output_path}")
+
+             except httpx.TimeoutException:
+                 logger.warning(f"Timeout downloading image: {image_url[:80]}...")
+                 failed_urls.append(image_url)
+             except httpx.HTTPStatusError as e:
+                 logger.warning(
+                     f"HTTP {e.response.status_code} downloading: {image_url[:80]}..."
+                 )
+                 failed_urls.append(image_url)
+             except Exception as e:
+                 logger.warning(f"Failed to download image: {image_url[:80]}... - {e}")
+                 failed_urls.append(image_url)
+
+     # Download all images concurrently
+     async with httpx.AsyncClient(
+         headers={
+             "User-Agent": "Mozilla/5.0 (compatible; markitai/0.3.0; +https://github.com/Ynewtime/markitai)"
+         },
+         follow_redirects=True,
+     ) as client:
+         tasks = [
+             download_single(client, match, idx) for idx, match in enumerate(matches)
+         ]
+         await asyncio.gather(*tasks)
+
+     # Apply replacements to markdown
+     updated_markdown = markdown
+     for original, replacement in replacements.items():
+         updated_markdown = updated_markdown.replace(original, replacement)
+
+     return UrlImageDownloadResult(
+         updated_markdown=updated_markdown,
+         downloaded_paths=downloaded_paths,
+         failed_urls=failed_urls,
+         url_to_path=url_to_path,
+     )
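A closing sketch of the downloader, assuming `markdown` came from a fetched page and `image_config` is an ImageConfig instance, run outside any existing event loop:

    result = asyncio.run(
        download_url_images(
            markdown,
            output_dir=Path("out"),
            base_url="https://example.com/post",
            config=image_config,
            source_name="post",
        )
    )
    print(len(result.downloaded_paths), "saved;", len(result.failed_urls), "failed")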