kreuzberg-3.14.1-py3-none-any.whl → kreuzberg-3.16.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
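
The bulk of the change is new image-extraction and image-OCR plumbing in the extractor base class and the email/HTML extractors, shown in the hunks below. A hypothetical usage sketch: the config field names (extract_images, deduplicate_images, ocr_extracted_images, image_ocr_backend) are taken from the diff itself, but the extract_file entry point and the exact ExtractionConfig signature should be verified against the 3.16.0 API.

# Hypothetical usage sketch; config field names come from the diff below.
import asyncio

from kreuzberg import ExtractionConfig, extract_file

config = ExtractionConfig(
    extract_images=True,             # collect embedded images and image attachments
    deduplicate_images=True,         # CRC32-based dedup (see _base.py below)
    ocr_extracted_images=True,       # run OCR over each image that survives the limits
    image_ocr_backend="tesseract",   # falls back to ocr_backend when left as None
)

async def main() -> None:
    result = await extract_file("report.eml", config=config)
    for ocr in result.image_ocr_results:
        print(ocr.image.filename, ocr.skipped_reason or ocr.ocr_result.content[:80])

asyncio.run(main())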
--- a/kreuzberg/_extractors/_base.py
+++ b/kreuzberg/_extractors/_base.py
@@ -1,16 +1,41 @@
 from __future__ import annotations
 
+import io
+import logging
+import time
+import zlib
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, ClassVar
+from dataclasses import asdict
+from multiprocessing import cpu_count
+from typing import TYPE_CHECKING, Any, ClassVar
 
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from PIL import Image
+
+from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractedImage,
+    ExtractionResult,
+    ImageOCRResult,
+    PaddleOCRConfig,
+    TesseractConfig,
+    normalize_metadata,
+)
 from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+from kreuzberg._utils._sync import run_taskgroup_batched
 
 if TYPE_CHECKING:
     from pathlib import Path
 
     from kreuzberg._types import ExtractionConfig
 
+MAX_TOTAL_IMAGE_SIZE_MB = 100
+MAX_SINGLE_IMAGE_SIZE_MB = 50
+MAX_TOTAL_IMAGE_SIZE = MAX_TOTAL_IMAGE_SIZE_MB * 1024 * 1024
+MAX_SINGLE_IMAGE_SIZE = MAX_SINGLE_IMAGE_SIZE_MB * 1024 * 1024
+
+logger = logging.getLogger(__name__)
+
 
 class Extractor(ABC):
     __slots__ = ("config", "mime_type")
@@ -52,11 +77,252 @@ class Extractor(ABC):
 
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
+        deduplicated_images = self._deduplicate_images(result.images) if result.images else []
+
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
             metadata=normalize_metadata(enhanced_metadata),
+            tables=result.tables,
             chunks=result.chunks,
+            images=deduplicated_images,
+            image_ocr_results=result.image_ocr_results,
+            entities=result.entities,
+            keywords=result.keywords,
             detected_languages=result.detected_languages,
-            tables=result.tables,
+            document_type=result.document_type,
+            document_type_confidence=result.document_type_confidence,
+            layout=result.layout,
+        )
+
+    def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        """Filter images based on memory safety limits."""
+        if not images:
+            return []
+
+        images_with_sizes = [(img, len(img.data)) for img in images]
+
+        valid_images = []
+        for img, size in images_with_sizes:
+            if size <= MAX_SINGLE_IMAGE_SIZE:
+                valid_images.append((img, size))
+            else:
+                logger.warning(
+                    "Skipping image %s: size %d MB exceeds limit of %d MB",
+                    img.filename or "unknown",
+                    size // (1024 * 1024),
+                    MAX_SINGLE_IMAGE_SIZE_MB,
+                )
+
+        total_size = sum(size for _, size in valid_images)
+
+        if total_size <= MAX_TOTAL_IMAGE_SIZE:
+            return [img for img, _ in valid_images]
+
+        logger.warning(
+            "Total image size %d MB exceeds limit of %d MB, selecting subset",
+            total_size // (1024 * 1024),
+            MAX_TOTAL_IMAGE_SIZE_MB,
         )
+
+        sorted_images = sorted(valid_images, key=lambda x: x[1])
+        selected = []
+        current_size = 0
+
+        for img, img_size in sorted_images:
+            if current_size + img_size <= MAX_TOTAL_IMAGE_SIZE:
+                selected.append(img)
+                current_size += img_size
+            else:
+                logger.debug("Skipping image %s: would exceed total memory limit", img.filename or "unknown")
+
+        return selected
+
+    _SMALL_IMAGE_THRESHOLD = 1024
+    _HASH_SAMPLE_SIZE = 512
+
+    def _compute_image_hash(self, img: ExtractedImage) -> int:
+        """Compute hash for image deduplication using progressive hashing.
+
+        For small images (<1KB), hash the entire content.
+        For larger images, use size + first/last bytes for quick comparison.
+
+        Args:
+            img: Image to hash
+
+        Returns:
+            Hash value for deduplication
+        """
+        data_len = len(img.data)
+
+        if data_len < self._SMALL_IMAGE_THRESHOLD:
+            return zlib.crc32(img.data) & 0xFFFFFFFF
+
+        hash_components = [
+            str(data_len).encode(),
+            img.data[: self._HASH_SAMPLE_SIZE],
+            img.data[-self._HASH_SAMPLE_SIZE :],
+            img.format.encode() if img.format else b"",
+        ]
+
+        combined = b"".join(hash_components)
+        return zlib.crc32(combined) & 0xFFFFFFFF
+
+    def _deduplicate_images(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        if not self.config.deduplicate_images or not images:
+            return images
+
+        seen_hashes = set()
+        unique_images = []
+
+        for img in images:
+            img_hash = self._compute_image_hash(img)
+            if img_hash not in seen_hashes:
+                seen_hashes.add(img_hash)
+                unique_images.append(img)
+            else:
+                logger.debug("Filtered duplicate image: %s", img.filename)
+
+        if len(unique_images) < len(images):
+            logger.info("Deduplicated %d images to %d unique", len(images), len(unique_images))
+
+        return unique_images
+
+    def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
+        """Prepare OCR configuration for the specified backend.
+
+        Args:
+            backend_name: Name of the OCR backend
+
+        Returns:
+            Configuration dictionary for the backend
+        """
+        default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
+        config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
+
+        if backend_name == "tesseract":
+            default_config = TesseractConfig()
+            config_class = TesseractConfig
+        elif backend_name == "easyocr":
+            default_config = EasyOCRConfig()
+            config_class = EasyOCRConfig
+        elif backend_name == "paddleocr":
+            default_config = PaddleOCRConfig()
+            config_class = PaddleOCRConfig
+        else:
+            raise ValueError(f"Unknown OCR backend: {backend_name}")
+
+        cfg: dict[str, Any] = asdict(default_config)
+
+        if self.config.ocr_config and isinstance(self.config.ocr_config, config_class):
+            user_cfg: dict[str, Any] = asdict(self.config.ocr_config)
+            cfg.update(user_cfg)
+
+        cfg["use_cache"] = self.config.use_cache
+        return cfg
+
+    def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
+        """Validate if an image is suitable for OCR processing.
+
+        Args:
+            img: Image to validate
+
+        Returns:
+            Reason for skipping if invalid, None if valid
+        """
+        fmt = img.format.lower()
+        if fmt not in self.config.image_ocr_formats:
+            return f"Unsupported format: {img.format}"
+
+        if img.dimensions is not None:
+            w, h = img.dimensions
+            min_w, min_h = self.config.image_ocr_min_dimensions
+            max_w, max_h = self.config.image_ocr_max_dimensions
+
+            if w < min_w or h < min_h:
+                return f"Too small: {w}x{h}"
+            if w > max_w or h > max_h:
+                return f"Too large: {w}x{h}"
+
+        return None
+
+    async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
+        """Process a single image with OCR.
+
+        Args:
+            target: Image to process
+            backend: OCR backend instance
+            cfg: Configuration for the backend
+
+        Returns:
+            OCR result for the image
+        """
+        try:
+            start = time.time()
+            pil_img = Image.open(io.BytesIO(target.data))
+            ocr_res = await backend.process_image(pil_img, **cfg)
+            duration = time.time() - start
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ocr_res,
+                confidence_score=None,
+                processing_time=duration,
+            )
+        except (OSError, ValueError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
+            )
+        except (RuntimeError, TypeError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"Backend error: {type(e).__name__}: {e}",
+            )
+
+    async def _process_images_with_ocr(
+        self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
+    ) -> list[ImageOCRResult]:
+        """Process multiple images with OCR.
+
+        Args:
+            images: Tuple or list of images to process
+
+        Returns:
+            List of OCR results
+        """
+        if not images or not self.config.ocr_extracted_images:
+            return []
+
+        images_list = list(self._deduplicate_images(list(images)))
+        images_list = self._check_image_memory_limits(images_list)
+
+        backend_name = self.config.image_ocr_backend or self.config.ocr_backend
+        if backend_name is None:
+            return []
+
+        cfg = self._prepare_ocr_config(backend_name)
+        backend = get_ocr_backend(backend_name)
+
+        results: list[ImageOCRResult] = []
+        tasks = []
+
+        for img in images_list:
+            skip_reason = self._validate_image_for_ocr(img)
+            if skip_reason:
+                results.append(
+                    ImageOCRResult(
+                        image=img,
+                        ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                        skipped_reason=skip_reason,
+                    )
+                )
+            else:
+                tasks.append(self._ocr_single_image(img, backend, cfg))
+
+        if tasks:
+            batch_size = max(1, min(len(tasks), cpu_count()))
+            results.extend(await run_taskgroup_batched(*tasks, batch_size=batch_size))
+
+        return results
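
A note on the `_compute_image_hash` method above: it deliberately trades exactness for speed by hashing only the size, head, tail, and format of large payloads. A minimal standalone sketch of the same progressive-hash idea (constants and names are local to this example, not the library's):

# Progressive hash sketch: small payloads hashed whole, large ones by
# size + head/tail samples. This is a dedup heuristic, not an identity.
import zlib

SMALL_IMAGE_THRESHOLD = 1024   # hash the whole payload below this size
HASH_SAMPLE_SIZE = 512         # bytes sampled from head and tail otherwise

def quick_image_hash(data: bytes, fmt: str = "") -> int:
    if len(data) < SMALL_IMAGE_THRESHOLD:
        return zlib.crc32(data) & 0xFFFFFFFF
    combined = b"".join(
        [str(len(data)).encode(), data[:HASH_SAMPLE_SIZE], data[-HASH_SAMPLE_SIZE:], fmt.encode()]
    )
    return zlib.crc32(combined) & 0xFFFFFFFF

a = bytes(10_000)
b = bytearray(a)
b[5_000] = 1  # differs only in the unsampled middle
print(quick_image_hash(a, "png") == quick_image_hash(bytes(b), "png"))  # True: a known collision

As the example shows, two equal-size images differing only in unsampled middle bytes collide; that false-positive rate is the accepted cost of not hashing multi-megabyte payloads in full.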
--- a/kreuzberg/_extractors/_email.py
+++ b/kreuzberg/_extractors/_email.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import base64
 import re
 from html import unescape
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -8,9 +9,8 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
-from kreuzberg._utils._string import normalize_spaces
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, normalize_metadata
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
     html2text = None
 
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
 
 
 class EmailExtractor(Extractor):
@@ -84,24 +86,25 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-        if isinstance(field, list):
-            emails = []
-            for item in field:
-                if isinstance(item, dict):
-                    email = item.get("email", "")
-                    if email:
-                        emails.append(email)
-                else:
-                    emails.append(str(item))
-            return ", ".join(emails)
-        if isinstance(field, dict):
-            return str(field.get("email", ""))
-        return str(field)
+        match field:
+            case list():
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
+            case dict():
+                return str(field.get("email", ""))
+            case _:
+                return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         text_content = parsed_email.get("text")
         if text_content:
-            text_parts.append(f"\n{text_content}")
+            text_parts.append(str(text_content))
             return
 
         html_content = parsed_email.get("html")
@@ -111,20 +114,80 @@ class EmailExtractor(Extractor):
             h.ignore_links = True
             h.ignore_images = True
             converted_text = h.handle(html_content)
-            text_parts.append(f"\n{converted_text}")
+            text_parts.append(converted_text)
         else:
-            clean_html = _HTML_TAG_PATTERN.sub("", html_content)
+            cleaned = re.sub(r"<script[^>]*>.*?</script>", "", html_content, flags=re.IGNORECASE | re.DOTALL)
+            cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
+            clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
             clean_html = unescape(clean_html)
-            text_parts.append(f"\n{clean_html}")
+            clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+            clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
+            text_parts.append(clean_html)
 
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        if parsed_email.get("attachments"):
-            attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
-            metadata["attachments"] = attachment_names
-            if attachment_names:
-                text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
+        attachments = parsed_email.get("attachments")
+        if not isinstance(attachments, list):
+            return
+        names: list[str] = []
+        for att in attachments:
+            name_val: str = "unknown"
+            if isinstance(att, dict):
+                n = att.get("name") or att.get("filename")
+                if isinstance(n, str) and n:
+                    name_val = n
+            names.append(name_val)
+        if names:
+            metadata["attachments"] = names
+            text_parts.append("Attachments: " + ", ".join(names))
+
+    def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        attachments = parsed_email.get("attachments") or []
+        if not isinstance(attachments, list):
+            return []
+
+        for idx, att in enumerate(attachments, start=1):
+            if not isinstance(att, dict):
+                continue
+
+            mime = att.get("mime") or att.get("content_type") or att.get("type")
+            if not isinstance(mime, str) or not mime.startswith("image/"):
+                continue
+
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
+            data = att.get("data") or att.get("content") or att.get("payload")
+            raw: bytes | None = None
+            if isinstance(data, (bytes, bytearray)):
+                raw = bytes(data)
+            elif isinstance(data, str):
+                try:
+                    raw = base64.b64decode(data)
+                except Exception:  # noqa: BLE001
+                    raw = data.encode()
+
+            if raw is None:
+                continue
+
+            fmt = mime.split("/", 1)[1].lower()
+            if name and "." in name:
+                ext = name.rsplit(".", 1)[-1].lower()
+                if ext:
+                    fmt = ext
+
+            filename = name or f"attachment_image_{idx}.{fmt}"
+            images.append(
+                ExtractedImage(
+                    data=raw,
+                    format=fmt,
+                    filename=filename,
+                    page_number=None,
+                )
+            )
+
+        return images
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         if mailparse is None:
@@ -142,13 +205,24 @@ class EmailExtractor(Extractor):
 
             combined_text = "\n".join(text_parts)
 
-            return ExtractionResult(
-                content=normalize_spaces(combined_text),
+            result = ExtractionResult(
+                content=combined_text,
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata=normalize_metadata(metadata),
                 chunks=[],
             )
 
+            if self.config.extract_images:
+                images = self._extract_images_from_attachments(parsed_email)
+                result.images = images
+                if self.config.ocr_extracted_images and result.images:
+                    image_ocr_results: list[ImageOCRResult] = run_maybe_async(
+                        self._process_images_with_ocr, result.images
+                    )
+                    result.image_ocr_results = image_ocr_results
+
+            return result
+
         except Exception as e:
             msg = f"Failed to parse email content: {e}"
             raise RuntimeError(msg) from e
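
The attachment handling above is deliberately tolerant: it probes several key names (data/content/payload) and accepts either raw bytes or a base64 string before giving up. A minimal sketch of that decode step; the sample_attachment dict here is hypothetical, not a mailparse fixture:

# Tolerant attachment decode, mirroring _extract_images_from_attachments.
import base64

def coerce_attachment_bytes(att: dict) -> bytes | None:
    data = att.get("data") or att.get("content") or att.get("payload")
    if isinstance(data, (bytes, bytearray)):
        return bytes(data)                 # already raw bytes
    if isinstance(data, str):
        try:
            return base64.b64decode(data)  # common transfer encoding
        except Exception:
            return data.encode()           # fall back: treat as plain text payload
    return None                            # nothing usable; caller skips the attachment

sample_attachment = {
    "mime": "image/png",
    "name": "logo.png",
    "data": base64.b64encode(b"\x89PNG...").decode(),
}
print(coerce_attachment_bytes(sample_attachment)[:4])  # b'\x89PNG'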
--- a/kreuzberg/_extractors/_html.py
+++ b/kreuzberg/_extractors/_html.py
@@ -1,29 +1,44 @@
 from __future__ import annotations
 
+import base64
+import binascii
+import io
+import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
+from bs4 import BeautifulSoup
+from PIL import Image
 
-from kreuzberg._extractors._base import Extractor
+from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 
 if TYPE_CHECKING:
     from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 class HTMLExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        return await run_sync(self.extract_bytes_sync, content)
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content = await AsyncPath(path).read_bytes()
-        return await run_sync(self.extract_bytes_sync, content)
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         config = self.config.html_to_markdown_config if self.config else None
@@ -32,12 +47,102 @@ class HTMLExtractor(Extractor):
 
         config_dict = config.to_dict()
 
-        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+        html_content = safe_decode(content)
+
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
+        result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
 
-        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
+
+        if self.config.extract_images:
+            extraction_result.images = self._extract_images_from_html(html_content)
+            if self.config.ocr_extracted_images and extraction_result.images:
+                extraction_result.image_ocr_results = run_maybe_async(
+                    self._process_images_with_ocr, extraction_result.images
+                )
 
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()
         return self.extract_bytes_sync(content)
+
+    def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        soup = BeautifulSoup(html_content, "xml")
+
+        for img in soup.find_all("img"):
+            src_val = img.get("src")  # type: ignore[union-attr]
+            if isinstance(src_val, str) and src_val.startswith("data:image/"):
+                try:
+                    header, data = src_val.split(",", 1)
+                    mime_type = header.split(";")[0].split(":")[1]
+                    format_name = mime_type.split("/")[1]
+
+                    if not data or len(data) < 4:
+                        logger.debug("Skipping empty or too small base64 data")
+                        continue
+
+                    if len(data) > 67 * 1024 * 1024:
+                        logger.warning("Skipping base64 image larger than 67MB")
+                        continue
+
+                    image_data = base64.b64decode(data)
+
+                    if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
+                        logger.warning(
+                            "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
+                        )
+                        continue
+
+                    dimensions = None
+                    try:
+                        with Image.open(io.BytesIO(image_data)) as pil_img:
+                            dimensions = pil_img.size
+                    except (OSError, ValueError) as e:
+                        logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
+
+                    alt_val = img.get("alt")  # type: ignore[union-attr]
+                    desc = alt_val if isinstance(alt_val, str) else None
+                    images.append(
+                        ExtractedImage(
+                            data=image_data,
+                            format=format_name,
+                            filename=f"embedded_image_{len(images) + 1}.{format_name}",
+                            description=desc,
+                            dimensions=dimensions,
+                        )
+                    )
+                except (ValueError, binascii.Error) as e:
+                    logger.warning("Failed to extract base64 image: %s", e)
+
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
+            try:
+                svg_content = str(svg_element).encode("utf-8")
+
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
+                desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
+                )
+            except (UnicodeEncodeError, AttributeError) as e:
+                logger.warning("Failed to extract SVG: %s", e)
+                return None
+
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)
+
+        return images
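
The data: URI handling above reduces to three string splits before the base64 decode. A self-contained sketch of those steps; the tiny PNG is generated with Pillow purely to build a well-formed example URI:

# Data URI parsing, mirroring _extract_images_from_html.
import base64
import io

from PIL import Image

buf = io.BytesIO()
Image.new("RGB", (1, 1)).save(buf, format="PNG")  # hypothetical 1x1 sample image
data_uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

header, payload = data_uri.split(",", 1)
mime_type = header.split(";")[0].split(":")[1]   # "image/png"
format_name = mime_type.split("/")[1]            # "png"
image_data = base64.b64decode(payload)

with Image.open(io.BytesIO(image_data)) as img:  # dimensions, as the extractor records them
    print(format_name, img.size)                 # png (1, 1)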