kreuzberg 3.14.1__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +127 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +180 -21
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,41 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import io
4
+ import logging
5
+ import time
6
+ import zlib
3
7
  from abc import ABC, abstractmethod
4
- from typing import TYPE_CHECKING, ClassVar
8
+ from dataclasses import asdict
9
+ from multiprocessing import cpu_count
10
+ from typing import TYPE_CHECKING, Any, ClassVar
5
11
 
6
- from kreuzberg._types import ExtractionResult, normalize_metadata
12
+ from PIL import Image
13
+
14
+ from kreuzberg._ocr import get_ocr_backend
15
+ from kreuzberg._types import (
16
+ EasyOCRConfig,
17
+ ExtractedImage,
18
+ ExtractionResult,
19
+ ImageOCRResult,
20
+ PaddleOCRConfig,
21
+ TesseractConfig,
22
+ normalize_metadata,
23
+ )
7
24
  from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
25
+ from kreuzberg._utils._sync import run_taskgroup_batched
8
26
 
9
27
  if TYPE_CHECKING:
10
28
  from pathlib import Path
11
29
 
12
30
  from kreuzberg._types import ExtractionConfig
13
31
 
32
+ MAX_TOTAL_IMAGE_SIZE_MB = 100
33
+ MAX_SINGLE_IMAGE_SIZE_MB = 50
34
+ MAX_TOTAL_IMAGE_SIZE = MAX_TOTAL_IMAGE_SIZE_MB * 1024 * 1024
35
+ MAX_SINGLE_IMAGE_SIZE = MAX_SINGLE_IMAGE_SIZE_MB * 1024 * 1024
36
+
37
+ logger = logging.getLogger(__name__)
38
+
14
39
 
15
40
  class Extractor(ABC):
16
41
  __slots__ = ("config", "mime_type")
@@ -52,11 +77,252 @@ class Extractor(ABC):
52
77
 
53
78
  enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
54
79
 
80
+ deduplicated_images = self._deduplicate_images(result.images) if result.images else []
81
+
55
82
  return ExtractionResult(
56
83
  content=cleaned_content,
57
84
  mime_type=result.mime_type,
58
85
  metadata=normalize_metadata(enhanced_metadata),
86
+ tables=result.tables,
59
87
  chunks=result.chunks,
88
+ images=deduplicated_images,
89
+ image_ocr_results=result.image_ocr_results,
90
+ entities=result.entities,
91
+ keywords=result.keywords,
60
92
  detected_languages=result.detected_languages,
61
- tables=result.tables,
93
+ document_type=result.document_type,
94
+ document_type_confidence=result.document_type_confidence,
95
+ layout=result.layout,
96
+ )
97
+
98
+ def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
99
+ """Filter images based on memory safety limits."""
100
+ if not images:
101
+ return []
102
+
103
+ images_with_sizes = [(img, len(img.data)) for img in images]
104
+
105
+ valid_images = []
106
+ for img, size in images_with_sizes:
107
+ if size <= MAX_SINGLE_IMAGE_SIZE:
108
+ valid_images.append((img, size))
109
+ else:
110
+ logger.warning(
111
+ "Skipping image %s: size %d MB exceeds limit of %d MB",
112
+ img.filename or "unknown",
113
+ size // (1024 * 1024),
114
+ MAX_SINGLE_IMAGE_SIZE_MB,
115
+ )
116
+
117
+ total_size = sum(size for _, size in valid_images)
118
+
119
+ if total_size <= MAX_TOTAL_IMAGE_SIZE:
120
+ return [img for img, _ in valid_images]
121
+
122
+ logger.warning(
123
+ "Total image size %d MB exceeds limit of %d MB, selecting subset",
124
+ total_size // (1024 * 1024),
125
+ MAX_TOTAL_IMAGE_SIZE_MB,
62
126
  )
127
+
128
+ sorted_images = sorted(valid_images, key=lambda x: x[1])
129
+ selected = []
130
+ current_size = 0
131
+
132
+ for img, img_size in sorted_images:
133
+ if current_size + img_size <= MAX_TOTAL_IMAGE_SIZE:
134
+ selected.append(img)
135
+ current_size += img_size
136
+ else:
137
+ logger.debug("Skipping image %s: would exceed total memory limit", img.filename or "unknown")
138
+
139
+ return selected
140
+
141
+ _SMALL_IMAGE_THRESHOLD = 1024
142
+ _HASH_SAMPLE_SIZE = 512
143
+
144
+ def _compute_image_hash(self, img: ExtractedImage) -> int:
145
+ """Compute hash for image deduplication using progressive hashing.
146
+
147
+ For small images (<1KB), hash the entire content.
148
+ For larger images, use size + first/last bytes for quick comparison.
149
+
150
+ Args:
151
+ img: Image to hash
152
+
153
+ Returns:
154
+ Hash value for deduplication
155
+ """
156
+ data_len = len(img.data)
157
+
158
+ if data_len < self._SMALL_IMAGE_THRESHOLD:
159
+ return zlib.crc32(img.data) & 0xFFFFFFFF
160
+
161
+ hash_components = [
162
+ str(data_len).encode(),
163
+ img.data[: self._HASH_SAMPLE_SIZE],
164
+ img.data[-self._HASH_SAMPLE_SIZE :],
165
+ img.format.encode() if img.format else b"",
166
+ ]
167
+
168
+ combined = b"".join(hash_components)
169
+ return zlib.crc32(combined) & 0xFFFFFFFF
170
+
171
+ def _deduplicate_images(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
172
+ if not self.config.deduplicate_images or not images:
173
+ return images
174
+
175
+ seen_hashes = set()
176
+ unique_images = []
177
+
178
+ for img in images:
179
+ img_hash = self._compute_image_hash(img)
180
+ if img_hash not in seen_hashes:
181
+ seen_hashes.add(img_hash)
182
+ unique_images.append(img)
183
+ else:
184
+ logger.debug("Filtered duplicate image: %s", img.filename)
185
+
186
+ if len(unique_images) < len(images):
187
+ logger.info("Deduplicated %d images to %d unique", len(images), len(unique_images))
188
+
189
+ return unique_images
190
+
191
+ def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
192
+ """Prepare OCR configuration for the specified backend.
193
+
194
+ Args:
195
+ backend_name: Name of the OCR backend
196
+
197
+ Returns:
198
+ Configuration dictionary for the backend
199
+ """
200
+ default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
201
+ config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
202
+
203
+ if backend_name == "tesseract":
204
+ default_config = TesseractConfig()
205
+ config_class = TesseractConfig
206
+ elif backend_name == "easyocr":
207
+ default_config = EasyOCRConfig()
208
+ config_class = EasyOCRConfig
209
+ elif backend_name == "paddleocr":
210
+ default_config = PaddleOCRConfig()
211
+ config_class = PaddleOCRConfig
212
+ else:
213
+ raise ValueError(f"Unknown OCR backend: {backend_name}")
214
+
215
+ cfg: dict[str, Any] = asdict(default_config)
216
+
217
+ if self.config.ocr_config and isinstance(self.config.ocr_config, config_class):
218
+ user_cfg: dict[str, Any] = asdict(self.config.ocr_config)
219
+ cfg.update(user_cfg)
220
+
221
+ cfg["use_cache"] = self.config.use_cache
222
+ return cfg
223
+
224
+ def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
225
+ """Validate if an image is suitable for OCR processing.
226
+
227
+ Args:
228
+ img: Image to validate
229
+
230
+ Returns:
231
+ Reason for skipping if invalid, None if valid
232
+ """
233
+ fmt = img.format.lower()
234
+ if fmt not in self.config.image_ocr_formats:
235
+ return f"Unsupported format: {img.format}"
236
+
237
+ if img.dimensions is not None:
238
+ w, h = img.dimensions
239
+ min_w, min_h = self.config.image_ocr_min_dimensions
240
+ max_w, max_h = self.config.image_ocr_max_dimensions
241
+
242
+ if w < min_w or h < min_h:
243
+ return f"Too small: {w}x{h}"
244
+ if w > max_w or h > max_h:
245
+ return f"Too large: {w}x{h}"
246
+
247
+ return None
248
+
249
+ async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
250
+ """Process a single image with OCR.
251
+
252
+ Args:
253
+ target: Image to process
254
+ backend: OCR backend instance
255
+ cfg: Configuration for the backend
256
+
257
+ Returns:
258
+ OCR result for the image
259
+ """
260
+ try:
261
+ start = time.time()
262
+ pil_img = Image.open(io.BytesIO(target.data))
263
+ ocr_res = await backend.process_image(pil_img, **cfg)
264
+ duration = time.time() - start
265
+ return ImageOCRResult(
266
+ image=target,
267
+ ocr_result=ocr_res,
268
+ confidence_score=None,
269
+ processing_time=duration,
270
+ )
271
+ except (OSError, ValueError) as e: # pragma: no cover
272
+ return ImageOCRResult(
273
+ image=target,
274
+ ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
275
+ skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
276
+ )
277
+ except (RuntimeError, TypeError) as e: # pragma: no cover
278
+ return ImageOCRResult(
279
+ image=target,
280
+ ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
281
+ skipped_reason=f"Backend error: {type(e).__name__}: {e}",
282
+ )
283
+
284
+ async def _process_images_with_ocr(
285
+ self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
286
+ ) -> list[ImageOCRResult]:
287
+ """Process multiple images with OCR.
288
+
289
+ Args:
290
+ images: Tuple or list of images to process
291
+
292
+ Returns:
293
+ List of OCR results
294
+ """
295
+ if not images or not self.config.ocr_extracted_images:
296
+ return []
297
+
298
+ images_list = list(self._deduplicate_images(list(images)))
299
+ images_list = self._check_image_memory_limits(images_list)
300
+
301
+ backend_name = self.config.image_ocr_backend or self.config.ocr_backend
302
+ if backend_name is None:
303
+ return []
304
+
305
+ cfg = self._prepare_ocr_config(backend_name)
306
+ backend = get_ocr_backend(backend_name)
307
+
308
+ results: list[ImageOCRResult] = []
309
+ tasks = []
310
+
311
+ for img in images_list:
312
+ skip_reason = self._validate_image_for_ocr(img)
313
+ if skip_reason:
314
+ results.append(
315
+ ImageOCRResult(
316
+ image=img,
317
+ ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
318
+ skipped_reason=skip_reason,
319
+ )
320
+ )
321
+ else:
322
+ tasks.append(self._ocr_single_image(img, backend, cfg))
323
+
324
+ if tasks:
325
+ batch_size = max(1, min(len(tasks), cpu_count()))
326
+ results.extend(await run_taskgroup_batched(*tasks, batch_size=batch_size))
327
+
328
+ return results
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
3
4
  import re
4
5
  from html import unescape
5
6
  from typing import TYPE_CHECKING, Any, ClassVar
@@ -8,9 +9,8 @@ from anyio import Path as AsyncPath
8
9
 
9
10
  from kreuzberg._extractors._base import Extractor
10
11
  from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
11
- from kreuzberg._types import ExtractionResult, normalize_metadata
12
- from kreuzberg._utils._string import normalize_spaces
13
- from kreuzberg._utils._sync import run_sync
12
+ from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, normalize_metadata
13
+ from kreuzberg._utils._sync import run_maybe_async, run_sync
14
14
  from kreuzberg.exceptions import MissingDependencyError
15
15
 
16
16
  if TYPE_CHECKING:
@@ -84,24 +84,18 @@ class EmailExtractor(Extractor):
84
84
  text_parts.append(f"BCC: {bcc_formatted}")
85
85
 
86
86
  def _format_email_field(self, field: Any) -> str:
87
- if isinstance(field, list):
88
- emails = []
89
- for item in field:
90
- if isinstance(item, dict):
91
- email = item.get("email", "")
92
- if email:
93
- emails.append(email)
94
- else:
95
- emails.append(str(item))
96
- return ", ".join(emails)
97
- if isinstance(field, dict):
98
- return str(field.get("email", ""))
99
- return str(field)
87
+ match field:
88
+ case list():
89
+ return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
90
+ case dict():
91
+ return str(field.get("email", ""))
92
+ case _:
93
+ return str(field)
100
94
 
101
95
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
102
96
  text_content = parsed_email.get("text")
103
97
  if text_content:
104
- text_parts.append(f"\n{text_content}")
98
+ text_parts.append(str(text_content))
105
99
  return
106
100
 
107
101
  html_content = parsed_email.get("html")
@@ -111,20 +105,83 @@ class EmailExtractor(Extractor):
111
105
  h.ignore_links = True
112
106
  h.ignore_images = True
113
107
  converted_text = h.handle(html_content)
114
- text_parts.append(f"\n{converted_text}")
108
+ text_parts.append(converted_text)
115
109
  else:
116
- clean_html = _HTML_TAG_PATTERN.sub("", html_content)
110
+ cleaned = re.sub(r"<script[^>]*>.*?</script>", "", html_content, flags=re.IGNORECASE | re.DOTALL)
111
+ cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
112
+ clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
117
113
  clean_html = unescape(clean_html)
118
- text_parts.append(f"\n{clean_html}")
114
+ clean_html = (
115
+ clean_html.replace("\u201c", '"')
116
+ .replace("\u201d", '"')
117
+ .replace("\u2019", "'")
118
+ .replace("\u2018", "'")
119
+ )
120
+ text_parts.append(clean_html)
119
121
 
120
122
  def _extract_email_attachments(
121
123
  self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
122
124
  ) -> None:
123
- if parsed_email.get("attachments"):
124
- attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
125
- metadata["attachments"] = attachment_names
126
- if attachment_names:
127
- text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
125
+ attachments = parsed_email.get("attachments")
126
+ if not isinstance(attachments, list):
127
+ return
128
+ names: list[str] = []
129
+ for att in attachments:
130
+ name_val: str = "unknown"
131
+ if isinstance(att, dict):
132
+ n = att.get("name")
133
+ if isinstance(n, str) and n:
134
+ name_val = n
135
+ names.append(name_val)
136
+ metadata["attachments"] = names
137
+ if names:
138
+ text_parts.append("Attachments: " + ", ".join(names))
139
+
140
+ def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
141
+ images: list[ExtractedImage] = []
142
+ attachments = parsed_email.get("attachments") or []
143
+ if not isinstance(attachments, list):
144
+ return []
145
+
146
+ for idx, att in enumerate(attachments, start=1):
147
+ if not isinstance(att, dict):
148
+ continue
149
+
150
+ mime = att.get("mime") or att.get("content_type") or att.get("type")
151
+ if not isinstance(mime, str) or not mime.startswith("image/"):
152
+ continue
153
+
154
+ name = att.get("name") if isinstance(att.get("name"), str) else None
155
+ data = att.get("data") or att.get("content") or att.get("payload")
156
+ raw: bytes | None = None
157
+ if isinstance(data, (bytes, bytearray)):
158
+ raw = bytes(data)
159
+ elif isinstance(data, str):
160
+ try:
161
+ raw = base64.b64decode(data)
162
+ except Exception: # noqa: BLE001
163
+ raw = data.encode()
164
+
165
+ if raw is None:
166
+ continue
167
+
168
+ fmt = mime.split("/", 1)[1].lower()
169
+ if name and "." in name:
170
+ ext = name.rsplit(".", 1)[-1].lower()
171
+ if ext:
172
+ fmt = ext
173
+
174
+ filename = name or f"attachment_image_{idx}.{fmt}"
175
+ images.append(
176
+ ExtractedImage(
177
+ data=raw,
178
+ format=fmt,
179
+ filename=filename,
180
+ page_number=None,
181
+ )
182
+ )
183
+
184
+ return images
128
185
 
129
186
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
130
187
  if mailparse is None:
@@ -142,13 +199,24 @@ class EmailExtractor(Extractor):
142
199
 
143
200
  combined_text = "\n".join(text_parts)
144
201
 
145
- return ExtractionResult(
146
- content=normalize_spaces(combined_text),
202
+ result = ExtractionResult(
203
+ content=combined_text,
147
204
  mime_type=PLAIN_TEXT_MIME_TYPE,
148
205
  metadata=normalize_metadata(metadata),
149
206
  chunks=[],
150
207
  )
151
208
 
209
+ if self.config.extract_images:
210
+ images = self._extract_images_from_attachments(parsed_email)
211
+ result.images = images
212
+ if self.config.ocr_extracted_images and result.images:
213
+ image_ocr_results: list[ImageOCRResult] = run_maybe_async(
214
+ self._process_images_with_ocr, result.images
215
+ )
216
+ result.image_ocr_results = image_ocr_results
217
+
218
+ return result
219
+
152
220
  except Exception as e:
153
221
  msg = f"Failed to parse email content: {e}"
154
222
  raise RuntimeError(msg) from e
@@ -1,29 +1,40 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
4
+ import logging
3
5
  from typing import TYPE_CHECKING, ClassVar
4
6
 
5
7
  import html_to_markdown
6
8
  from anyio import Path as AsyncPath
9
+ from bs4 import BeautifulSoup
7
10
 
8
- from kreuzberg._extractors._base import Extractor
11
+ from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
9
12
  from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
10
- from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
13
+ from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
11
14
  from kreuzberg._utils._string import safe_decode
12
- from kreuzberg._utils._sync import run_sync
15
+ from kreuzberg._utils._sync import run_maybe_async, run_sync
13
16
 
14
17
  if TYPE_CHECKING:
15
18
  from pathlib import Path
16
19
 
20
+ logger = logging.getLogger(__name__)
21
+
17
22
 
18
23
  class HTMLExtractor(Extractor):
19
24
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
20
25
 
21
26
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
22
- return await run_sync(self.extract_bytes_sync, content)
27
+ result = await run_sync(self.extract_bytes_sync, content)
28
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
29
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
30
+ return result
23
31
 
24
32
  async def extract_path_async(self, path: Path) -> ExtractionResult:
25
33
  content = await AsyncPath(path).read_bytes()
26
- return await run_sync(self.extract_bytes_sync, content)
34
+ result = await run_sync(self.extract_bytes_sync, content)
35
+ if self.config.extract_images and self.config.ocr_extracted_images and result.images:
36
+ result.image_ocr_results = await self._process_images_with_ocr(result.images)
37
+ return result
27
38
 
28
39
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
29
40
  config = self.config.html_to_markdown_config if self.config else None
@@ -32,12 +43,79 @@ class HTMLExtractor(Extractor):
32
43
 
33
44
  config_dict = config.to_dict()
34
45
 
35
- result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
46
+ html_content = safe_decode(content)
47
+ result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
48
+
49
+ extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
36
50
 
37
- extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
51
+ if self.config.extract_images:
52
+ extraction_result.images = self._extract_images_from_html(html_content)
53
+ if self.config.ocr_extracted_images and extraction_result.images:
54
+ extraction_result.image_ocr_results = run_maybe_async(
55
+ self._process_images_with_ocr, extraction_result.images
56
+ )
38
57
 
39
58
  return self._apply_quality_processing(extraction_result)
40
59
 
41
60
  def extract_path_sync(self, path: Path) -> ExtractionResult:
42
61
  content = path.read_bytes()
43
62
  return self.extract_bytes_sync(content)
63
+
64
+ def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
65
+ images: list[ExtractedImage] = []
66
+ soup = BeautifulSoup(html_content, "xml")
67
+
68
+ for img in soup.find_all("img"):
69
+ src_val = img.get("src") # type: ignore[union-attr]
70
+ if isinstance(src_val, str) and src_val.startswith("data:image/"):
71
+ try:
72
+ header, data = src_val.split(",", 1)
73
+ mime_type = header.split(";")[0].split(":")[1]
74
+ format_name = mime_type.split("/")[1]
75
+
76
+ if not data or len(data) < 4:
77
+ logger.debug("Skipping empty or too small base64 data")
78
+ continue
79
+
80
+ if len(data) > 67 * 1024 * 1024:
81
+ logger.warning("Skipping base64 image larger than 67MB")
82
+ continue
83
+
84
+ image_data = base64.b64decode(data)
85
+
86
+ if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
87
+ logger.warning(
88
+ "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
89
+ )
90
+ continue
91
+
92
+ alt_val = img.get("alt") # type: ignore[union-attr]
93
+ desc = alt_val if isinstance(alt_val, str) else None
94
+ images.append(
95
+ ExtractedImage(
96
+ data=image_data,
97
+ format=format_name,
98
+ filename=f"embedded_image_{len(images) + 1}.{format_name}",
99
+ description=desc,
100
+ )
101
+ )
102
+ except Exception as e: # noqa: BLE001
103
+ logger.warning("Failed to extract base64 image: %s", e)
104
+
105
+ for svg in soup.find_all("svg"):
106
+ try:
107
+ svg_content = str(svg).encode("utf-8")
108
+ title_or_aria = svg.get("title") or svg.get("aria-label") # type: ignore[union-attr]
109
+ desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
110
+ images.append(
111
+ ExtractedImage(
112
+ data=svg_content,
113
+ format="svg",
114
+ filename=f"inline_svg_{len(images) + 1}.svg",
115
+ description=desc_svg,
116
+ )
117
+ )
118
+ except Exception as e: # noqa: BLE001, PERF203
119
+ logger.warning("Failed to extract SVG: %s", e)
120
+
121
+ return images
@@ -10,8 +10,9 @@ from anyio import Path as AsyncPath
10
10
  from PIL import Image
11
11
 
12
12
  from kreuzberg._extractors._base import Extractor
13
- from kreuzberg._mime_types import IMAGE_MIME_TYPES
13
+ from kreuzberg._mime_types import IMAGE_MIME_TO_EXT, IMAGE_MIME_TYPES
14
14
  from kreuzberg._ocr import get_ocr_backend
15
+ from kreuzberg._types import ExtractedImage
15
16
  from kreuzberg._utils._image_preprocessing import normalize_image_dpi
16
17
  from kreuzberg._utils._sync import run_sync
17
18
  from kreuzberg._utils._tmp import create_temp_file
@@ -26,33 +27,17 @@ if TYPE_CHECKING: # pragma: no cover
26
27
  class ImageExtractor(Extractor):
27
28
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
28
29
 
29
- IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = {
30
- "image/bmp": "bmp",
31
- "image/x-bmp": "bmp",
32
- "image/x-ms-bmp": "bmp",
33
- "image/gif": "gif",
34
- "image/jpeg": "jpg",
35
- "image/pjpeg": "jpg",
36
- "image/png": "png",
37
- "image/tiff": "tiff",
38
- "image/x-tiff": "tiff",
39
- "image/jp2": "jp2",
40
- "image/jpx": "jpx",
41
- "image/jpm": "jpm",
42
- "image/mj2": "mj2",
43
- "image/webp": "webp",
44
- "image/x-portable-anymap": "pnm",
45
- "image/x-portable-bitmap": "pbm",
46
- "image/x-portable-graymap": "pgm",
47
- "image/x-portable-pixmap": "ppm",
48
- }
30
+ IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = IMAGE_MIME_TO_EXT
49
31
 
50
32
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
51
33
  extension = self._get_extension_from_mime_type(self.mime_type)
52
34
  file_path, unlink = await create_temp_file(f".{extension}")
53
35
  await AsyncPath(file_path).write_bytes(content)
54
36
  try:
55
- return await self.extract_path_async(file_path)
37
+ result = await self.extract_path_async(file_path)
38
+ if self.config.extract_images:
39
+ result.images = [self._create_self_reference_image(content, self.mime_type)]
40
+ return result
56
41
  finally:
57
42
  await unlink()
58
43
 
@@ -69,6 +54,10 @@ class ImageExtractor(Extractor):
69
54
  if preprocessing_metadata:
70
55
  result.metadata["image_preprocessing"] = preprocessing_metadata
71
56
 
57
+ if self.config.extract_images:
58
+ content = await AsyncPath(path).read_bytes()
59
+ result.images = [self._create_self_reference_image(content, self.mime_type)]
60
+
72
61
  return self._apply_quality_processing(result)
73
62
 
74
63
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
@@ -97,6 +86,10 @@ class ImageExtractor(Extractor):
97
86
  if preprocessing_metadata:
98
87
  result.metadata["image_preprocessing"] = preprocessing_metadata
99
88
 
89
+ if self.config.extract_images:
90
+ content = path.read_bytes()
91
+ result.images = [self._create_self_reference_image(content, self.mime_type)]
92
+
100
93
  return self._apply_quality_processing(result)
101
94
 
102
95
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -108,3 +101,11 @@ class ImageExtractor(Extractor):
108
101
  return v
109
102
 
110
103
  raise ValidationError("unsupported mimetype", context={"mime_type": mime_type})
104
+
105
+ def _create_self_reference_image(self, image_data: bytes, mime_type: str) -> ExtractedImage:
106
+ return ExtractedImage(
107
+ data=image_data,
108
+ format=IMAGE_MIME_TO_EXT.get(mime_type, "unknown"),
109
+ filename="source_image",
110
+ page_number=1,
111
+ )