kreuzberg-3.14.0-py3-none-any.whl → kreuzberg-3.15.0-py3-none-any.whl
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +156 -30
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +4 -6
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +95 -27
- kreuzberg/_extractors/_html.py +85 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +209 -99
- kreuzberg/_extractors/_presentation.py +72 -8
- kreuzberg/_extractors/_spread_sheet.py +25 -30
- kreuzberg/_mcp/server.py +345 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +2 -2
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +74 -34
- kreuzberg/_types.py +182 -23
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_sync.py +36 -6
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +43 -27
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
- kreuzberg-3.15.0.dist-info/RECORD +60 -0
- kreuzberg-3.14.0.dist-info/RECORD +0 -58
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_base.py
CHANGED
@@ -1,16 +1,41 @@
 from __future__ import annotations
 
+import io
+import logging
+import time
+import zlib
 from abc import ABC, abstractmethod
-from …
+from dataclasses import asdict
+from multiprocessing import cpu_count
+from typing import TYPE_CHECKING, Any, ClassVar
 
-from …
+from PIL import Image
+
+from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractedImage,
+    ExtractionResult,
+    ImageOCRResult,
+    PaddleOCRConfig,
+    TesseractConfig,
+    normalize_metadata,
+)
 from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+from kreuzberg._utils._sync import run_taskgroup_batched
 
 if TYPE_CHECKING:
     from pathlib import Path
 
     from kreuzberg._types import ExtractionConfig
 
+MAX_TOTAL_IMAGE_SIZE_MB = 100
+MAX_SINGLE_IMAGE_SIZE_MB = 50
+MAX_TOTAL_IMAGE_SIZE = MAX_TOTAL_IMAGE_SIZE_MB * 1024 * 1024
+MAX_SINGLE_IMAGE_SIZE = MAX_SINGLE_IMAGE_SIZE_MB * 1024 * 1024
+
+logger = logging.getLogger(__name__)
+
 
 class Extractor(ABC):
     __slots__ = ("config", "mime_type")
@@ -52,11 +77,252 @@ class Extractor(ABC):
 
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
+        deduplicated_images = self._deduplicate_images(result.images) if result.images else []
+
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
             metadata=normalize_metadata(enhanced_metadata),
+            tables=result.tables,
             chunks=result.chunks,
+            images=deduplicated_images,
+            image_ocr_results=result.image_ocr_results,
+            entities=result.entities,
+            keywords=result.keywords,
             detected_languages=result.detected_languages,
-…
+            document_type=result.document_type,
+            document_type_confidence=result.document_type_confidence,
+            layout=result.layout,
+        )
+
+    def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        """Filter images based on memory safety limits."""
+        if not images:
+            return []
+
+        images_with_sizes = [(img, len(img.data)) for img in images]
+
+        valid_images = []
+        for img, size in images_with_sizes:
+            if size <= MAX_SINGLE_IMAGE_SIZE:
+                valid_images.append((img, size))
+            else:
+                logger.warning(
+                    "Skipping image %s: size %d MB exceeds limit of %d MB",
+                    img.filename or "unknown",
+                    size // (1024 * 1024),
+                    MAX_SINGLE_IMAGE_SIZE_MB,
+                )
+
+        total_size = sum(size for _, size in valid_images)
+
+        if total_size <= MAX_TOTAL_IMAGE_SIZE:
+            return [img for img, _ in valid_images]
+
+        logger.warning(
+            "Total image size %d MB exceeds limit of %d MB, selecting subset",
+            total_size // (1024 * 1024),
+            MAX_TOTAL_IMAGE_SIZE_MB,
         )
+
+        sorted_images = sorted(valid_images, key=lambda x: x[1])
+        selected = []
+        current_size = 0
+
+        for img, img_size in sorted_images:
+            if current_size + img_size <= MAX_TOTAL_IMAGE_SIZE:
+                selected.append(img)
+                current_size += img_size
+            else:
+                logger.debug("Skipping image %s: would exceed total memory limit", img.filename or "unknown")
+
+        return selected
+
+    _SMALL_IMAGE_THRESHOLD = 1024
+    _HASH_SAMPLE_SIZE = 512
+
+    def _compute_image_hash(self, img: ExtractedImage) -> int:
+        """Compute hash for image deduplication using progressive hashing.
+
+        For small images (<1KB), hash the entire content.
+        For larger images, use size + first/last bytes for quick comparison.
+
+        Args:
+            img: Image to hash
+
+        Returns:
+            Hash value for deduplication
+        """
+        data_len = len(img.data)
+
+        if data_len < self._SMALL_IMAGE_THRESHOLD:
+            return zlib.crc32(img.data) & 0xFFFFFFFF
+
+        hash_components = [
+            str(data_len).encode(),
+            img.data[: self._HASH_SAMPLE_SIZE],
+            img.data[-self._HASH_SAMPLE_SIZE :],
+            img.format.encode() if img.format else b"",
+        ]
+
+        combined = b"".join(hash_components)
+        return zlib.crc32(combined) & 0xFFFFFFFF
+
+    def _deduplicate_images(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        if not self.config.deduplicate_images or not images:
+            return images
+
+        seen_hashes = set()
+        unique_images = []
+
+        for img in images:
+            img_hash = self._compute_image_hash(img)
+            if img_hash not in seen_hashes:
+                seen_hashes.add(img_hash)
+                unique_images.append(img)
+            else:
+                logger.debug("Filtered duplicate image: %s", img.filename)
+
+        if len(unique_images) < len(images):
+            logger.info("Deduplicated %d images to %d unique", len(images), len(unique_images))
+
+        return unique_images
+
+    def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
+        """Prepare OCR configuration for the specified backend.
+
+        Args:
+            backend_name: Name of the OCR backend
+
+        Returns:
+            Configuration dictionary for the backend
+        """
+        default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
+        config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
+
+        if backend_name == "tesseract":
+            default_config = TesseractConfig()
+            config_class = TesseractConfig
+        elif backend_name == "easyocr":
+            default_config = EasyOCRConfig()
+            config_class = EasyOCRConfig
+        elif backend_name == "paddleocr":
+            default_config = PaddleOCRConfig()
+            config_class = PaddleOCRConfig
+        else:
+            raise ValueError(f"Unknown OCR backend: {backend_name}")
+
+        cfg: dict[str, Any] = asdict(default_config)
+
+        if self.config.ocr_config and isinstance(self.config.ocr_config, config_class):
+            user_cfg: dict[str, Any] = asdict(self.config.ocr_config)
+            cfg.update(user_cfg)
+
+        cfg["use_cache"] = self.config.use_cache
+        return cfg
+
+    def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
+        """Validate if an image is suitable for OCR processing.
+
+        Args:
+            img: Image to validate
+
+        Returns:
+            Reason for skipping if invalid, None if valid
+        """
+        fmt = img.format.lower()
+        if fmt not in self.config.image_ocr_formats:
+            return f"Unsupported format: {img.format}"
+
+        if img.dimensions is not None:
+            w, h = img.dimensions
+            min_w, min_h = self.config.image_ocr_min_dimensions
+            max_w, max_h = self.config.image_ocr_max_dimensions
+
+            if w < min_w or h < min_h:
+                return f"Too small: {w}x{h}"
+            if w > max_w or h > max_h:
+                return f"Too large: {w}x{h}"
+
+        return None
+
+    async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
+        """Process a single image with OCR.
+
+        Args:
+            target: Image to process
+            backend: OCR backend instance
+            cfg: Configuration for the backend
+
+        Returns:
+            OCR result for the image
+        """
+        try:
+            start = time.time()
+            pil_img = Image.open(io.BytesIO(target.data))
+            ocr_res = await backend.process_image(pil_img, **cfg)
+            duration = time.time() - start
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ocr_res,
+                confidence_score=None,
+                processing_time=duration,
+            )
+        except (OSError, ValueError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
+            )
+        except (RuntimeError, TypeError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"Backend error: {type(e).__name__}: {e}",
+            )
+
+    async def _process_images_with_ocr(
+        self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
+    ) -> list[ImageOCRResult]:
+        """Process multiple images with OCR.
+
+        Args:
+            images: Tuple or list of images to process
+
+        Returns:
+            List of OCR results
+        """
+        if not images or not self.config.ocr_extracted_images:
+            return []
+
+        images_list = list(self._deduplicate_images(list(images)))
+        images_list = self._check_image_memory_limits(images_list)
+
+        backend_name = self.config.image_ocr_backend or self.config.ocr_backend
+        if backend_name is None:
+            return []
+
+        cfg = self._prepare_ocr_config(backend_name)
+        backend = get_ocr_backend(backend_name)
+
+        results: list[ImageOCRResult] = []
+        tasks = []
+
+        for img in images_list:
+            skip_reason = self._validate_image_for_ocr(img)
+            if skip_reason:
+                results.append(
+                    ImageOCRResult(
+                        image=img,
+                        ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                        skipped_reason=skip_reason,
+                    )
+                )
+            else:
+                tasks.append(self._ocr_single_image(img, backend, cfg))
+
+        if tasks:
+            batch_size = max(1, min(len(tasks), cpu_count()))
+            results.extend(await run_taskgroup_batched(*tasks, batch_size=batch_size))
+
+        return results
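Note on the change above: `_deduplicate_images` keys on a sampled CRC32 rather than a full-content hash. The following standalone sketch reproduces that progressive hash outside kreuzberg; `Img` is a stand-in for `ExtractedImage`, and the thresholds match the class constants in the diff.

```python
from __future__ import annotations

import zlib
from dataclasses import dataclass

SMALL_IMAGE_THRESHOLD = 1024  # below 1 KB, hash the whole payload
HASH_SAMPLE_SIZE = 512        # head/tail sample size for larger images


@dataclass
class Img:
    data: bytes
    format: str | None = None


def progressive_hash(img: Img) -> int:
    # Small images: exact CRC32 of the full content.
    if len(img.data) < SMALL_IMAGE_THRESHOLD:
        return zlib.crc32(img.data) & 0xFFFFFFFF
    # Large images: CRC32 over length + first/last 512 bytes + format tag.
    combined = b"".join(
        [
            str(len(img.data)).encode(),
            img.data[:HASH_SAMPLE_SIZE],
            img.data[-HASH_SAMPLE_SIZE:],
            img.format.encode() if img.format else b"",
        ]
    )
    return zlib.crc32(combined) & 0xFFFFFFFF


def deduplicate(images: list[Img]) -> list[Img]:
    seen: set[int] = set()
    unique: list[Img] = []
    for img in images:
        h = progressive_hash(img)
        if h not in seen:
            seen.add(h)
            unique.append(img)
    return unique


a = Img(b"\x89PNG" + b"\x00" * 4096, "png")
b = Img(b"\x89PNG" + b"\x00" * 4096, "png")  # byte-identical duplicate of a
c = Img(b"\xff\xd8\xff" + b"\x01" * 4096, "jpg")
print(len(deduplicate([a, b, c])))  # 2
```

The trade-off is visible in the sampling: two large images that agree in length, format, and first/last 512 bytes hash identically even if their middles differ, and CRC32 is not collision-resistant. This is a speed heuristic for catching repeated embeds, not a correctness guarantee.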
kreuzberg/_extractors/_email.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import base64
 import re
 from html import unescape
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -8,9 +9,8 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
-from kreuzberg._utils.…
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, normalize_metadata
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
@@ -84,24 +84,18 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-…
-…
-…
-…
-…
-…
-…
-            else:
-                emails.append(str(item))
-        return ", ".join(emails)
-        if isinstance(field, dict):
-            return str(field.get("email", ""))
-        return str(field)
+        match field:
+            case list():
+                return ", ".join(str(item.get("email", "")) if isinstance(item, dict) else str(item) for item in field)
+            case dict():
+                return str(field.get("email", ""))
+            case _:
+                return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         text_content = parsed_email.get("text")
         if text_content:
-            text_parts.append(…
+            text_parts.append(str(text_content))
             return
 
         html_content = parsed_email.get("html")
@@ -111,20 +105,83 @@ class EmailExtractor(Extractor):
             h.ignore_links = True
             h.ignore_images = True
             converted_text = h.handle(html_content)
-            text_parts.append(…
+            text_parts.append(converted_text)
         else:
-…
+            cleaned = re.sub(r"<script[^>]*>.*?</script>", "", html_content, flags=re.IGNORECASE | re.DOTALL)
+            cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
+            clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
             clean_html = unescape(clean_html)
-…
+            clean_html = (
+                clean_html.replace("\u201c", '"')
+                .replace("\u201d", '"')
+                .replace("\u2019", "'")
+                .replace("\u2018", "'")
+            )
+            text_parts.append(clean_html)
 
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-…
-…
-…
-…
-…
+        attachments = parsed_email.get("attachments")
+        if not isinstance(attachments, list):
+            return
+        names: list[str] = []
+        for att in attachments:
+            name_val: str = "unknown"
+            if isinstance(att, dict):
+                n = att.get("name")
+                if isinstance(n, str) and n:
+                    name_val = n
+            names.append(name_val)
+        metadata["attachments"] = names
+        if names:
+            text_parts.append("Attachments: " + ", ".join(names))
+
+    def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        attachments = parsed_email.get("attachments") or []
+        if not isinstance(attachments, list):
+            return []
+
+        for idx, att in enumerate(attachments, start=1):
+            if not isinstance(att, dict):
+                continue
+
+            mime = att.get("mime") or att.get("content_type") or att.get("type")
+            if not isinstance(mime, str) or not mime.startswith("image/"):
+                continue
+
+            name = att.get("name") if isinstance(att.get("name"), str) else None
+            data = att.get("data") or att.get("content") or att.get("payload")
+            raw: bytes | None = None
+            if isinstance(data, (bytes, bytearray)):
+                raw = bytes(data)
+            elif isinstance(data, str):
+                try:
+                    raw = base64.b64decode(data)
+                except Exception:  # noqa: BLE001
+                    raw = data.encode()
+
+            if raw is None:
+                continue
+
+            fmt = mime.split("/", 1)[1].lower()
+            if name and "." in name:
+                ext = name.rsplit(".", 1)[-1].lower()
+                if ext:
+                    fmt = ext
+
+            filename = name or f"attachment_image_{idx}.{fmt}"
+            images.append(
+                ExtractedImage(
+                    data=raw,
+                    format=fmt,
+                    filename=filename,
+                    page_number=None,
+                )
+            )
+
+        return images
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         if mailparse is None:
@@ -142,13 +199,24 @@ class EmailExtractor(Extractor):
 
             combined_text = "\n".join(text_parts)
 
-…
-                content=…
+            result = ExtractionResult(
+                content=combined_text,
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata=normalize_metadata(metadata),
                 chunks=[],
            )
 
+            if self.config.extract_images:
+                images = self._extract_images_from_attachments(parsed_email)
+                result.images = images
+                if self.config.ocr_extracted_images and result.images:
+                    image_ocr_results: list[ImageOCRResult] = run_maybe_async(
+                        self._process_images_with_ocr, result.images
+                    )
+                    result.image_ocr_results = image_ocr_results
+
+            return result
+
         except Exception as e:
             msg = f"Failed to parse email content: {e}"
             raise RuntimeError(msg) from e
kreuzberg/_extractors/_html.py
CHANGED
@@ -1,29 +1,40 @@
 from __future__ import annotations
 
+import base64
+import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
+from bs4 import BeautifulSoup
 
-from kreuzberg._extractors._base import Extractor
+from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 
 if TYPE_CHECKING:
     from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 class HTMLExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-…
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content = await AsyncPath(path).read_bytes()
-…
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         config = self.config.html_to_markdown_config if self.config else None
@@ -32,12 +43,79 @@ class HTMLExtractor(Extractor):
 
         config_dict = config.to_dict()
 
-…
+        html_content = safe_decode(content)
+        result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
+
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
 
-…
+        if self.config.extract_images:
+            extraction_result.images = self._extract_images_from_html(html_content)
+            if self.config.ocr_extracted_images and extraction_result.images:
+                extraction_result.image_ocr_results = run_maybe_async(
+                    self._process_images_with_ocr, extraction_result.images
+                )
 
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()
         return self.extract_bytes_sync(content)
+
+    def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        soup = BeautifulSoup(html_content, "xml")
+
+        for img in soup.find_all("img"):
+            src_val = img.get("src")  # type: ignore[union-attr]
+            if isinstance(src_val, str) and src_val.startswith("data:image/"):
+                try:
+                    header, data = src_val.split(",", 1)
+                    mime_type = header.split(";")[0].split(":")[1]
+                    format_name = mime_type.split("/")[1]
+
+                    if not data or len(data) < 4:
+                        logger.debug("Skipping empty or too small base64 data")
+                        continue
+
+                    if len(data) > 67 * 1024 * 1024:
+                        logger.warning("Skipping base64 image larger than 67MB")
+                        continue
+
+                    image_data = base64.b64decode(data)
+
+                    if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
+                        logger.warning(
+                            "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
+                        )
+                        continue
+
+                    alt_val = img.get("alt")  # type: ignore[union-attr]
+                    desc = alt_val if isinstance(alt_val, str) else None
+                    images.append(
+                        ExtractedImage(
+                            data=image_data,
+                            format=format_name,
+                            filename=f"embedded_image_{len(images) + 1}.{format_name}",
+                            description=desc,
+                        )
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.warning("Failed to extract base64 image: %s", e)
+
+        for svg in soup.find_all("svg"):
+            try:
+                svg_content = str(svg).encode("utf-8")
+                title_or_aria = svg.get("title") or svg.get("aria-label")  # type: ignore[union-attr]
+                desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
+                images.append(
+                    ExtractedImage(
+                        data=svg_content,
+                        format="svg",
+                        filename=f"inline_svg_{len(images) + 1}.svg",
+                        description=desc_svg,
+                    )
+                )
+            except Exception as e:  # noqa: BLE001, PERF203
+                logger.warning("Failed to extract SVG: %s", e)
+
+        return images
kreuzberg/_extractors/_image.py
CHANGED
@@ -10,8 +10,9 @@ from anyio import Path as AsyncPath
 from PIL import Image
 
 from kreuzberg._extractors._base import Extractor
-from kreuzberg._mime_types import IMAGE_MIME_TYPES
+from kreuzberg._mime_types import IMAGE_MIME_TO_EXT, IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractedImage
 from kreuzberg._utils._image_preprocessing import normalize_image_dpi
 from kreuzberg._utils._sync import run_sync
 from kreuzberg._utils._tmp import create_temp_file
@@ -26,33 +27,17 @@ if TYPE_CHECKING:  # pragma: no cover
 class ImageExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
 
-    IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = {
-        "image/bmp": "bmp",
-        "image/x-bmp": "bmp",
-        "image/x-ms-bmp": "bmp",
-        "image/gif": "gif",
-        "image/jpeg": "jpg",
-        "image/pjpeg": "jpg",
-        "image/png": "png",
-        "image/tiff": "tiff",
-        "image/x-tiff": "tiff",
-        "image/jp2": "jp2",
-        "image/jpx": "jpx",
-        "image/jpm": "jpm",
-        "image/mj2": "mj2",
-        "image/webp": "webp",
-        "image/x-portable-anymap": "pnm",
-        "image/x-portable-bitmap": "pbm",
-        "image/x-portable-graymap": "pgm",
-        "image/x-portable-pixmap": "ppm",
-    }
+    IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = IMAGE_MIME_TO_EXT
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         extension = self._get_extension_from_mime_type(self.mime_type)
         file_path, unlink = await create_temp_file(f".{extension}")
         await AsyncPath(file_path).write_bytes(content)
         try:
-…
+            result = await self.extract_path_async(file_path)
+            if self.config.extract_images:
+                result.images = [self._create_self_reference_image(content, self.mime_type)]
+            return result
         finally:
             await unlink()
 
@@ -69,6 +54,10 @@ class ImageExtractor(Extractor):
         if preprocessing_metadata:
             result.metadata["image_preprocessing"] = preprocessing_metadata
 
+        if self.config.extract_images:
+            content = await AsyncPath(path).read_bytes()
+            result.images = [self._create_self_reference_image(content, self.mime_type)]
+
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
@@ -97,6 +86,10 @@ class ImageExtractor(Extractor):
         if preprocessing_metadata:
             result.metadata["image_preprocessing"] = preprocessing_metadata
 
+        if self.config.extract_images:
+            content = path.read_bytes()
+            result.images = [self._create_self_reference_image(content, self.mime_type)]
+
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -108,3 +101,11 @@ class ImageExtractor(Extractor):
             return v
 
         raise ValidationError("unsupported mimetype", context={"mime_type": mime_type})
+
+    def _create_self_reference_image(self, image_data: bytes, mime_type: str) -> ExtractedImage:
+        return ExtractedImage(
+            data=image_data,
+            format=IMAGE_MIME_TO_EXT.get(mime_type, "unknown"),
+            filename="source_image",
+            page_number=1,
+        )
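Caller-visible effect of the `ImageExtractor` change: with `extract_images=True`, an image input now also yields itself as a single `ExtractedImage` (filename `source_image`, page number 1), so standalone images pass through the same dedup/OCR plumbing as embedded ones. A hedged sketch follows; `extract_bytes_sync` is kreuzberg's public sync entry point, but its exact 3.15.0 signature is assumed here, not shown in this diff.

```python
# Assumed signature: extract_bytes_sync(content, mime_type, config=...).
from pathlib import Path

from kreuzberg import ExtractionConfig, extract_bytes_sync

png_bytes = Path("scan.png").read_bytes()
result = extract_bytes_sync(
    png_bytes,
    mime_type="image/png",
    config=ExtractionConfig(extract_images=True),
)
for img in result.images:
    # Expect one self-reference entry: source_image png <len(png_bytes)>
    print(img.filename, img.format, len(img.data))
```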