kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_base.py
CHANGED
```diff
@@ -1,16 +1,41 @@
 from __future__ import annotations
 
+import io
+import logging
+import time
+import zlib
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, ClassVar
+from dataclasses import asdict
+from multiprocessing import cpu_count
+from typing import TYPE_CHECKING, Any, ClassVar
 
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from PIL import Image
+
+from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import (
+    EasyOCRConfig,
+    ExtractedImage,
+    ExtractionResult,
+    ImageOCRResult,
+    PaddleOCRConfig,
+    TesseractConfig,
+    normalize_metadata,
+)
 from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+from kreuzberg._utils._sync import run_taskgroup_batched
 
 if TYPE_CHECKING:
     from pathlib import Path
 
     from kreuzberg._types import ExtractionConfig
 
+MAX_TOTAL_IMAGE_SIZE_MB = 100
+MAX_SINGLE_IMAGE_SIZE_MB = 50
+MAX_TOTAL_IMAGE_SIZE = MAX_TOTAL_IMAGE_SIZE_MB * 1024 * 1024
+MAX_SINGLE_IMAGE_SIZE = MAX_SINGLE_IMAGE_SIZE_MB * 1024 * 1024
+
+logger = logging.getLogger(__name__)
+
 
 class Extractor(ABC):
     __slots__ = ("config", "mime_type")
```
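The helper `run_taskgroup_batched`, imported above and called at the end of the next hunk, is not itself part of this diff. The sketch below is a hypothetical stand-in inferred from its call site (`run_taskgroup_batched(*tasks, batch_size=n)` returning a list of results), not the library's implementation:

```python
# Hypothetical stand-in for kreuzberg._utils._sync.run_taskgroup_batched,
# whose body is not shown in this diff: run awaitables in fixed-size
# batches so at most `batch_size` coroutines are in flight at once.
import asyncio
from collections.abc import Awaitable
from typing import Any


async def run_batched_sketch(*tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    results: list[Any] = []
    for i in range(0, len(tasks), batch_size):
        # Await one fixed-size batch before starting the next.
        results.extend(await asyncio.gather(*tasks[i : i + batch_size]))
    return results


async def _demo() -> None:
    async def square(n: int) -> int:
        await asyncio.sleep(0)  # stand-in for an OCR call
        return n * n

    print(await run_batched_sketch(*(square(n) for n in range(5)), batch_size=2))


if __name__ == "__main__":
    asyncio.run(_demo())  # [0, 1, 4, 9, 16]
```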
```diff
@@ -52,11 +77,252 @@ class Extractor(ABC):
 
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
+        deduplicated_images = self._deduplicate_images(result.images) if result.images else []
+
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
             metadata=normalize_metadata(enhanced_metadata),
+            tables=result.tables,
             chunks=result.chunks,
+            images=deduplicated_images,
+            image_ocr_results=result.image_ocr_results,
+            entities=result.entities,
+            keywords=result.keywords,
             detected_languages=result.detected_languages,
-            ...
+            document_type=result.document_type,
+            document_type_confidence=result.document_type_confidence,
+            layout=result.layout,
+        )
+
+    def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        """Filter images based on memory safety limits."""
+        if not images:
+            return []
+
+        images_with_sizes = [(img, len(img.data)) for img in images]
+
+        valid_images = []
+        for img, size in images_with_sizes:
+            if size <= MAX_SINGLE_IMAGE_SIZE:
+                valid_images.append((img, size))
+            else:
+                logger.warning(
+                    "Skipping image %s: size %d MB exceeds limit of %d MB",
+                    img.filename or "unknown",
+                    size // (1024 * 1024),
+                    MAX_SINGLE_IMAGE_SIZE_MB,
+                )
+
+        total_size = sum(size for _, size in valid_images)
+
+        if total_size <= MAX_TOTAL_IMAGE_SIZE:
+            return [img for img, _ in valid_images]
+
+        logger.warning(
+            "Total image size %d MB exceeds limit of %d MB, selecting subset",
+            total_size // (1024 * 1024),
+            MAX_TOTAL_IMAGE_SIZE_MB,
         )
+
+        sorted_images = sorted(valid_images, key=lambda x: x[1])
+        selected = []
+        current_size = 0
+
+        for img, img_size in sorted_images:
+            if current_size + img_size <= MAX_TOTAL_IMAGE_SIZE:
+                selected.append(img)
+                current_size += img_size
+            else:
+                logger.debug("Skipping image %s: would exceed total memory limit", img.filename or "unknown")
+
+        return selected
+
+    _SMALL_IMAGE_THRESHOLD = 1024
+    _HASH_SAMPLE_SIZE = 512
+
+    def _compute_image_hash(self, img: ExtractedImage) -> int:
+        """Compute hash for image deduplication using progressive hashing.
+
+        For small images (<1KB), hash the entire content.
+        For larger images, use size + first/last bytes for quick comparison.
+
+        Args:
+            img: Image to hash
+
+        Returns:
+            Hash value for deduplication
+        """
+        data_len = len(img.data)
+
+        if data_len < self._SMALL_IMAGE_THRESHOLD:
+            return zlib.crc32(img.data) & 0xFFFFFFFF
+
+        hash_components = [
+            str(data_len).encode(),
+            img.data[: self._HASH_SAMPLE_SIZE],
+            img.data[-self._HASH_SAMPLE_SIZE :],
+            img.format.encode() if img.format else b"",
+        ]
+
+        combined = b"".join(hash_components)
+        return zlib.crc32(combined) & 0xFFFFFFFF
+
+    def _deduplicate_images(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
+        if not self.config.deduplicate_images or not images:
+            return images
+
+        seen_hashes = set()
+        unique_images = []
+
+        for img in images:
+            img_hash = self._compute_image_hash(img)
+            if img_hash not in seen_hashes:
+                seen_hashes.add(img_hash)
+                unique_images.append(img)
+            else:
+                logger.debug("Filtered duplicate image: %s", img.filename)
+
+        if len(unique_images) < len(images):
+            logger.info("Deduplicated %d images to %d unique", len(images), len(unique_images))
+
+        return unique_images
+
+    def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
+        """Prepare OCR configuration for the specified backend.
+
+        Args:
+            backend_name: Name of the OCR backend
+
+        Returns:
+            Configuration dictionary for the backend
+        """
+        default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
+        config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
+
+        if backend_name == "tesseract":
+            default_config = TesseractConfig()
+            config_class = TesseractConfig
+        elif backend_name == "easyocr":
+            default_config = EasyOCRConfig()
+            config_class = EasyOCRConfig
+        elif backend_name == "paddleocr":
+            default_config = PaddleOCRConfig()
+            config_class = PaddleOCRConfig
+        else:
+            raise ValueError(f"Unknown OCR backend: {backend_name}")
+
+        cfg: dict[str, Any] = asdict(default_config)
+
+        if self.config.ocr_config and isinstance(self.config.ocr_config, config_class):
+            user_cfg: dict[str, Any] = asdict(self.config.ocr_config)
+            cfg.update(user_cfg)
+
+        cfg["use_cache"] = self.config.use_cache
+        return cfg
+
+    def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
+        """Validate if an image is suitable for OCR processing.
+
+        Args:
+            img: Image to validate
+
+        Returns:
+            Reason for skipping if invalid, None if valid
+        """
+        fmt = img.format.lower()
+        if fmt not in self.config.image_ocr_formats:
+            return f"Unsupported format: {img.format}"
+
+        if img.dimensions is not None:
+            w, h = img.dimensions
+            min_w, min_h = self.config.image_ocr_min_dimensions
+            max_w, max_h = self.config.image_ocr_max_dimensions
+
+            if w < min_w or h < min_h:
+                return f"Too small: {w}x{h}"
+            if w > max_w or h > max_h:
+                return f"Too large: {w}x{h}"
+
+        return None
+
+    async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
+        """Process a single image with OCR.
+
+        Args:
+            target: Image to process
+            backend: OCR backend instance
+            cfg: Configuration for the backend
+
+        Returns:
+            OCR result for the image
+        """
+        try:
+            start = time.time()
+            pil_img = Image.open(io.BytesIO(target.data))
+            ocr_res = await backend.process_image(pil_img, **cfg)
+            duration = time.time() - start
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ocr_res,
+                confidence_score=None,
+                processing_time=duration,
+            )
+        except (OSError, ValueError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
+            )
+        except (RuntimeError, TypeError) as e:  # pragma: no cover
+            return ImageOCRResult(
+                image=target,
+                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                skipped_reason=f"Backend error: {type(e).__name__}: {e}",
+            )
+
+    async def _process_images_with_ocr(
+        self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
+    ) -> list[ImageOCRResult]:
+        """Process multiple images with OCR.
+
+        Args:
+            images: Tuple or list of images to process
+
+        Returns:
+            List of OCR results
+        """
+        if not images or not self.config.ocr_extracted_images:
+            return []
+
+        images_list = list(self._deduplicate_images(list(images)))
+        images_list = self._check_image_memory_limits(images_list)
+
+        backend_name = self.config.image_ocr_backend or self.config.ocr_backend
+        if backend_name is None:
+            return []
+
+        cfg = self._prepare_ocr_config(backend_name)
+        backend = get_ocr_backend(backend_name)
+
+        results: list[ImageOCRResult] = []
+        tasks = []
+
+        for img in images_list:
+            skip_reason = self._validate_image_for_ocr(img)
+            if skip_reason:
+                results.append(
+                    ImageOCRResult(
+                        image=img,
+                        ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
+                        skipped_reason=skip_reason,
+                    )
+                )
+            else:
+                tasks.append(self._ocr_single_image(img, backend, cfg))
+
+        if tasks:
+            batch_size = max(1, min(len(tasks), cpu_count()))
+            results.extend(await run_taskgroup_batched(*tasks, batch_size=batch_size))
+
+        return results
```
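`_compute_image_hash` above trades exactness for speed: payloads under 1 KiB are CRC32-hashed in full, while larger images are fingerprinted from their length, 512-byte head and tail samples, and the format string. A standalone restatement of the scheme (names here are illustrative, not package API):

```python
# Standalone restatement of the progressive-hash scheme from
# Extractor._compute_image_hash above; names are illustrative only.
import zlib

SMALL_IMAGE_THRESHOLD = 1024  # below this, hash the full payload
HASH_SAMPLE_SIZE = 512        # head/tail sample width for large payloads


def image_fingerprint(data: bytes, fmt: str = "") -> int:
    if len(data) < SMALL_IMAGE_THRESHOLD:
        return zlib.crc32(data) & 0xFFFFFFFF
    combined = b"".join(
        [str(len(data)).encode(), data[:HASH_SAMPLE_SIZE], data[-HASH_SAMPLE_SIZE:], fmt.encode()]
    )
    return zlib.crc32(combined) & 0xFFFFFFFF


img_a = b"\x89PNG" + b"\x00" * 4096
img_b = b"\x89PNG" + b"\x00" * 4096  # byte-identical copy
img_c = b"\x89PNG" + b"\x01" * 4096  # differs inside the head sample
assert image_fingerprint(img_a, "png") == image_fingerprint(img_b, "png")
assert image_fingerprint(img_a, "png") != image_fingerprint(img_c, "png")
```

The trade-off is that two equally sized large images differing only outside the sampled head and tail would collide and one would be dropped as a duplicate, which is why the full hash is reserved for small payloads where it is cheap.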
kreuzberg/_extractors/_email.py
CHANGED
```diff
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import base64
 import re
 from html import unescape
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -8,9 +9,8 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
-from kreuzberg._utils.
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._types import ExtractedImage, ExtractionResult, ImageOCRResult, normalize_metadata
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
@@ -27,6 +27,8 @@ except ImportError:  # pragma: no cover
     html2text = None
 
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
+_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
 
 
 class EmailExtractor(Extractor):
@@ -84,24 +86,25 @@
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-        ...
+        match field:
+            case list():
+                emails = []
+                for item in field:
+                    if isinstance(item, dict):
+                        if email := item.get("email", ""):
+                            emails.append(str(email))
+                    else:
+                        emails.append(str(item))
+                return ", ".join(emails)
+            case dict():
+                return str(field.get("email", ""))
+            case _:
+                return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
        text_content = parsed_email.get("text")
         if text_content:
-            text_parts.append(
+            text_parts.append(str(text_content))
             return
 
         html_content = parsed_email.get("html")
@@ -111,20 +114,80 @@
             h.ignore_links = True
             h.ignore_images = True
             converted_text = h.handle(html_content)
-            text_parts.append(
+            text_parts.append(converted_text)
         else:
-            clean_html = _HTML_TAG_PATTERN.sub("", html_content)
+            cleaned = re.sub(r"<script[^>]*>.*?</script>", "", html_content, flags=re.IGNORECASE | re.DOTALL)
+            cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
+            clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
             clean_html = unescape(clean_html)
-            text_parts.append(clean_html)
+            clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
+            clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
+            text_parts.append(clean_html)
 
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        ...
+        attachments = parsed_email.get("attachments")
+        if not isinstance(attachments, list):
+            return
+        names: list[str] = []
+        for att in attachments:
+            name_val: str = "unknown"
+            if isinstance(att, dict):
+                n = att.get("name") or att.get("filename")
+                if isinstance(n, str) and n:
+                    name_val = n
+            names.append(name_val)
+        if names:
+            metadata["attachments"] = names
+            text_parts.append("Attachments: " + ", ".join(names))
+
+    def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        attachments = parsed_email.get("attachments") or []
+        if not isinstance(attachments, list):
+            return []
+
+        for idx, att in enumerate(attachments, start=1):
+            if not isinstance(att, dict):
+                continue
+
+            mime = att.get("mime") or att.get("content_type") or att.get("type")
+            if not isinstance(mime, str) or not mime.startswith("image/"):
+                continue
+
+            name = att.get("name") or att.get("filename")
+            name = name if isinstance(name, str) else None
+            data = att.get("data") or att.get("content") or att.get("payload")
+            raw: bytes | None = None
+            if isinstance(data, (bytes, bytearray)):
+                raw = bytes(data)
+            elif isinstance(data, str):
+                try:
+                    raw = base64.b64decode(data)
+                except Exception:  # noqa: BLE001
+                    raw = data.encode()
+
+            if raw is None:
+                continue
+
+            fmt = mime.split("/", 1)[1].lower()
+            if name and "." in name:
+                ext = name.rsplit(".", 1)[-1].lower()
+                if ext:
+                    fmt = ext
+
+            filename = name or f"attachment_image_{idx}.{fmt}"
+            images.append(
+                ExtractedImage(
+                    data=raw,
+                    format=fmt,
+                    filename=filename,
+                    page_number=None,
+                )
+            )
+
+        return images
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         if mailparse is None:
@@ -142,13 +205,24 @@
 
             combined_text = "\n".join(text_parts)
 
-            return ExtractionResult(
-                content=
+            result = ExtractionResult(
+                content=combined_text,
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata=normalize_metadata(metadata),
                 chunks=[],
             )
 
+            if self.config.extract_images:
+                images = self._extract_images_from_attachments(parsed_email)
+                result.images = images
+                if self.config.ocr_extracted_images and result.images:
+                    image_ocr_results: list[ImageOCRResult] = run_maybe_async(
+                        self._process_images_with_ocr, result.images
+                    )
+                    result.image_ocr_results = image_ocr_results
+
+            return result
+
         except Exception as e:
             msg = f"Failed to parse email content: {e}"
             raise RuntimeError(msg) from e
```
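`_extract_images_from_attachments` accepts attachment payloads either as raw bytes or as strings, trying base64 decoding first and falling back to the string's own bytes. A minimal sketch of that decode branch; `decode_attachment_payload` is a hypothetical name and the attachment dict is synthetic (real mailparse output may use any of the probed key aliases `data`/`content`/`payload` and `mime`/`content_type`/`type`):

```python
# Minimal sketch of the payload-decoding branch of
# _extract_images_from_attachments; the helper name and dict are synthetic.
import base64


def decode_attachment_payload(att: dict) -> bytes | None:
    data = att.get("data") or att.get("content") or att.get("payload")
    if isinstance(data, (bytes, bytearray)):
        return bytes(data)  # already raw bytes
    if isinstance(data, str):
        try:
            return base64.b64decode(data)  # usual transfer encoding
        except Exception:
            return data.encode()  # fall back to the literal text
    return None


att = {
    "mime": "image/png",
    "name": "logo.png",
    "data": base64.b64encode(b"\x89PNG\r\n\x1a\n").decode(),
}
if att["mime"].startswith("image/"):
    raw = decode_attachment_payload(att)
    print(raw[:4])  # b'\x89PNG'
```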
kreuzberg/_extractors/_html.py
CHANGED
```diff
@@ -1,29 +1,44 @@
 from __future__ import annotations
 
+import base64
+import binascii
+import io
+import logging
 from typing import TYPE_CHECKING, ClassVar
 
 import html_to_markdown
 from anyio import Path as AsyncPath
+from bs4 import BeautifulSoup
+from PIL import Image
 
-from kreuzberg._extractors._base import Extractor
+from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._string import safe_decode
-from kreuzberg._utils._sync import run_sync
+from kreuzberg._utils._sync import run_maybe_async, run_sync
 
 if TYPE_CHECKING:
     from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 class HTMLExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        return await run_sync(self.extract_bytes_sync, content)
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         content = await AsyncPath(path).read_bytes()
-        return await run_sync(self.extract_bytes_sync, content)
+        result = await run_sync(self.extract_bytes_sync, content)
+        if self.config.extract_images and self.config.ocr_extracted_images and result.images:
+            result.image_ocr_results = await self._process_images_with_ocr(result.images)
+        return result
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         config = self.config.html_to_markdown_config if self.config else None
@@ -32,12 +47,102 @@
 
         config_dict = config.to_dict()
 
-        ...
+        html_content = safe_decode(content)
+
+        use_streaming, chunk_size = should_use_streaming(len(content))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
+        result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
 
-        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
+
+        if self.config.extract_images:
+            extraction_result.images = self._extract_images_from_html(html_content)
+            if self.config.ocr_extracted_images and extraction_result.images:
+                extraction_result.image_ocr_results = run_maybe_async(
+                    self._process_images_with_ocr, extraction_result.images
+                )
 
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()
         return self.extract_bytes_sync(content)
+
+    def _extract_images_from_html(self, html_content: str) -> list[ExtractedImage]:
+        images: list[ExtractedImage] = []
+        soup = BeautifulSoup(html_content, "xml")
+
+        for img in soup.find_all("img"):
+            src_val = img.get("src")  # type: ignore[union-attr]
+            if isinstance(src_val, str) and src_val.startswith("data:image/"):
+                try:
+                    header, data = src_val.split(",", 1)
+                    mime_type = header.split(";")[0].split(":")[1]
+                    format_name = mime_type.split("/")[1]
+
+                    if not data or len(data) < 4:
+                        logger.debug("Skipping empty or too small base64 data")
+                        continue
+
+                    if len(data) > 67 * 1024 * 1024:
+                        logger.warning("Skipping base64 image larger than 67MB")
+                        continue
+
+                    image_data = base64.b64decode(data)
+
+                    if len(image_data) > MAX_SINGLE_IMAGE_SIZE:
+                        logger.warning(
+                            "Skipping decoded image larger than %dMB", MAX_SINGLE_IMAGE_SIZE // (1024 * 1024)
+                        )
+                        continue
+
+                    dimensions = None
+                    try:
+                        with Image.open(io.BytesIO(image_data)) as pil_img:
+                            dimensions = pil_img.size
+                    except (OSError, ValueError) as e:
+                        logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
+
+                    alt_val = img.get("alt")  # type: ignore[union-attr]
+                    desc = alt_val if isinstance(alt_val, str) else None
+                    images.append(
+                        ExtractedImage(
+                            data=image_data,
+                            format=format_name,
+                            filename=f"embedded_image_{len(images) + 1}.{format_name}",
+                            description=desc,
+                            dimensions=dimensions,
+                        )
+                    )
+                except (ValueError, binascii.Error) as e:
+                    logger.warning("Failed to extract base64 image: %s", e)
+
+        def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
+            try:
+                svg_content = str(svg_element).encode("utf-8")
+
+                def _get_attr_safe(obj: object, attr: str) -> str | None:
+                    get_method = getattr(obj, "get", None)
+                    if callable(get_method):
+                        result = get_method(attr)
+                        return result if isinstance(result, str) else None
+                    return None
+
+                title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
+                desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
+                return ExtractedImage(
+                    data=svg_content,
+                    format="svg",
+                    filename=f"inline_svg_{len(images) + 1}.svg",
+                    description=desc_svg,
+                )
+            except (UnicodeEncodeError, AttributeError) as e:
+                logger.warning("Failed to extract SVG: %s", e)
+                return None
+
+        svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
+        images.extend(img for img in svg_images if img is not None)
+
+        return images
```
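`_extract_images_from_html` scans `<img>` tags for `data:image/...;base64,` URIs: it splits off the header, derives the format from the MIME type, decodes the payload, and probes dimensions with Pillow. The same steps in isolation, against a synthetic data URI:

```python
# The data-URI parsing steps used by _extract_images_from_html, run in
# isolation against a synthetic URI built with Pillow.
import base64
import io

from PIL import Image

buf = io.BytesIO()
Image.new("RGB", (32, 16)).save(buf, format="PNG")
data_uri = "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode()

header, payload = data_uri.split(",", 1)        # "data:image/png;base64" / "<b64>"
mime_type = header.split(";")[0].split(":")[1]  # "image/png"
format_name = mime_type.split("/")[1]           # "png"
image_data = base64.b64decode(payload)

with Image.open(io.BytesIO(image_data)) as im:  # dimension probe, as in the diff
    print(format_name, im.size)                 # png (32, 16)
```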