kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -8,8 +8,10 @@ from ._types import (
|
|
8
8
|
ExtractionConfig,
|
9
9
|
ExtractionResult,
|
10
10
|
GMFTConfig,
|
11
|
+
HTMLToMarkdownConfig,
|
11
12
|
ImageOCRConfig,
|
12
13
|
ImageOCRResult,
|
14
|
+
JSONExtractionConfig,
|
13
15
|
LanguageDetectionConfig,
|
14
16
|
Metadata,
|
15
17
|
PaddleOCRConfig,
|
@@ -17,6 +19,7 @@ from ._types import (
|
|
17
19
|
SpacyEntityExtractionConfig,
|
18
20
|
TableData,
|
19
21
|
TesseractConfig,
|
22
|
+
TokenReductionConfig,
|
20
23
|
)
|
21
24
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
22
25
|
from .extraction import (
|
@@ -40,8 +43,10 @@ __all__ = [
|
|
40
43
|
"ExtractionResult",
|
41
44
|
"ExtractorRegistry",
|
42
45
|
"GMFTConfig",
|
46
|
+
"HTMLToMarkdownConfig",
|
43
47
|
"ImageOCRConfig",
|
44
48
|
"ImageOCRResult",
|
49
|
+
"JSONExtractionConfig",
|
45
50
|
"KreuzbergError",
|
46
51
|
"LanguageDetectionConfig",
|
47
52
|
"Metadata",
|
@@ -53,6 +58,7 @@ __all__ = [
|
|
53
58
|
"SpacyEntityExtractionConfig",
|
54
59
|
"TableData",
|
55
60
|
"TesseractConfig",
|
61
|
+
"TokenReductionConfig",
|
56
62
|
"ValidationError",
|
57
63
|
"__version__",
|
58
64
|
"batch_extract_bytes",
|
kreuzberg/_api/main.py
CHANGED
@@ -13,10 +13,8 @@ from typing_extensions import TypedDict
|
|
13
13
|
|
14
14
|
from kreuzberg import (
|
15
15
|
EasyOCRConfig,
|
16
|
-
ExtractedImage,
|
17
16
|
ExtractionConfig,
|
18
17
|
ExtractionResult,
|
19
|
-
ImageOCRResult,
|
20
18
|
KreuzbergError,
|
21
19
|
MissingDependencyError,
|
22
20
|
PaddleOCRConfig,
|
@@ -40,30 +38,6 @@ if TYPE_CHECKING:
|
|
40
38
|
from litestar.datastructures import UploadFile
|
41
39
|
|
42
40
|
|
43
|
-
class ExtractedImageDict(TypedDict):
|
44
|
-
"""TypedDict for extracted image JSON representation."""
|
45
|
-
|
46
|
-
data: str
|
47
|
-
format: str
|
48
|
-
filename: str | None
|
49
|
-
page_number: int | None
|
50
|
-
dimensions: tuple[int, int] | None
|
51
|
-
colorspace: str | None
|
52
|
-
bits_per_component: int | None
|
53
|
-
is_mask: bool
|
54
|
-
description: str | None
|
55
|
-
|
56
|
-
|
57
|
-
class ImageOCRResultDict(TypedDict):
|
58
|
-
"""TypedDict for image OCR result JSON representation."""
|
59
|
-
|
60
|
-
image: ExtractedImageDict
|
61
|
-
ocr_result: Any
|
62
|
-
confidence_score: float | None
|
63
|
-
processing_time: float | None
|
64
|
-
skipped_reason: str | None
|
65
|
-
|
66
|
-
|
67
41
|
class HealthResponse(TypedDict):
|
68
42
|
"""Response model for health check endpoint."""
|
69
43
|
|
@@ -384,31 +358,6 @@ def _pil_image_encoder(obj: Any) -> str:
|
|
384
358
|
return f"data:image/png;base64,{img_str}"
|
385
359
|
|
386
360
|
|
387
|
-
def _extracted_image_encoder(obj: ExtractedImage) -> ExtractedImageDict:
|
388
|
-
encoded_data = base64.b64encode(obj.data).decode()
|
389
|
-
return ExtractedImageDict(
|
390
|
-
data=f"data:image/{obj.format};base64,{encoded_data}",
|
391
|
-
format=obj.format,
|
392
|
-
filename=obj.filename,
|
393
|
-
page_number=obj.page_number,
|
394
|
-
dimensions=obj.dimensions,
|
395
|
-
colorspace=obj.colorspace,
|
396
|
-
bits_per_component=obj.bits_per_component,
|
397
|
-
is_mask=obj.is_mask,
|
398
|
-
description=obj.description,
|
399
|
-
)
|
400
|
-
|
401
|
-
|
402
|
-
def _image_ocr_result_encoder(obj: ImageOCRResult) -> ImageOCRResultDict:
|
403
|
-
return ImageOCRResultDict(
|
404
|
-
image=_extracted_image_encoder(obj.image),
|
405
|
-
ocr_result=obj.ocr_result,
|
406
|
-
confidence_score=obj.confidence_score,
|
407
|
-
processing_time=obj.processing_time,
|
408
|
-
skipped_reason=obj.skipped_reason,
|
409
|
-
)
|
410
|
-
|
411
|
-
|
412
361
|
openapi_config = OpenAPIConfig(
|
413
362
|
title="Kreuzberg API",
|
414
363
|
version="3.14.0",
|
@@ -428,8 +377,6 @@ openapi_config = OpenAPIConfig(
|
|
428
377
|
type_encoders = {
|
429
378
|
pl.DataFrame: _polars_dataframe_encoder,
|
430
379
|
Image.Image: _pil_image_encoder,
|
431
|
-
ExtractedImage: _extracted_image_encoder,
|
432
|
-
ImageOCRResult: _image_ocr_result_encoder,
|
433
380
|
}
|
434
381
|
|
435
382
|
app = Litestar(
|
kreuzberg/_config.py
CHANGED
@@ -69,12 +69,21 @@ def _build_ocr_config_from_cli(
|
|
69
69
|
try:
|
70
70
|
match ocr_backend:
|
71
71
|
case "tesseract":
|
72
|
-
|
72
|
+
processed_args = backend_args.copy()
|
73
|
+
if "psm" in processed_args and isinstance(processed_args["psm"], int):
|
74
|
+
try:
|
75
|
+
processed_args["psm"] = PSMMode(processed_args["psm"])
|
76
|
+
except ValueError as e: # pragma: no cover
|
77
|
+
raise ValidationError(
|
78
|
+
f"Invalid PSM mode value: {processed_args['psm']}",
|
79
|
+
context={"psm_value": processed_args["psm"], "error": str(e)},
|
80
|
+
) from e
|
81
|
+
return TesseractConfig(**processed_args)
|
73
82
|
case "easyocr":
|
74
83
|
return EasyOCRConfig(**backend_args)
|
75
84
|
case "paddleocr":
|
76
85
|
return PaddleOCRConfig(**backend_args)
|
77
|
-
case _:
|
86
|
+
case _: # pragma: no cover
|
78
87
|
return None
|
79
88
|
except (TypeError, ValueError) as e:
|
80
89
|
raise ValidationError(
|
@@ -112,7 +121,7 @@ def _configure_gmft(
|
|
112
121
|
try:
|
113
122
|
if cli_args.get("gmft_config"):
|
114
123
|
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
115
|
-
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
124
|
+
elif "gmft" in file_config and isinstance(file_config["gmft"], dict): # pragma: no cover
|
116
125
|
gmft_config = GMFTConfig(**file_config["gmft"])
|
117
126
|
except (TypeError, ValueError) as e:
|
118
127
|
raise ValidationError(
|
@@ -120,7 +129,7 @@ def _configure_gmft(
|
|
120
129
|
context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
|
121
130
|
) from e
|
122
131
|
|
123
|
-
if gmft_config:
|
132
|
+
if gmft_config: # pragma: no cover
|
124
133
|
config_dict["gmft_config"] = gmft_config
|
125
134
|
|
126
135
|
|
@@ -151,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
151
160
|
try:
|
152
161
|
with config_path.open("rb") as f:
|
153
162
|
data = tomllib.load(f)
|
154
|
-
except FileNotFoundError as e:
|
163
|
+
except FileNotFoundError as e: # pragma: no cover
|
155
164
|
raise ValidationError(f"Configuration file not found: {config_path}") from e
|
156
165
|
except tomllib.TOMLDecodeError as e:
|
157
166
|
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
@@ -237,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
237
246
|
|
238
247
|
try:
|
239
248
|
return ExtractionConfig(**extraction_config)
|
240
|
-
except (TypeError, ValueError) as e:
|
249
|
+
except (TypeError, ValueError) as e: # pragma: no cover
|
241
250
|
raise ValidationError(
|
242
251
|
f"Invalid extraction configuration: {e}",
|
243
252
|
context={"config": extraction_config, "error": str(e)},
|
@@ -261,7 +270,7 @@ def build_extraction_config(
|
|
261
270
|
|
262
271
|
try:
|
263
272
|
return ExtractionConfig(**config_dict)
|
264
|
-
except (TypeError, ValueError) as e:
|
273
|
+
except (TypeError, ValueError) as e: # pragma: no cover
|
265
274
|
raise ValidationError(
|
266
275
|
f"Invalid extraction configuration: {e}",
|
267
276
|
context={"config": config_dict, "error": str(e)},
|
@@ -283,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
283
292
|
data = tomllib.load(f)
|
284
293
|
if "tool" in data and "kreuzberg" in data["tool"]:
|
285
294
|
return pyproject_toml
|
286
|
-
except OSError as e:
|
295
|
+
except OSError as e: # pragma: no cover
|
287
296
|
raise ValidationError(
|
288
297
|
f"Failed to read pyproject.toml: {e}",
|
289
298
|
context={"file": str(pyproject_toml), "error": str(e)},
|
kreuzberg/_document_classification.py
CHANGED
@@ -132,7 +132,7 @@ def classify_document_from_layout(
|
|
132
132
|
if not found_words.is_empty():
|
133
133
|
scores[doc_type] += 1.0
|
134
134
|
word_top = found_words[0, "top"]
|
135
|
-
if word_top < page_height * 0.3:
|
135
|
+
if word_top is not None and word_top < page_height * 0.3:
|
136
136
|
scores[doc_type] += 0.5
|
137
137
|
|
138
138
|
total_score = sum(scores.values())
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -96,7 +96,6 @@ class Extractor(ABC):
|
|
96
96
|
)
|
97
97
|
|
98
98
|
def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
|
99
|
-
"""Filter images based on memory safety limits."""
|
100
99
|
if not images:
|
101
100
|
return []
|
102
101
|
|
@@ -142,17 +141,6 @@ class Extractor(ABC):
|
|
142
141
|
_HASH_SAMPLE_SIZE = 512
|
143
142
|
|
144
143
|
def _compute_image_hash(self, img: ExtractedImage) -> int:
|
145
|
-
"""Compute hash for image deduplication using progressive hashing.
|
146
|
-
|
147
|
-
For small images (<1KB), hash the entire content.
|
148
|
-
For larger images, use size + first/last bytes for quick comparison.
|
149
|
-
|
150
|
-
Args:
|
151
|
-
img: Image to hash
|
152
|
-
|
153
|
-
Returns:
|
154
|
-
Hash value for deduplication
|
155
|
-
"""
|
156
144
|
data_len = len(img.data)
|
157
145
|
|
158
146
|
if data_len < self._SMALL_IMAGE_THRESHOLD:
|
@@ -189,14 +177,6 @@ class Extractor(ABC):
|
|
189
177
|
return unique_images
|
190
178
|
|
191
179
|
def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
|
192
|
-
"""Prepare OCR configuration for the specified backend.
|
193
|
-
|
194
|
-
Args:
|
195
|
-
backend_name: Name of the OCR backend
|
196
|
-
|
197
|
-
Returns:
|
198
|
-
Configuration dictionary for the backend
|
199
|
-
"""
|
200
180
|
default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
|
201
181
|
config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
|
202
182
|
|
@@ -222,14 +202,6 @@ class Extractor(ABC):
|
|
222
202
|
return cfg
|
223
203
|
|
224
204
|
def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
|
225
|
-
"""Validate if an image is suitable for OCR processing.
|
226
|
-
|
227
|
-
Args:
|
228
|
-
img: Image to validate
|
229
|
-
|
230
|
-
Returns:
|
231
|
-
Reason for skipping if invalid, None if valid
|
232
|
-
"""
|
233
205
|
fmt = img.format.lower()
|
234
206
|
if fmt not in self.config.image_ocr_formats:
|
235
207
|
return f"Unsupported format: {img.format}"
|
@@ -247,16 +219,6 @@ class Extractor(ABC):
|
|
247
219
|
return None
|
248
220
|
|
249
221
|
async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
|
250
|
-
"""Process a single image with OCR.
|
251
|
-
|
252
|
-
Args:
|
253
|
-
target: Image to process
|
254
|
-
backend: OCR backend instance
|
255
|
-
cfg: Configuration for the backend
|
256
|
-
|
257
|
-
Returns:
|
258
|
-
OCR result for the image
|
259
|
-
"""
|
260
222
|
try:
|
261
223
|
start = time.time()
|
262
224
|
pil_img = Image.open(io.BytesIO(target.data))
|
@@ -284,14 +246,6 @@ class Extractor(ABC):
|
|
284
246
|
async def _process_images_with_ocr(
|
285
247
|
self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
|
286
248
|
) -> list[ImageOCRResult]:
|
287
|
-
"""Process multiple images with OCR.
|
288
|
-
|
289
|
-
Args:
|
290
|
-
images: Tuple or list of images to process
|
291
|
-
|
292
|
-
Returns:
|
293
|
-
List of OCR results
|
294
|
-
"""
|
295
249
|
if not images or not self.config.ocr_extracted_images:
|
296
250
|
return []
|
297
251
|
|
kreuzberg/_extractors/_email.py
CHANGED
@@ -27,6 +27,8 @@ except ImportError: # pragma: no cover
|
|
27
27
|
html2text = None
|
28
28
|
|
29
29
|
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
30
|
+
_UNICODE_QUOTES_PATTERN = re.compile(r"[\u201c\u201d]")
|
31
|
+
_UNICODE_SINGLE_QUOTES_PATTERN = re.compile(r"[\u2018\u2019]")
|
30
32
|
|
31
33
|
|
32
34
|
class EmailExtractor(Extractor):
|
@@ -86,7 +88,14 @@ class EmailExtractor(Extractor):
|
|
86
88
|
def _format_email_field(self, field: Any) -> str:
|
87
89
|
match field:
|
88
90
|
case list():
|
89
|
-
|
91
|
+
emails = []
|
92
|
+
for item in field:
|
93
|
+
if isinstance(item, dict):
|
94
|
+
if email := item.get("email", ""):
|
95
|
+
emails.append(str(email))
|
96
|
+
else:
|
97
|
+
emails.append(str(item))
|
98
|
+
return ", ".join(emails)
|
90
99
|
case dict():
|
91
100
|
return str(field.get("email", ""))
|
92
101
|
case _:
|
@@ -111,12 +120,8 @@ class EmailExtractor(Extractor):
|
|
111
120
|
cleaned = re.sub(r"<style[^>]*>.*?</style>", "", cleaned, flags=re.IGNORECASE | re.DOTALL)
|
112
121
|
clean_html = _HTML_TAG_PATTERN.sub("", cleaned)
|
113
122
|
clean_html = unescape(clean_html)
|
114
|
-
clean_html = (
|
115
|
-
|
116
|
-
.replace("\u201d", '"')
|
117
|
-
.replace("\u2019", "'")
|
118
|
-
.replace("\u2018", "'")
|
119
|
-
)
|
123
|
+
clean_html = _UNICODE_QUOTES_PATTERN.sub('"', clean_html)
|
124
|
+
clean_html = _UNICODE_SINGLE_QUOTES_PATTERN.sub("'", clean_html)
|
120
125
|
text_parts.append(clean_html)
|
121
126
|
|
122
127
|
def _extract_email_attachments(
|
@@ -129,12 +134,12 @@ class EmailExtractor(Extractor):
|
|
129
134
|
for att in attachments:
|
130
135
|
name_val: str = "unknown"
|
131
136
|
if isinstance(att, dict):
|
132
|
-
n = att.get("name")
|
137
|
+
n = att.get("name") or att.get("filename")
|
133
138
|
if isinstance(n, str) and n:
|
134
139
|
name_val = n
|
135
140
|
names.append(name_val)
|
136
|
-
metadata["attachments"] = names
|
137
141
|
if names:
|
142
|
+
metadata["attachments"] = names
|
138
143
|
text_parts.append("Attachments: " + ", ".join(names))
|
139
144
|
|
140
145
|
def _extract_images_from_attachments(self, parsed_email: dict[str, Any]) -> list[ExtractedImage]:
|
@@ -151,7 +156,8 @@ class EmailExtractor(Extractor):
|
|
151
156
|
if not isinstance(mime, str) or not mime.startswith("image/"):
|
152
157
|
continue
|
153
158
|
|
154
|
-
name = att.get("name")
|
159
|
+
name = att.get("name") or att.get("filename")
|
160
|
+
name = name if isinstance(name, str) else None
|
155
161
|
data = att.get("data") or att.get("content") or att.get("payload")
|
156
162
|
raw: bytes | None = None
|
157
163
|
if isinstance(data, (bytes, bytearray)):
|
kreuzberg/_extractors/_html.py
CHANGED
@@ -1,16 +1,20 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import base64
|
4
|
+
import binascii
|
5
|
+
import io
|
4
6
|
import logging
|
5
7
|
from typing import TYPE_CHECKING, ClassVar
|
6
8
|
|
7
9
|
import html_to_markdown
|
8
10
|
from anyio import Path as AsyncPath
|
9
11
|
from bs4 import BeautifulSoup
|
12
|
+
from PIL import Image
|
10
13
|
|
11
14
|
from kreuzberg._extractors._base import MAX_SINGLE_IMAGE_SIZE, Extractor
|
12
15
|
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
13
16
|
from kreuzberg._types import ExtractedImage, ExtractionResult, HTMLToMarkdownConfig
|
17
|
+
from kreuzberg._utils._html_streaming import should_use_streaming
|
14
18
|
from kreuzberg._utils._string import safe_decode
|
15
19
|
from kreuzberg._utils._sync import run_maybe_async, run_sync
|
16
20
|
|
@@ -44,6 +48,11 @@ class HTMLExtractor(Extractor):
|
|
44
48
|
config_dict = config.to_dict()
|
45
49
|
|
46
50
|
html_content = safe_decode(content)
|
51
|
+
|
52
|
+
use_streaming, chunk_size = should_use_streaming(len(content))
|
53
|
+
config_dict["stream_processing"] = use_streaming
|
54
|
+
config_dict["chunk_size"] = chunk_size
|
55
|
+
|
47
56
|
result = html_to_markdown.convert_to_markdown(html_content, **config_dict)
|
48
57
|
|
49
58
|
extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
@@ -89,6 +98,13 @@ class HTMLExtractor(Extractor):
|
|
89
98
|
)
|
90
99
|
continue
|
91
100
|
|
101
|
+
dimensions = None
|
102
|
+
try:
|
103
|
+
with Image.open(io.BytesIO(image_data)) as pil_img:
|
104
|
+
dimensions = pil_img.size
|
105
|
+
except (OSError, ValueError) as e: # pragma: no cover
|
106
|
+
logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
|
107
|
+
|
92
108
|
alt_val = img.get("alt") # type: ignore[union-attr]
|
93
109
|
desc = alt_val if isinstance(alt_val, str) else None
|
94
110
|
images.append(
|
@@ -97,25 +113,36 @@ class HTMLExtractor(Extractor):
|
|
97
113
|
format=format_name,
|
98
114
|
filename=f"embedded_image_{len(images) + 1}.{format_name}",
|
99
115
|
description=desc,
|
116
|
+
dimensions=dimensions,
|
100
117
|
)
|
101
118
|
)
|
102
|
-
except
|
119
|
+
except (ValueError, binascii.Error) as e:
|
103
120
|
logger.warning("Failed to extract base64 image: %s", e)
|
104
121
|
|
105
|
-
|
122
|
+
def extract_svg_safe(svg_element: object) -> ExtractedImage | None:
|
106
123
|
try:
|
107
|
-
svg_content = str(
|
108
|
-
|
124
|
+
svg_content = str(svg_element).encode("utf-8")
|
125
|
+
|
126
|
+
def _get_attr_safe(obj: object, attr: str) -> str | None:
|
127
|
+
get_method = getattr(obj, "get", None)
|
128
|
+
if callable(get_method):
|
129
|
+
result = get_method(attr)
|
130
|
+
return result if isinstance(result, str) else None
|
131
|
+
return None
|
132
|
+
|
133
|
+
title_or_aria = _get_attr_safe(svg_element, "title") or _get_attr_safe(svg_element, "aria-label")
|
109
134
|
desc_svg = title_or_aria if isinstance(title_or_aria, str) else None
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
description=desc_svg,
|
116
|
-
)
|
135
|
+
return ExtractedImage(
|
136
|
+
data=svg_content,
|
137
|
+
format="svg",
|
138
|
+
filename=f"inline_svg_{len(images) + 1}.svg",
|
139
|
+
description=desc_svg,
|
117
140
|
)
|
118
|
-
except
|
141
|
+
except (UnicodeEncodeError, AttributeError) as e:
|
119
142
|
logger.warning("Failed to extract SVG: %s", e)
|
143
|
+
return None
|
144
|
+
|
145
|
+
svg_images = [extract_svg_safe(svg) for svg in soup.find_all("svg")]
|
146
|
+
images.extend(img for img in svg_images if img is not None)
|
120
147
|
|
121
148
|
return images
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
|
|
253
253
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
254
254
|
)
|
255
255
|
|
256
|
-
except FileNotFoundError as e:
|
256
|
+
except FileNotFoundError as e: # pragma: no cover
|
257
257
|
raise MissingDependencyError(
|
258
258
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
259
259
|
) from e
|
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
|
|
491
491
|
"Please install it on your system and make sure its available in $PATH."
|
492
492
|
)
|
493
493
|
|
494
|
-
except (subprocess.SubprocessError, FileNotFoundError) as e:
|
494
|
+
except (subprocess.SubprocessError, FileNotFoundError) as e: # pragma: no cover
|
495
495
|
raise MissingDependencyError(
|
496
496
|
"Pandoc version 2 or above is a required system dependency. "
|
497
497
|
"Please install it on your system and make sure its available in $PATH."
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import asyncio
|
4
3
|
import contextlib
|
5
4
|
import io
|
6
5
|
import logging
|
@@ -41,7 +40,7 @@ from kreuzberg._utils._errors import create_error_context, should_retry
|
|
41
40
|
from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
|
42
41
|
from kreuzberg._utils._resource_managers import pdf_document, pdf_document_sync, pdf_resources_sync
|
43
42
|
from kreuzberg._utils._string import normalize_spaces
|
44
|
-
from kreuzberg._utils._sync import run_maybe_async, run_taskgroup_batched
|
43
|
+
from kreuzberg._utils._sync import run_maybe_async, run_taskgroup, run_taskgroup_batched
|
45
44
|
from kreuzberg._utils._table import generate_table_summary
|
46
45
|
from kreuzberg._utils._tmp import temporary_file, temporary_file_sync
|
47
46
|
from kreuzberg.exceptions import ParsingError
|
@@ -154,7 +153,7 @@ class PDFExtractor(Extractor):
|
|
154
153
|
from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
|
155
154
|
|
156
155
|
tables = extract_tables_sync(path)
|
157
|
-
except ImportError:
|
156
|
+
except ImportError: # pragma: no cover
|
158
157
|
tables = []
|
159
158
|
|
160
159
|
if not self.config.force_ocr and self._validate_extracted_text(text):
|
@@ -231,7 +230,7 @@ class PDFExtractor(Extractor):
|
|
231
230
|
img_counter += 1
|
232
231
|
|
233
232
|
if tasks:
|
234
|
-
results = await
|
233
|
+
results = await run_taskgroup(*tasks)
|
235
234
|
return [img for img in results if img is not None]
|
236
235
|
|
237
236
|
return []
|
@@ -501,7 +500,7 @@ class PDFExtractor(Extractor):
|
|
501
500
|
except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
|
502
501
|
last_exception = e
|
503
502
|
continue
|
504
|
-
except OSError as e:
|
503
|
+
except OSError as e: # pragma: no cover
|
505
504
|
raise ParsingError(f"Failed to parse PDF: {e}") from e
|
506
505
|
|
507
506
|
if last_exception:
|
@@ -521,7 +520,7 @@ class PDFExtractor(Extractor):
|
|
521
520
|
for password in passwords:
|
522
521
|
try:
|
523
522
|
return await extract_pdf_metadata(content, password=password)
|
524
|
-
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
|
523
|
+
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
|
525
524
|
last_exception = e
|
526
525
|
continue
|
527
526
|
|
@@ -539,7 +538,7 @@ class PDFExtractor(Extractor):
|
|
539
538
|
for password in passwords:
|
540
539
|
try:
|
541
540
|
return extract_pdf_metadata_sync(content, password=password)
|
542
|
-
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
|
541
|
+
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
|
543
542
|
last_exception = e
|
544
543
|
continue
|
545
544
|
|
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -142,6 +142,8 @@ class PresentationExtractor(Extractor):
|
|
142
142
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
143
143
|
try:
|
144
144
|
image = shape.image
|
145
|
+
if not image.blob or not isinstance(image.blob, bytes):
|
146
|
+
continue
|
145
147
|
filename = f"slide_{slide_num}_image_{len(images) + 1}.{image.ext}"
|
146
148
|
|
147
149
|
images.append(
|
@@ -162,6 +164,8 @@ class PresentationExtractor(Extractor):
|
|
162
164
|
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
163
165
|
try:
|
164
166
|
image = shape.image
|
167
|
+
if not image.blob or not isinstance(image.blob, bytes):
|
168
|
+
continue
|
165
169
|
filename = f"slide_{slide_num}_group_image_{image_count + len(images) + 1}.{image.ext}"
|
166
170
|
images.append(
|
167
171
|
ExtractedImage(data=image.blob, format=image.ext, filename=filename, page_number=slide_num)
|
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -197,7 +197,6 @@ class SpreadSheetExtractor(Extractor):
|
|
197
197
|
if not data or not any(row for row in data):
|
198
198
|
return f"## {sheet_name}\n\n*Empty sheet*"
|
199
199
|
|
200
|
-
# Normalize row lengths to avoid polars ShapeError
|
201
200
|
if data:
|
202
201
|
max_cols = max(len(row) if row else 0 for row in data)
|
203
202
|
data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data] # type: ignore[list-item]
|