kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +2 -2
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
- kreuzberg-3.13.2.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_chunker.py
CHANGED
@@ -17,21 +17,6 @@ def get_chunker(
|
|
17
17
|
max_characters: int = DEFAULT_MAX_CHARACTERS,
|
18
18
|
overlap_characters: int = DEFAULT_MAX_OVERLAP,
|
19
19
|
) -> MarkdownSplitter | TextSplitter:
|
20
|
-
"""Creates and returns a Chunker object configured with the given maximum
|
21
|
-
characters per chunk and overlap between chunks.
|
22
|
-
|
23
|
-
Args:
|
24
|
-
mime_type: The mime type of the content.
|
25
|
-
max_characters: Maximum number of characters allowed in each chunk.
|
26
|
-
overlap_characters: Number of characters overlapping between two consecutive chunks.
|
27
|
-
|
28
|
-
Raises:
|
29
|
-
MissingDependencyError: if semantic-text-splitter is not installed.
|
30
|
-
|
31
|
-
Returns:
|
32
|
-
Chunker: A Chunker object configured with the specified maximum
|
33
|
-
characters and overlap.
|
34
|
-
"""
|
35
20
|
key = (max_characters, overlap_characters, mime_type)
|
36
21
|
if key not in _chunkers:
|
37
22
|
try:
|
kreuzberg/_config.py
CHANGED
@@ -148,17 +148,6 @@ def _create_ocr_config(
|
|
148
148
|
|
149
149
|
|
150
150
|
def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
151
|
-
"""Load configuration from a TOML file.
|
152
|
-
|
153
|
-
Args:
|
154
|
-
config_path: Path to the configuration file.
|
155
|
-
|
156
|
-
Returns:
|
157
|
-
Dictionary containing the loaded configuration.
|
158
|
-
|
159
|
-
Raises:
|
160
|
-
ValidationError: If the file cannot be read or parsed.
|
161
|
-
"""
|
162
151
|
try:
|
163
152
|
with config_path.open("rb") as f:
|
164
153
|
data = tomllib.load(f)
|
@@ -177,15 +166,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
177
166
|
|
178
167
|
|
179
168
|
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
|
180
|
-
"""Merge two configuration dictionaries recursively.
|
181
|
-
|
182
|
-
Args:
|
183
|
-
base: Base configuration dictionary.
|
184
|
-
override: Configuration dictionary to override base values.
|
185
|
-
|
186
|
-
Returns:
|
187
|
-
Merged configuration dictionary.
|
188
|
-
"""
|
189
169
|
result = base.copy()
|
190
170
|
for key, value in override.items():
|
191
171
|
if isinstance(value, dict) and key in result and isinstance(result[key], dict):
|
@@ -198,18 +178,6 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
|
|
198
178
|
def parse_ocr_backend_config(
|
199
179
|
config_dict: dict[str, Any], backend: OcrBackendType
|
200
180
|
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
|
201
|
-
"""Parse OCR backend-specific configuration.
|
202
|
-
|
203
|
-
Args:
|
204
|
-
config_dict: Configuration dictionary.
|
205
|
-
backend: The OCR backend type.
|
206
|
-
|
207
|
-
Returns:
|
208
|
-
Backend-specific configuration object or None.
|
209
|
-
|
210
|
-
Raises:
|
211
|
-
ValidationError: If the backend configuration is invalid.
|
212
|
-
"""
|
213
181
|
if backend not in config_dict:
|
214
182
|
return None
|
215
183
|
|
@@ -230,17 +198,6 @@ def parse_ocr_backend_config(
|
|
230
198
|
|
231
199
|
|
232
200
|
def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
|
233
|
-
"""Build ExtractionConfig from a configuration dictionary.
|
234
|
-
|
235
|
-
Args:
|
236
|
-
config_dict: Configuration dictionary from TOML file.
|
237
|
-
|
238
|
-
Returns:
|
239
|
-
ExtractionConfig instance.
|
240
|
-
|
241
|
-
Raises:
|
242
|
-
ValidationError: If the configuration is invalid.
|
243
|
-
"""
|
244
201
|
extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
|
245
202
|
|
246
203
|
ocr_backend = extraction_config.get("ocr_backend")
|
@@ -288,18 +245,6 @@ def build_extraction_config(
|
|
288
245
|
file_config: dict[str, Any],
|
289
246
|
cli_args: MutableMapping[str, Any],
|
290
247
|
) -> ExtractionConfig:
|
291
|
-
"""Build ExtractionConfig from file config and CLI arguments.
|
292
|
-
|
293
|
-
Args:
|
294
|
-
file_config: Configuration loaded from file.
|
295
|
-
cli_args: CLI arguments.
|
296
|
-
|
297
|
-
Returns:
|
298
|
-
ExtractionConfig instance.
|
299
|
-
|
300
|
-
Raises:
|
301
|
-
ValidationError: If the combined configuration is invalid.
|
302
|
-
"""
|
303
248
|
config_dict: dict[str, Any] = {}
|
304
249
|
|
305
250
|
_merge_file_config(config_dict, file_config)
|
@@ -321,21 +266,6 @@ def build_extraction_config(
|
|
321
266
|
|
322
267
|
|
323
268
|
def find_config_file(start_path: Path | None = None) -> Path | None:
|
324
|
-
"""Find configuration file by searching up the directory tree.
|
325
|
-
|
326
|
-
Searches for configuration files in the following order:
|
327
|
-
1. kreuzberg.toml
|
328
|
-
2. pyproject.toml (with [tool.kreuzberg] section)
|
329
|
-
|
330
|
-
Args:
|
331
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
332
|
-
|
333
|
-
Returns:
|
334
|
-
Path to the configuration file or None if not found.
|
335
|
-
|
336
|
-
Raises:
|
337
|
-
ValidationError: If a config file exists but cannot be read or has invalid TOML.
|
338
|
-
"""
|
339
269
|
current = start_path or Path.cwd()
|
340
270
|
|
341
271
|
while current != current.parent:
|
@@ -366,17 +296,6 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
366
296
|
|
367
297
|
|
368
298
|
def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
|
369
|
-
"""Load the default configuration from discovered config file.
|
370
|
-
|
371
|
-
Args:
|
372
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
373
|
-
|
374
|
-
Returns:
|
375
|
-
ExtractionConfig instance or None if no configuration found.
|
376
|
-
|
377
|
-
Raises:
|
378
|
-
ValidationError: If configuration file exists but contains invalid configuration.
|
379
|
-
"""
|
380
299
|
config_path = find_config_file(start_path)
|
381
300
|
if not config_path:
|
382
301
|
return None
|
@@ -388,34 +307,12 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
|
|
388
307
|
|
389
308
|
|
390
309
|
def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
|
391
|
-
"""Load configuration from a specific file path.
|
392
|
-
|
393
|
-
Args:
|
394
|
-
config_path: Path to the configuration file.
|
395
|
-
|
396
|
-
Returns:
|
397
|
-
ExtractionConfig instance.
|
398
|
-
|
399
|
-
Raises:
|
400
|
-
ValidationError: If the file cannot be read, parsed, or is invalid.
|
401
|
-
"""
|
402
310
|
path = Path(config_path)
|
403
311
|
config_dict = load_config_from_file(path)
|
404
312
|
return build_extraction_config_from_dict(config_dict)
|
405
313
|
|
406
314
|
|
407
315
|
def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
|
408
|
-
"""Load configuration by discovering config files in the directory tree.
|
409
|
-
|
410
|
-
Args:
|
411
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
412
|
-
|
413
|
-
Returns:
|
414
|
-
ExtractionConfig instance.
|
415
|
-
|
416
|
-
Raises:
|
417
|
-
ValidationError: If no configuration file is found or if the file is invalid.
|
418
|
-
"""
|
419
316
|
search_path = Path(start_path) if start_path else None
|
420
317
|
config_path = find_config_file(search_path)
|
421
318
|
|
@@ -436,19 +333,6 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
|
|
436
333
|
|
437
334
|
|
438
335
|
def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
|
439
|
-
"""Discover and load configuration, returning None if no config file found.
|
440
|
-
|
441
|
-
If a config file is found, attempts to load it. Any errors during loading will bubble up.
|
442
|
-
|
443
|
-
Args:
|
444
|
-
start_path: Directory to start searching from. Defaults to current working directory.
|
445
|
-
|
446
|
-
Returns:
|
447
|
-
ExtractionConfig instance or None if no configuration file found.
|
448
|
-
|
449
|
-
Raises:
|
450
|
-
ValidationError: If a configuration file exists but is invalid.
|
451
|
-
"""
|
452
336
|
search_path = Path(start_path) if start_path else None
|
453
337
|
config_path = find_config_file(search_path)
|
454
338
|
|
@@ -462,12 +346,4 @@ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig |
|
|
462
346
|
|
463
347
|
|
464
348
|
def find_default_config() -> Path | None:
|
465
|
-
"""Find the default configuration file (pyproject.toml).
|
466
|
-
|
467
|
-
Returns:
|
468
|
-
Path to the configuration file or None if not found.
|
469
|
-
|
470
|
-
Note:
|
471
|
-
This function is deprecated. Use find_config_file() instead.
|
472
|
-
"""
|
473
349
|
return find_config_file()
|
kreuzberg/_document_classification.py
CHANGED
@@ -3,6 +3,8 @@ from __future__ import annotations
|
|
3
3
|
import re
|
4
4
|
from typing import TYPE_CHECKING
|
5
5
|
|
6
|
+
import polars as pl
|
7
|
+
|
6
8
|
from kreuzberg._ocr import get_ocr_backend
|
7
9
|
from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
|
8
10
|
from kreuzberg.exceptions import MissingDependencyError
|
@@ -40,17 +42,6 @@ DOCUMENT_CLASSIFIERS = {
|
|
40
42
|
|
41
43
|
|
42
44
|
def _get_translated_text(result: ExtractionResult) -> str:
|
43
|
-
"""Translate extracted text to English using Google Translate API.
|
44
|
-
|
45
|
-
Args:
|
46
|
-
result: ExtractionResult containing the text to be translated
|
47
|
-
|
48
|
-
Returns:
|
49
|
-
str: The translated text in lowercase English
|
50
|
-
|
51
|
-
Raises:
|
52
|
-
MissingDependencyError: If the deep-translator package is not installed
|
53
|
-
"""
|
54
45
|
text_to_classify = result.content
|
55
46
|
if result.metadata:
|
56
47
|
metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
|
@@ -70,16 +61,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
|
|
70
61
|
|
71
62
|
|
72
63
|
def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
|
73
|
-
"""Classifies the document type based on keywords and patterns.
|
74
|
-
|
75
|
-
Args:
|
76
|
-
result: The extraction result containing the content.
|
77
|
-
config: The extraction configuration.
|
78
|
-
|
79
|
-
Returns:
|
80
|
-
A tuple containing the detected document type and the confidence score,
|
81
|
-
or (None, None) if no type is detected with sufficient confidence.
|
82
|
-
"""
|
83
64
|
if not config.auto_detect_document_type:
|
84
65
|
return None, None
|
85
66
|
|
@@ -108,27 +89,17 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
|
|
108
89
|
def classify_document_from_layout(
|
109
90
|
result: ExtractionResult, config: ExtractionConfig
|
110
91
|
) -> tuple[str | None, float | None]:
|
111
|
-
"""Classifies the document type based on layout information from OCR.
|
112
|
-
|
113
|
-
Args:
|
114
|
-
result: The extraction result containing the layout data.
|
115
|
-
config: The extraction configuration.
|
116
|
-
|
117
|
-
Returns:
|
118
|
-
A tuple containing the detected document type and the confidence score,
|
119
|
-
or (None, None) if no type is detected with sufficient confidence.
|
120
|
-
"""
|
121
92
|
if not config.auto_detect_document_type:
|
122
93
|
return None, None
|
123
94
|
|
124
|
-
if result.layout is None or result.layout.
|
95
|
+
if result.layout is None or result.layout.is_empty():
|
125
96
|
return None, None
|
126
97
|
|
127
98
|
layout_df = result.layout
|
128
99
|
if not all(col in layout_df.columns for col in ["text", "top", "height"]):
|
129
100
|
return None, None
|
130
101
|
|
131
|
-
layout_text = " ".join(layout_df["text"].
|
102
|
+
layout_text = " ".join(layout_df["text"].cast(str).to_list())
|
132
103
|
|
133
104
|
text_to_classify = layout_text
|
134
105
|
if result.metadata:
|
@@ -142,17 +113,27 @@ def classify_document_from_layout(
|
|
142
113
|
except Exception: # noqa: BLE001
|
143
114
|
translated_text = text_to_classify.lower()
|
144
115
|
|
145
|
-
layout_df
|
116
|
+
layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
|
146
117
|
|
147
|
-
|
118
|
+
try:
|
119
|
+
layout_df = layout_df.with_columns(
|
120
|
+
[pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
|
121
|
+
)
|
122
|
+
|
123
|
+
page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
|
124
|
+
if page_height_val is None:
|
125
|
+
page_height_val = 0.0
|
126
|
+
page_height = float(page_height_val)
|
127
|
+
except Exception: # noqa: BLE001
|
128
|
+
page_height = 1000.0
|
148
129
|
scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
|
149
130
|
|
150
131
|
for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
|
151
132
|
for pattern in patterns:
|
152
|
-
found_words = layout_df
|
153
|
-
if not found_words.
|
133
|
+
found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
|
134
|
+
if not found_words.is_empty():
|
154
135
|
scores[doc_type] += 1.0
|
155
|
-
word_top = found_words
|
136
|
+
word_top = found_words[0, "top"]
|
156
137
|
if word_top < page_height * 0.3:
|
157
138
|
scores[doc_type] += 0.5
|
158
139
|
|
@@ -176,7 +157,7 @@ def auto_detect_document_type(
|
|
176
157
|
if config.document_classification_mode == "vision" and file_path:
|
177
158
|
layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
|
178
159
|
result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
|
179
|
-
elif result.layout is not None and not result.layout.
|
160
|
+
elif result.layout is not None and not result.layout.is_empty():
|
180
161
|
result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
|
181
162
|
else:
|
182
163
|
result.document_type, result.document_type_confidence = classify_document(result, config)
|
kreuzberg/_entity_extraction.py
CHANGED
@@ -19,21 +19,6 @@ def extract_entities(
|
|
19
19
|
languages: list[str] | None = None,
|
20
20
|
spacy_config: SpacyEntityExtractionConfig | None = None,
|
21
21
|
) -> list[Entity]:
|
22
|
-
"""Extract entities from text using custom regex patterns and/or a NER model.
|
23
|
-
|
24
|
-
Args:
|
25
|
-
text: The input text to extract entities from.
|
26
|
-
entity_types: List of entity types to extract using the NER model.
|
27
|
-
custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
|
28
|
-
languages: List of detected languages to choose appropriate spaCy models.
|
29
|
-
spacy_config: Configuration for spaCy entity extraction.
|
30
|
-
|
31
|
-
Returns:
|
32
|
-
list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
|
33
|
-
|
34
|
-
Raises:
|
35
|
-
MissingDependencyError: If `spacy` is not installed.
|
36
|
-
"""
|
37
22
|
entities: list[Entity] = []
|
38
23
|
if custom_patterns:
|
39
24
|
for ent_type, pattern in custom_patterns:
|
@@ -85,7 +70,6 @@ def extract_entities(
|
|
85
70
|
|
86
71
|
@lru_cache(maxsize=32)
|
87
72
|
def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
88
|
-
"""Load a spaCy model with caching."""
|
89
73
|
try:
|
90
74
|
import spacy # noqa: PLC0415
|
91
75
|
|
@@ -102,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
102
86
|
|
103
87
|
|
104
88
|
def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
105
|
-
"""Select the best spaCy model based on detected languages."""
|
106
89
|
if not languages:
|
107
90
|
return spacy_config.get_model_for_language("en")
|
108
91
|
|
@@ -118,18 +101,6 @@ def extract_keywords(
|
|
118
101
|
text: str,
|
119
102
|
keyword_count: int = 10,
|
120
103
|
) -> list[tuple[str, float]]:
|
121
|
-
"""Extract keywords from text using the KeyBERT model.
|
122
|
-
|
123
|
-
Args:
|
124
|
-
text: The input text to extract keywords from.
|
125
|
-
keyword_count: Number of top keywords to return. Defaults to 10.
|
126
|
-
|
127
|
-
Returns:
|
128
|
-
list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
|
129
|
-
|
130
|
-
Raises:
|
131
|
-
MissingDependencyError: If `keybert` is not installed.
|
132
|
-
"""
|
133
104
|
try:
|
134
105
|
from keybert import KeyBERT # noqa: PLC0415
|
135
106
|
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -13,20 +13,6 @@ if TYPE_CHECKING:
|
|
13
13
|
|
14
14
|
|
15
15
|
class Extractor(ABC):
|
16
|
-
"""Abstract base class for file content extraction.
|
17
|
-
|
18
|
-
This class provides the interface for different types of content extractors.
|
19
|
-
Subclasses are expected to implement the methods for extracting content
|
20
|
-
either asynchronously or synchronously and determining the supported MIME types.
|
21
|
-
|
22
|
-
Attributes:
|
23
|
-
SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
|
24
|
-
|
25
|
-
Args:
|
26
|
-
mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
|
27
|
-
config: Configuration options for the extraction process.
|
28
|
-
"""
|
29
|
-
|
30
16
|
__slots__ = ("config", "mime_type")
|
31
17
|
|
32
18
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]]
|
@@ -36,72 +22,24 @@ class Extractor(ABC):
|
|
36
22
|
self.config = config
|
37
23
|
|
38
24
|
@abstractmethod
|
39
|
-
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
40
|
-
"""Asynchronously extract content from a byte stream.
|
41
|
-
|
42
|
-
Args:
|
43
|
-
content: The byte content to extract.
|
44
|
-
|
45
|
-
Returns:
|
46
|
-
ExtractionResult: The extracted content along with metadata about the extraction.
|
47
|
-
"""
|
25
|
+
async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
|
48
26
|
|
49
27
|
@abstractmethod
|
50
|
-
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
51
|
-
"""Asynchronously extract content from a file located at the specified path.
|
52
|
-
|
53
|
-
Args:
|
54
|
-
path: The path to the file to process.
|
55
|
-
|
56
|
-
Returns:
|
57
|
-
ExtractionResult: The extracted content along with metadata about the extraction.
|
58
|
-
"""
|
28
|
+
async def extract_path_async(self, path: Path) -> ExtractionResult: ...
|
59
29
|
|
60
30
|
@abstractmethod
|
61
|
-
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
62
|
-
"""Synchronously extract content from a byte stream.
|
63
|
-
|
64
|
-
Args:
|
65
|
-
content: The byte content to extract.
|
66
|
-
|
67
|
-
Returns:
|
68
|
-
ExtractionResult: The extracted content along with metadata about the extraction.
|
69
|
-
"""
|
31
|
+
def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
|
70
32
|
|
71
33
|
@abstractmethod
|
72
|
-
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
73
|
-
"""Synchronously extract content from a file located at the specified path.
|
74
|
-
|
75
|
-
Args:
|
76
|
-
path: The path to the file to process.
|
77
|
-
|
78
|
-
Returns:
|
79
|
-
ExtractionResult: The extracted content along with metadata about the extraction.
|
80
|
-
"""
|
34
|
+
def extract_path_sync(self, path: Path) -> ExtractionResult: ...
|
81
35
|
|
82
36
|
@classmethod
|
83
37
|
def supports_mimetype(cls, mime_type: str) -> bool:
|
84
|
-
"""Verify whether the extractor supports the given MIME type.
|
85
|
-
|
86
|
-
Args:
|
87
|
-
mime_type: The MIME type to check (e.g., "application/pdf").
|
88
|
-
|
89
|
-
Returns:
|
90
|
-
bool: True if the MIME type is supported, False otherwise.
|
91
|
-
"""
|
92
38
|
return mime_type in cls.SUPPORTED_MIME_TYPES or any(
|
93
39
|
mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
|
94
40
|
)
|
95
41
|
|
96
42
|
def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
|
97
|
-
"""Apply quality post-processing to extraction result if enabled.
|
98
|
-
|
99
|
-
Args:
|
100
|
-
result: The raw extraction result
|
101
|
-
|
102
|
-
Returns:
|
103
|
-
Enhanced extraction result with quality improvements (if enabled)
|
104
|
-
"""
|
105
43
|
if not self.config.enable_quality_processing:
|
106
44
|
return result
|
107
45
|
|
kreuzberg/_extractors/_email.py
CHANGED
@@ -42,7 +42,6 @@ class EmailExtractor(Extractor):
|
|
42
42
|
def _extract_email_headers(
|
43
43
|
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
44
44
|
) -> None:
|
45
|
-
"""Extract and process email headers."""
|
46
45
|
subject = parsed_email.get("subject")
|
47
46
|
if subject:
|
48
47
|
metadata["subject"] = subject
|
@@ -85,7 +84,6 @@ class EmailExtractor(Extractor):
|
|
85
84
|
text_parts.append(f"BCC: {bcc_formatted}")
|
86
85
|
|
87
86
|
def _format_email_field(self, field: Any) -> str:
|
88
|
-
"""Format email field (to, cc, bcc) for display."""
|
89
87
|
if isinstance(field, list):
|
90
88
|
emails = []
|
91
89
|
for item in field:
|
@@ -101,7 +99,6 @@ class EmailExtractor(Extractor):
|
|
101
99
|
return str(field)
|
102
100
|
|
103
101
|
def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
|
104
|
-
"""Extract and process email body content."""
|
105
102
|
text_content = parsed_email.get("text")
|
106
103
|
if text_content:
|
107
104
|
text_parts.append(f"\n{text_content}")
|
@@ -123,7 +120,6 @@ class EmailExtractor(Extractor):
|
|
123
120
|
def _extract_email_attachments(
|
124
121
|
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
125
122
|
) -> None:
|
126
|
-
"""Extract and process email attachments info."""
|
127
123
|
if parsed_email.get("attachments"):
|
128
124
|
attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
|
129
125
|
metadata["attachments"] = attachment_names
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -61,7 +61,6 @@ class ImageExtractor(Extractor):
|
|
61
61
|
return self._apply_quality_processing(result)
|
62
62
|
|
63
63
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
64
|
-
"""Pure sync implementation of extract_bytes."""
|
65
64
|
extension = self._get_extension_from_mime_type(self.mime_type)
|
66
65
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
67
66
|
|
@@ -75,7 +74,6 @@ class ImageExtractor(Extractor):
|
|
75
74
|
Path(temp_path).unlink()
|
76
75
|
|
77
76
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
78
|
-
"""Pure sync implementation of extract_path."""
|
79
77
|
if self.config.ocr_backend is None:
|
80
78
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
81
79
|
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -84,8 +84,6 @@ NodeType = Literal[
|
|
84
84
|
|
85
85
|
|
86
86
|
class PandocExtractor(Extractor):
|
87
|
-
"""Extractor for documents supported by Pandoc."""
|
88
|
-
|
89
87
|
_checked_version: bool = False
|
90
88
|
|
91
89
|
MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
|
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
|
|
153
151
|
}
|
154
152
|
|
155
153
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
156
|
-
"""Extract text and metadata from bytes content using Pandoc.
|
157
|
-
|
158
|
-
Args:
|
159
|
-
content: The content bytes to process.
|
160
|
-
|
161
|
-
Returns:
|
162
|
-
ExtractionResult with the extracted text and metadata.
|
163
|
-
"""
|
164
154
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
165
155
|
input_file, unlink = await create_temp_file(f".{extension}")
|
166
156
|
|
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
|
|
171
161
|
await unlink()
|
172
162
|
|
173
163
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
174
|
-
"""Extract text and metadata from a file using Pandoc.
|
175
|
-
|
176
|
-
Args:
|
177
|
-
path: The path to the file to process.
|
178
|
-
|
179
|
-
Raises:
|
180
|
-
ParsingError: If the file data could not be extracted.
|
181
|
-
|
182
|
-
Returns:
|
183
|
-
ExtractionResult with the extracted text and metadata.
|
184
|
-
"""
|
185
164
|
await self._validate_pandoc_version()
|
186
165
|
self._get_pandoc_type_from_mime_type(self.mime_type)
|
187
166
|
|
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
|
|
198
177
|
raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
|
199
178
|
|
200
179
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
201
|
-
"""Pure sync implementation of extract_bytes.
|
202
|
-
|
203
|
-
Args:
|
204
|
-
content: The content bytes to process.
|
205
|
-
|
206
|
-
Returns:
|
207
|
-
ExtractionResult with the extracted text and metadata.
|
208
|
-
"""
|
209
180
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
210
181
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
211
182
|
|
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
|
|
219
190
|
Path(temp_path).unlink()
|
220
191
|
|
221
192
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
222
|
-
"""Pure sync implementation of extract_path.
|
223
|
-
|
224
|
-
Args:
|
225
|
-
path: The path to the file to process.
|
226
|
-
|
227
|
-
Returns:
|
228
|
-
ExtractionResult with the extracted text and metadata.
|
229
|
-
|
230
|
-
Raises:
|
231
|
-
ParsingError: When file processing fails.
|
232
|
-
"""
|
233
193
|
self._validate_pandoc_version_sync()
|
234
194
|
self._get_pandoc_type_from_mime_type(self.mime_type)
|
235
195
|
|
@@ -612,8 +572,6 @@ class PandocExtractor(Extractor):
|
|
612
572
|
|
613
573
|
|
614
574
|
class MarkdownExtractor(PandocExtractor):
|
615
|
-
"""Extractor for Markdown-based document formats."""
|
616
|
-
|
617
575
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
618
576
|
"text/x-markdown",
|
619
577
|
"text/x-commonmark",
|
@@ -625,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
|
|
625
583
|
|
626
584
|
|
627
585
|
class OfficeDocumentExtractor(PandocExtractor):
|
628
|
-
"""Extractor for Office document formats (Word, ODT)."""
|
629
|
-
|
630
586
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
631
587
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
632
588
|
"application/vnd.oasis.opendocument.text",
|
@@ -634,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
|
|
634
590
|
|
635
591
|
|
636
592
|
class EbookExtractor(PandocExtractor):
|
637
|
-
"""Extractor for e-book formats (EPUB, FB2)."""
|
638
|
-
|
639
593
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
640
594
|
"application/epub+zip",
|
641
595
|
"application/x-fictionbook+xml",
|
@@ -643,8 +597,6 @@ class EbookExtractor(PandocExtractor):
|
|
643
597
|
|
644
598
|
|
645
599
|
class StructuredTextExtractor(PandocExtractor):
|
646
|
-
"""Extractor for structured text formats (RST, Org, etc.)."""
|
647
|
-
|
648
600
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
649
601
|
"text/x-rst",
|
650
602
|
"text/x-org",
|
@@ -654,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
|
|
654
606
|
|
655
607
|
|
656
608
|
class LaTeXExtractor(PandocExtractor):
|
657
|
-
"""Extractor for LaTeX and Typst documents."""
|
658
|
-
|
659
609
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
660
610
|
"application/x-latex",
|
661
611
|
"application/x-typst",
|
@@ -663,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
|
|
663
613
|
|
664
614
|
|
665
615
|
class BibliographyExtractor(PandocExtractor):
|
666
|
-
"""Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
|
667
|
-
|
668
616
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
669
617
|
"application/x-bibtex",
|
670
618
|
"application/x-biblatex",
|
@@ -675,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
|
|
675
623
|
|
676
624
|
|
677
625
|
class XMLBasedExtractor(PandocExtractor):
|
678
|
-
"""Extractor for XML-based document formats (DocBook, JATS, OPML)."""
|
679
|
-
|
680
626
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
681
627
|
"application/docbook+xml",
|
682
628
|
"application/x-jats+xml",
|
@@ -685,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
|
|
685
631
|
|
686
632
|
|
687
633
|
class TabularDataExtractor(PandocExtractor):
|
688
|
-
"""Extractor for tabular data formats (CSV, TSV)."""
|
689
|
-
|
690
634
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
691
635
|
"text/csv",
|
692
636
|
"text/tab-separated-values",
|
@@ -694,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
|
|
694
638
|
|
695
639
|
|
696
640
|
class MiscFormatExtractor(PandocExtractor):
|
697
|
-
"""Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
|
698
|
-
|
699
641
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
700
642
|
"application/rtf",
|
701
643
|
"text/troff",
|