kreuzberg 3.18.0__py3-none-any.whl → 3.19.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in their public registries.
- kreuzberg/_api/main.py +4 -2
- kreuzberg/_entity_extraction.py +4 -8
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +4 -1
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED

@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
     Environment Variables:
         KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
     """
-    default_size = 1024 * 1024 * 1024
+    default_size = 1024 * 1024 * 1024
     try:
         size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
-        # Return default if negative
         return size if size >= 0 else default_size
     except ValueError:
         return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload(  # noqa: PLR0913
     """
     static_config = discover_config_cached()

+    if not data:
+        raise ValidationError("No files provided for extraction", context={"file_count": 0})
+
     min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
     max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
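The upload-size guard keeps its existing semantics: unset, negative, or non-numeric values of KREUZBERG_MAX_UPLOAD_SIZE all resolve to the 1 GiB default. A minimal standalone sketch of that behavior, assuming only what the hunk above shows (the demo values are illustrative):

```python
import os

def get_max_upload_size() -> int:
    # Mirrors the hunk above: default to 1 GiB, fall back on bad input.
    default_size = 1024 * 1024 * 1024
    try:
        size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
        return size if size >= 0 else default_size
    except ValueError:
        return default_size

os.environ["KREUZBERG_MAX_UPLOAD_SIZE"] = "-5"
assert get_max_upload_size() == 1024 * 1024 * 1024  # negative -> default

os.environ["KREUZBERG_MAX_UPLOAD_SIZE"] = "oops"
assert get_max_upload_size() == 1024 * 1024 * 1024  # unparsable -> default
```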
kreuzberg/_entity_extraction.py
CHANGED

@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
    try:
        nlp = spacy.load(model_name)
    except OSError:
-
+
        async def install_model() -> tuple[bool, str | None]:
            """Install model and return success status and error message."""
-            # First try spaCy's built-in download
            try:
                success = await install_spacy_model_with_spacy(model_name)
                if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
            else:
                spacy_error = "spaCy download failed"

-            # If spaCy download failed and uv is available, try uv as fallback
            if is_uv_available():
                try:
                    result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)

            return False, spacy_error

-        # Run the async installation in a sync context
        try:
            success, error_details = anyio.run(install_model)
-        except
-            success, error_details = False,
+        except SystemExit as e:
+            success, error_details = False, f"spaCy CLI exit code: {e.code}"

        if not success:
-            # Generate appropriate error message based on available tools
            if is_uv_available():
                model_url = get_spacy_model_url(model_name)
                manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
        kw_model = KeyBERT()
        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
        return [(kw, float(score)) for kw, score in keywords]
-    except
+    except ValueError:
        return []
    except ImportError as e:  # pragma: no cover
        raise MissingDependencyError.create_for_package(
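For context, the KeyBERT call being wrapped here has a small surface; a standalone sketch of the same extraction, assuming the keybert extra is installed (the sample text is illustrative):

```python
from keybert import KeyBERT

kw_model = KeyBERT()  # loads a sentence-transformers model on first use
keywords = kw_model.extract_keywords(
    "Kreuzberg extracts text, metadata, and tables from documents.",
    top_n=5,
)
# KeyBERT yields (keyword, score) pairs; as in the hunk above, scores are coerced to float.
print([(kw, float(score)) for kw, score in keywords])
```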
kreuzberg/_error_handling.py
ADDED

@@ -0,0 +1,182 @@
+"""Type-safe error handling utilities for extraction pipeline."""
+
+from __future__ import annotations
+
+import traceback
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
+from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
+
+
+def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
+    """Determine if an exception should bubble up or be handled gracefully.
+
+    Args:
+        exception: The exception to classify
+        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
+
+    Returns:
+        True if the exception should bubble up, False if it should be handled gracefully
+    """
+    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
+        return True
+
+    if isinstance(exception, MissingDependencyError):
+        return True
+
+    if isinstance(exception, ValidationError):
+        if context == "batch_processing":
+            return False
+
+        return context != "optional_feature"
+
+    if isinstance(exception, KreuzbergError) and context == "optional_feature":
+        return False
+
+    if context == "batch_processing":
+        return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
+
+    return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
+
+
+class FeatureProcessingError:
+    """Type-safe processing error for extraction features."""
+
+    def __init__(self, feature: str, error: Exception) -> None:
+        self._feature = feature
+        self._error = error
+        self._traceback = traceback.format_exc()
+
+    @property
+    def feature(self) -> str:
+        return self._feature
+
+    @property
+    def error_type(self) -> str:
+        return type(self._error).__name__
+
+    @property
+    def error_message(self) -> str:
+        return str(self._error)
+
+    @property
+    def traceback(self) -> str:
+        return self._traceback
+
+    def to_dict(self) -> ProcessingErrorDict:
+        return {
+            "feature": self.feature,
+            "error_type": self.error_type,
+            "error_message": self.error_message,
+            "traceback": self.traceback,
+        }
+
+
+def safe_feature_execution(
+    feature_name: str,
+    execution_func: Callable[[], Any],
+    default_value: Any,
+    result: ExtractionResult,
+    context: ErrorContextType = "optional_feature",
+) -> Any:
+    """Safely execute a feature extraction function with proper error handling.
+
+    Args:
+        feature_name: Name of the feature being executed
+        execution_func: Function to execute that may raise exceptions
+        default_value: Default value to return if execution fails
+        result: ExtractionResult to update with error information
+        context: The context for exception handling decisions
+
+    Returns:
+        Either the successful result or the default value
+    """
+    try:
+        return execution_func()
+    except Exception as e:
+        if should_exception_bubble_up(e, context):
+            raise
+
+        _add_processing_error(result, FeatureProcessingError(feature_name, e))
+        return default_value
+
+
+def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
+    """Add a processing error to the result metadata in a type-safe way."""
+    if result.metadata is None:
+        result.metadata = {}
+
+    if "processing_errors" not in result.metadata:
+        result.metadata["processing_errors"] = []
+
+    errors_list = result.metadata["processing_errors"]
+    if isinstance(errors_list, list):
+        errors_list.append(error.to_dict())
+    else:
+        result.metadata["processing_errors"] = [error.to_dict()]
+
+
+def preserve_result_with_errors(
+    result: ExtractionResult,
+    errors: list[FeatureProcessingError],
+) -> ExtractionResult:
+    """Preserve a successful extraction result while adding error information.
+
+    This is used when core extraction succeeds but optional features fail.
+
+    Args:
+        result: The successful extraction result
+        errors: List of errors that occurred during optional processing
+
+    Returns:
+        The result with error information added to metadata
+    """
+    for error in errors:
+        _add_processing_error(result, error)
+
+    return result
+
+
+def create_error_result(
+    content: str,
+    mime_type: str,
+    errors: list[FeatureProcessingError],
+    **metadata_kwargs: Any,
+) -> ExtractionResult:
+    """Create an error result with proper type safety.
+
+    Args:
+        content: Error content to include
+        mime_type: MIME type of the result
+        errors: List of errors that occurred
+        **metadata_kwargs: Additional metadata to include
+
+    Returns:
+        An ExtractionResult with error information
+    """
+    metadata: Metadata = {
+        "error": f"Multiple processing errors occurred: {len(errors)} errors",
+        "error_context": {
+            "error_count": len(errors),
+            "errors": [error.to_dict() for error in errors],
+            **metadata_kwargs,
+        },
+        "processing_errors": [error.to_dict() for error in errors],
+    }
+
+    return ExtractionResult(
+        content=content,
+        chunks=[],
+        mime_type=mime_type,
+        metadata=metadata,
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
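Taken together, the new module gives optional features one degrade-gracefully path: fatal errors (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError) and MissingDependencyError re-raise, while recoverable ones are recorded under result.metadata["processing_errors"]. A usage sketch under the API shown above; _StubResult is a hypothetical stand-in, since the helper only touches the result's metadata mapping:

```python
from kreuzberg._error_handling import safe_feature_execution

class _StubResult:
    """Hypothetical stand-in for ExtractionResult; only .metadata is used."""

    def __init__(self) -> None:
        self.metadata: dict = {}

result = _StubResult()

def flaky_feature() -> list[str]:
    raise ImportError("optional dependency missing")  # recoverable in this context

value = safe_feature_execution(
    feature_name="keyword_extraction",
    execution_func=flaky_feature,
    default_value=[],
    result=result,
    context="optional_feature",
)

assert value == []  # default returned instead of propagating
assert result.metadata["processing_errors"][0]["error_type"] == "ImportError"
```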
kreuzberg/_extractors/_base.py
CHANGED

@@ -230,13 +230,13 @@ class Extractor(ABC):
                confidence_score=None,
                processing_time=duration,
            )
-        except
+        except ValueError as e:  # pragma: no cover
            return ImageOCRResult(
                image=target,
                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
                skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
            )
-        except
+        except TypeError as e:  # pragma: no cover
            return ImageOCRResult(
                image=target,
                ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
kreuzberg/_extractors/_html.py
CHANGED

@@ -75,7 +75,7 @@ class HTMLExtractor(Extractor):
        soup = BeautifulSoup(html_content, "xml")

        for img in soup.find_all("img"):
-            src_val = img.get("src")
+            src_val = img.get("src")
            if isinstance(src_val, str) and src_val.startswith("data:image/"):
                try:
                    header, data = src_val.split(",", 1)
@@ -105,7 +105,7 @@ class HTMLExtractor(Extractor):
            except (OSError, ValueError) as e:  # pragma: no cover
                logger.debug("Could not determine image dimensions for %s: %s", format_name, e)

-            alt_val = img.get("alt")
+            alt_val = img.get("alt")
            desc = alt_val if isinstance(alt_val, str) else None
            images.append(
                ExtractedImage(
kreuzberg/_extractors/_pdf.py
CHANGED

@@ -6,7 +6,6 @@ import logging
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict
 from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import (
-    EasyOCRConfig,
     ExtractedImage,
     ExtractionResult,
     ImageOCRResult,
     Metadata,
     OcrBackendType,
-    PaddleOCRConfig,
-    TesseractConfig,
 )
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        content_bytes = path.read_bytes()

+        result: ExtractionResult | None = None
+
        document: Document | None = None
        if self.config.extract_images or self.config.extract_tables:
            document = self._parse_with_password_attempts(content_bytes)

-
-
-
-
+        if not self.config.force_ocr:
+            try:
+                content = self._extract_pdf_searchable_text_sync(path)
+                if self._validate_extracted_text(content):
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+            except ParsingError:
+                pass

-        if
-
+        if not result and self.config.ocr_backend is not None:
+            result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
+
+        if not result:
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+
+        metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
+        result.metadata = metadata

-        tables = []
        if self.config.extract_tables:
            # GMFT is optional dependency ~keep
            try:
                from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415

                tables = extract_tables_sync(path)
+                result.tables = tables
            except ImportError:  # pragma: no cover
-                tables = []
-
-        if not self.config.force_ocr and self._validate_extracted_text(text):
-            text = self._extract_with_playa_sync(path, fallback_text=text)
-
-        text = normalize_spaces(text)
-
-        result = ExtractionResult(
-            content=text,
-            mime_type=PLAIN_TEXT_MIME_TYPE,
-            metadata={},
-            tables=list(tables),
-        )
+                result.tables = []

-
-
-
-
-
-
-
-
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }

        if self.config.extract_images and document:
            images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
        except Exception as e:
            raise ParsingError(f"Failed to extract PDF text: {e}") from e

-    def
+    def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
        temp_files: list[Path] = []
        try:
            with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
                    with pdf_resources_sync(bitmap, page):
                        pil_image.close()

-
+            content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
+            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

        except Exception as e:
            raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
            with contextlib.suppress(OSError):
                p.unlink()

-    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        backend = get_ocr_backend(
+    def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
+        backend = get_ocr_backend(ocr_backend)
        paths = [Path(p) for p in image_paths]

-
-        case "tesseract":
-            config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(config))
-        case "paddleocr":
-            paddle_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(paddle_config))
-        case "easyocr":
-            easy_config = (
-                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-            )
-            results = backend.process_batch_sync(paths, **asdict(easy_config))
-        case _:
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        results = backend.process_batch_sync(paths, **self.config.get_config_dict())

        return "\n\n".join(result.content for result in results)
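The rewritten extract_path_sync is now a first-success chain: searchable text if it validates, then OCR if a backend is configured, then an empty result, with metadata and tables attached to whichever result won. A minimal sketch of that control-flow pattern, with placeholder steps rather than the package's real extractors:

```python
from collections.abc import Callable

def first_success(steps: list[Callable[[], str | None]]) -> str:
    """Try each step in order; the first truthy result wins, else empty."""
    for step in steps:
        try:
            content = step()
        except Exception:
            content = None  # a failing step just falls through to the next
        if content:
            return content
    return ""

text = first_success([
    lambda: None,        # stands in for searchable-text extraction not validating
    lambda: "ocr text",  # stands in for the OCR pass
])
assert text == "ocr text"
```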
kreuzberg/_language_detection.py
CHANGED

@@ -31,5 +31,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
            langs = [result["lang"].lower() for result in results if result.get("lang")]
            return langs if langs else None
        return None
+    except (RuntimeError, OSError, MemoryError):
+        raise
    except Exception:  # noqa: BLE001
        return None
kreuzberg/_ocr/_tesseract.py
CHANGED

@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
            **run_config["remaining_kwargs"],
            "language": run_config["language"],
            "psm": run_config["psm"],
+            "tesseract_format": run_config["tesseract_format"],
+            "ext": run_config["ext"],
+            "output_format": run_config["output_format"],
+            "enable_table_detection": run_config["enable_table_detection"],
        }

        optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    try:
-
-
+        tesseract_format = config_dict.get("tesseract_format", "text")
+        ext = config_dict.get("ext", ".txt")
+        output_format = config_dict.get("output_format", "text")
+        config_dict.get("enable_table_detection", False)
+
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
+            output_base = tmp_file.name.replace(ext, "")

        try:
            language = config_dict.get("language", "eng")
            psm = config_dict.get("psm", 3)

+            # Convert PSM enum to integer value if needed
+            psm_value = psm.value if hasattr(psm, "value") else psm
+
            command = [
                "tesseract",
                image_path,
@@ -1236,13 +1248,16 @@
                "-l",
                language,
                "--psm",
-                str(
+                str(psm_value),
                "--oem",
                "1",
                "--loglevel",
                "OFF",
            ]

+            if tesseract_format != "text":
+                command.append(tesseract_format)
+
            boolean_options = [
                "classify_use_pre_adapted_templates",
                "language_model_ngram_on",
@@ -1275,10 +1290,17 @@
            if result.returncode != 0:
                raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

-            output_file = output_base +
+            output_file = output_base + ext
            with Path(output_file).open(encoding="utf-8") as f:
                text = f.read()

+            # Process based on output format
+            if output_format == "markdown" and tesseract_format == "hocr":
+                # Import here to avoid circular dependency ~keep
+                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
+
+                text = convert_to_markdown(text, heading_style="atx")
+
            text = normalize_spaces(text)

            return {
@@ -1289,8 +1311,8 @@
            }

        finally:
-            for
-                temp_file = output_base +
+            for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
+                temp_file = output_base + possible_ext
                temp_path = Path(temp_file)
                if temp_path.exists():
                    temp_path.unlink()
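The worker now accepts psm either as a plain integer or as an enum member, normalizing before building the command line. The same one-liner in isolation (PSMMode below is a hypothetical stand-in, not the package's enum):

```python
from enum import Enum

class PSMMode(Enum):  # hypothetical stand-in for whatever enum callers pass
    AUTO = 3
    SINGLE_BLOCK = 6

def normalize_psm(psm: PSMMode | int) -> int:
    # Same trick as the diff: unwrap .value if present, else use as-is.
    return psm.value if hasattr(psm, "value") else psm

assert normalize_psm(PSMMode.SINGLE_BLOCK) == 6
assert normalize_psm(3) == 3
```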
kreuzberg/_types.py
CHANGED

@@ -32,6 +32,7 @@ if TYPE_CHECKING:

OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
+ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]


class ConfigDict:
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None


+class ProcessingErrorDict(TypedDict):
+    feature: str
+    """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
+    error_type: str
+    """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
+    error_message: str
+    """Human-readable error message."""
+    traceback: str
+    """Full Python traceback for debugging."""
+
+
class BoundingBox(TypedDict):
    left: int
    """X coordinate of the left edge."""
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
    token_reduction: NotRequired[dict[str, float]]
    """Token reduction statistics including reduction ratios and counts."""
+    processing_errors: NotRequired[list[ProcessingErrorDict]]
+    """List of processing errors that occurred during extraction."""
+    extraction_error: NotRequired[dict[str, Any]]
+    """Error information for critical extraction failures."""


_VALID_METADATA_KEYS = {
@@ -756,6 +772,8 @@
    "message",
    "attributes",
    "token_reduction",
+    "processing_errors",
+    "extraction_error",
}
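With processing_errors promoted to a typed metadata key, partial failures can be inspected directly instead of by string matching. A sketch of consuming it after an extraction (the file path is illustrative; the key is only present when something failed):

```python
from kreuzberg import extract_file_sync

result = extract_file_sync("report.pdf")  # illustrative path

for err in result.metadata.get("processing_errors", []):
    # Each entry is a ProcessingErrorDict as defined above.
    print(f"{err['feature']}: {err['error_type']}: {err['error_message']}")
```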
kreuzberg/cli.py
CHANGED

@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
        input_text = sys.stdin.read()
        input_bytes = input_text.encode("utf-8")

-
-
-
-
-
-
-
-
-
-
-
-
-        except ImportError:  # pragma: no cover
-            content_str = input_bytes.decode("utf-8", errors="ignore").lower()
-            mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
+        # Detect MIME type from content
+        content_str = input_bytes.decode("utf-8", errors="ignore").lower()
+        if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
+            mime_type = "text/html"
+        elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
+            content_str.strip().startswith("[") and content_str.strip().endswith("]")
+        ):
+            mime_type = "application/json"
+        elif content_str.strip().startswith("---") or ":" in content_str[:100]:
+            mime_type = "application/x-yaml"
+        else:
+            mime_type = "text/plain"

+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task("Extracting text...", total=None)
+                return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
            return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
    else:
-
-
-
-
-
-
-
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task(f"Extracting text from {file.name}...", total=None)
+                return extract_file_sync(str(file), config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
            return extract_file_sync(str(file), config=extraction_config)
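The stdin path now sniffs a MIME type from the content itself, checking HTML first, then JSON, then YAML, and defaulting to plain text. The heuristic is easy to exercise on its own; this sketch mirrors the branch order of the hunk above:

```python
def sniff_mime(raw: bytes) -> str:
    content = raw.decode("utf-8", errors="ignore").lower()
    stripped = content.strip()
    if "<html" in content or "<!doctype html" in content or "<body" in content:
        return "text/html"
    if (stripped.startswith("{") and stripped.endswith("}")) or (
        stripped.startswith("[") and stripped.endswith("]")
    ):
        return "application/json"
    if stripped.startswith("---") or ":" in content[:100]:
        return "application/x-yaml"
    return "text/plain"

assert sniff_mime(b"<!DOCTYPE html><html></html>") == "text/html"
assert sniff_mime(b'{"key": "value"}') == "application/json"
assert sniff_mime(b"---\nkey: value\n") == "application/x-yaml"
assert sniff_mime(b"just some words") == "text/plain"
```

Note that the YAML branch is permissive: any colon in the first 100 characters routes non-HTML, non-JSON input to application/x-yaml.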
kreuzberg/extraction.py
CHANGED

@@ -1,6 +1,7 @@
 from __future__ import annotations

 import multiprocessing as mp
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import TYPE_CHECKING, Final, cast
@@ -10,6 +11,7 @@ import anyio
 from kreuzberg._chunker import get_chunker
 from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
-from kreuzberg.exceptions import ValidationError
+from kreuzberg.exceptions import KreuzbergError, ValidationError

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
 def _validate_and_post_process_helper(
     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
 ) -> ExtractionResult:
+    if result.metadata is None:
+        result.metadata = {}
+
     if config.chunk_content:
-        result.chunks =
-
-
-
+        result.chunks = safe_feature_execution(
+            feature_name="chunking",
+            execution_func=lambda: _handle_chunk_content(
+                mime_type=result.mime_type,
+                config=config,
+                content=result.content,
+            ),
+            default_value=[],
+            result=result,
        )

     if config.extract_entities:
-
-
+        result.entities = safe_feature_execution(
+            feature_name="entity_extraction",
+            execution_func=lambda: extract_entities(
                result.content,
                custom_patterns=config.custom_entity_patterns,
-            )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.extract_keywords:
-
-
+        result.keywords = safe_feature_execution(
+            feature_name="keyword_extraction",
+            execution_func=lambda: extract_keywords(
                result.content,
                keyword_count=config.keyword_count,
-            )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.auto_detect_language:
-        lang_config = config.language_detection_config
-        if lang_config is None:
-            from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415

-
+        def _detect_language() -> list[str]:
+            lang_config = config.language_detection_config
+            if lang_config is None:
+                from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
+
+                lang_config = LanguageDetectionConfig(model=config.language_detection_model)

-
-
-
+            return detect_languages(result.content, config=lang_config) or []
+
+        result.detected_languages = safe_feature_execution(
+            feature_name="language_detection",
+            execution_func=_detect_language,
+            default_value=[],
+            result=result,
        )

     if config.auto_detect_document_type:
-        result =
+        result = safe_feature_execution(
+            feature_name="document_type_detection",
+            execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
+            default_value=result,
+            result=result,
+        )

     if config.token_reduction is not None and config.token_reduction.mode != "off":
-        original_content = result.content

-
-
-        language_hint = result.detected_languages[0]
+        def _apply_token_reduction() -> str:
+            original_content = result.content

-
-
-
-
+            language_hint = None
+            if result.detected_languages and len(result.detected_languages) > 0:
+                language_hint = result.detected_languages[0]
+
+            reduced_content = (
+                reduce_tokens(
+                    original_content,
+                    config=config.token_reduction,
+                    language=language_hint,
+                )
+                if config.token_reduction
+                else original_content
+            )
+            reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+            if result.metadata is not None:
+                result.metadata["token_reduction"] = {
+                    "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+                    "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+                    "original_characters": reduction_stats["original_characters"],
+                    "reduced_characters": reduction_stats["reduced_characters"],
+                    "original_tokens": reduction_stats["original_tokens"],
+                    "reduced_tokens": reduction_stats["reduced_tokens"],
+                }
+
+            return reduced_content
+
+        result.content = safe_feature_execution(
+            feature_name="token_reduction",
+            execution_func=_apply_token_reduction,
+            default_value=result.content,
+            result=result,
        )
-        reduction_stats = get_reduction_stats(original_content, reduced_content)
-
-        result.content = reduced_content
-        result.metadata["token_reduction"] = {
-            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
-            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
-            "original_characters": reduction_stats["original_characters"],
-            "reduced_characters": reduction_stats["reduced_characters"],
-            "original_tokens": reduction_stats["original_tokens"],
-            "reduced_tokens": reduction_stats["reduced_tokens"],
-        }

     return result
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(

     result = _validate_and_post_process_helper(result, config, file_path)

-    for post_processor in config.post_processing_hooks or []:
-
+    for i, post_processor in enumerate(config.post_processing_hooks or []):
+        try:
+            result = await run_maybe_sync(post_processor, result)
+        except (KreuzbergError, ValueError, RuntimeError, TypeError) as e:  # noqa: PERF203
+            if result.metadata is None:
+                result.metadata = {}
+            error_list = result.metadata.setdefault("processing_errors", [])
+            if isinstance(error_list, list):
+                error_list.append(
+                    {
+                        "feature": f"post_processing_hook_{i}",
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "traceback": traceback.format_exc(),
+                    }
+                )

     return result
@@ -260,22 +314,18 @@ async def batch_extract_file(
                    config,
                )
                results[index] = result
-            except Exception as e:
-
-
-
-
-
-
-
-
-                        index=index,
-                    ),
-                },
-                chunks=[],
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(
+                    None,
+                    None,
+                    e,
+                    index,
+                    file_path=str(path),
                )
-                results[index] =
+                results[index] = basic_result

    async with anyio.create_task_group() as tg:
        for i, path in enumerate(file_paths):
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
            try:
                result = await extract_bytes(content, mime_type, config)
                results[index] = result
-            except Exception as e:
-
-
-
-
-
-                    "error_context": create_error_context(
-                        operation="batch_extract_bytes",
-                        error=e,
-                        index=index,
-                        mime_type=mime_type,
-                        content_size=len(content),
-                    ),
-                },
-                chunks=[],
-                )
-                results[index] = error_result
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+                results[index] = basic_result

    async with anyio.create_task_group() as tg:
        for i, (content, mime_type) in enumerate(contents):
@@ -334,6 +373,125 @@ async def batch_extract_bytes(
    return results


+def _attempt_basic_extraction(
+    content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
+) -> ExtractionResult:
+    """Attempt basic extraction when full extraction fails, preserving as much as possible.
+
+    This function tries to extract at least basic text content even when advanced
+    features like OCR, entity extraction, etc. fail.
+
+    Args:
+        content: The raw content bytes (None for file extractions)
+        mime_type: The MIME type of the content (None if unknown)
+        original_error: The exception that caused the main extraction to fail
+        index: Index of this content in the batch
+        file_path: Optional file path for file-based extractions
+
+    Returns:
+        A basic ExtractionResult with whatever could be extracted
+    """
+    if (
+        isinstance(original_error, (ValueError, TypeError, ValidationError))
+        or "mock" in str(type(original_error)).lower()
+    ):
+        return ExtractionResult(
+            content=f"Error: {type(original_error).__name__}: {original_error!s}",
+            mime_type="text/plain",
+            metadata={
+                "error": f"{type(original_error).__name__}: {original_error!s}",
+                "error_context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content) if content else 0,
+                    file_path=file_path,
+                ),
+            },
+            chunks=[],
+            entities=[],
+            keywords=[],
+            detected_languages=[],
+            tables=[],
+            images=[],
+            image_ocr_results=[],
+        )
+
+    try:
+        if content is None:
+            return ExtractionResult(
+                content=f"Error: {type(original_error).__name__}: {original_error!s}",
+                mime_type="text/plain",
+                metadata={
+                    "error": f"{type(original_error).__name__}: {original_error!s}",
+                    "error_context": create_error_context(
+                        operation="batch_extract_file",
+                        error=original_error,
+                        index=index,
+                        file_path=file_path,
+                    ),
+                },
+                chunks=[],
+                entities=[],
+                keywords=[],
+                detected_languages=[],
+                tables=[],
+                images=[],
+                image_ocr_results=[],
+            )
+
+        mime_type = validate_mime_type(mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
+            basic_result = extractor.extract_bytes_sync(content)
+
+            if basic_result.metadata is None:
+                basic_result.metadata = {}
+
+            basic_result.metadata["extraction_error"] = {
+                "error_type": type(original_error).__name__,
+                "error_message": str(original_error),
+                "traceback": traceback.format_exc(),
+                "context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content),
+                    file_path=file_path,
+                ),
+                "recovery_mode": "basic_extraction",
+            }
+
+            return basic_result
+
+    except (KreuzbergError, ValueError, RuntimeError, TypeError):
+        pass
+
+    return ExtractionResult(
+        content=f"Error: {type(original_error).__name__}: {original_error!s}",
+        mime_type="text/plain",
+        metadata={
+            "error": f"{type(original_error).__name__}: {original_error!s}",
+            "error_context": create_error_context(
+                operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                error=original_error,
+                index=index,
+                mime_type=mime_type,
+                content_size=len(content) if content else 0,
+                file_path=file_path,
+            ),
+        },
+        chunks=[],
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
+
+
 def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
     """Synchronous version of extract_bytes.

@@ -444,21 +602,18 @@ def batch_extract_file_sync(
                index,
                extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
            )
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(file_path),
            )
-            return (index,
+            return (index, basic_result)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
        """Extract single content with index for ordering."""
        try:
            return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes_sync",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            return (index, error_result)
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            return (index, basic_result)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
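The net effect on the batch APIs: fatal errors re-raise immediately, while any other per-item failure yields a best-effort result whose metadata carries error / extraction_error context instead of aborting the whole batch. A sketch, assuming the public sync API (the malformed PDF bytes are illustrative and merely likely to fail):

```python
from kreuzberg import batch_extract_bytes_sync

payloads = [
    (b"plain text that extracts fine", "text/plain"),
    (b"%PDF-1.7 truncated garbage", "application/pdf"),  # likely to fail
]

results = batch_extract_bytes_sync(payloads)  # one result per payload, order preserved

for result in results:
    degraded = "error" in result.metadata or "extraction_error" in result.metadata
    print("degraded" if degraded else "ok", "->", result.content[:60])
```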
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.18.0
+Version: 3.19.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -42,6 +42,7 @@ Requires-Dist: psutil>=7.1.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: transformers>=4.30.0
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions
 Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
@@ -63,6 +64,7 @@ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
+Requires-Dist: transformers>=4.25.0; extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
@@ -82,6 +84,7 @@ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
 Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
+Requires-Dist: transformers>=4.25.0; extra == 'gmft'
 Provides-Extra: langdetect
 Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
 Provides-Extra: paddleocr
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/RECORD
CHANGED

@@ -4,30 +4,31 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
 kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
 kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
 kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
-kreuzberg/_entity_extraction.py,sha256=
+kreuzberg/_entity_extraction.py,sha256=Ks-1gZIYDqgg2uJerd0FH_lYhjIwS0f0bMVhR9M59jA,7518
+kreuzberg/_error_handling.py,sha256=Isr9yrY4JRKOmUVaUOky_LZ7tGVZAm8jxRD3qGbkc1g,5604
 kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
-kreuzberg/_language_detection.py,sha256=
+kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD2Ks,1143
 kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
 kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
 kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
-kreuzberg/_types.py,sha256=
-kreuzberg/cli.py,sha256=
+kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
+kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
 kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
-kreuzberg/extraction.py,sha256=
+kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
-kreuzberg/_api/main.py,sha256=
+kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_extractors/_base.py,sha256=
+kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
 kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
-kreuzberg/_extractors/_html.py,sha256=
+kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
 kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
 kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
-kreuzberg/_extractors/_pdf.py,sha256=
+kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
 kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
 kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
-kreuzberg/_extractors/_structured.py,sha256=
+kreuzberg/_extractors/_structured.py,sha256=thpXhsBnvaHzGQX4sy6eVHowFv0yaYxLGHwxx4DouCI,8947
 kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
 kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
 kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
@@ -35,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
 kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
 kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
 kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
-kreuzberg/_ocr/_tesseract.py,sha256=
+kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
 kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
 kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
 kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -121,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
 kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
 kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
 kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
+kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.19.0.dist-info/RECORD,,
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL: file without changes
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt: file without changes
{kreuzberg-3.18.0.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE: file without changes