kreuzberg-3.17.3-py3-none-any.whl → kreuzberg-3.19.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- kreuzberg/_api/main.py +45 -3
- kreuzberg/_entity_extraction.py +108 -18
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +7 -4
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_ocr/_tesseract.py
CHANGED
```diff
@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             **run_config["remaining_kwargs"],
             "language": run_config["language"],
             "psm": run_config["psm"],
+            "tesseract_format": run_config["tesseract_format"],
+            "ext": run_config["ext"],
+            "output_format": run_config["output_format"],
+            "enable_table_detection": run_config["enable_table_detection"],
         }
 
         optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
```
```diff
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
     config_dict: dict[str, Any],
 ) -> dict[str, Any]:
     try:
-
-
+        tesseract_format = config_dict.get("tesseract_format", "text")
+        ext = config_dict.get("ext", ".txt")
+        output_format = config_dict.get("output_format", "text")
+        config_dict.get("enable_table_detection", False)
+
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
+            output_base = tmp_file.name.replace(ext, "")
 
         try:
             language = config_dict.get("language", "eng")
             psm = config_dict.get("psm", 3)
 
+            # Convert PSM enum to integer value if needed
+            psm_value = psm.value if hasattr(psm, "value") else psm
+
             command = [
                 "tesseract",
                 image_path,
```
```diff
@@ -1236,13 +1248,16 @@ def _process_image_with_tesseract(
                 "-l",
                 language,
                 "--psm",
-                str(psm),
+                str(psm_value),
                 "--oem",
                 "1",
                 "--loglevel",
                 "OFF",
             ]
 
+            if tesseract_format != "text":
+                command.append(tesseract_format)
+
             boolean_options = [
                 "classify_use_pre_adapted_templates",
                 "language_model_ngram_on",
```
```diff
@@ -1275,10 +1290,17 @@ def _process_image_with_tesseract(
             if result.returncode != 0:
                 raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
 
-            output_file = output_base + ".txt"
+            output_file = output_base + ext
             with Path(output_file).open(encoding="utf-8") as f:
                 text = f.read()
 
+            # Process based on output format
+            if output_format == "markdown" and tesseract_format == "hocr":
+                # Import here to avoid circular dependency ~keep
+                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
+
+                text = convert_to_markdown(text, heading_style="atx")
+
             text = normalize_spaces(text)
 
             return {
```
```diff
@@ -1289,8 +1311,8 @@ def _process_image_with_tesseract(
             }
 
         finally:
-            for
-                temp_file = output_base +
+            for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
+                temp_file = output_base + possible_ext
                 temp_path = Path(temp_file)
                 if temp_path.exists():
                     temp_path.unlink()
```
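Taken together, these hunks let `_process_image_with_tesseract` emit hOCR and post-process it into Markdown. A minimal standalone sketch of that flow, assuming a `tesseract` binary on PATH (the helper name and paths below are illustrative, not kreuzberg's API):

```python
import subprocess
from html_to_markdown import convert_to_markdown  # same converter the diff imports

def ocr_image_to_markdown(image_path: str, output_base: str, language: str = "eng") -> str:
    # Appending the "hocr" config makes tesseract write <output_base>.hocr
    subprocess.run(
        ["tesseract", image_path, output_base, "-l", language, "--psm", "3", "--oem", "1", "hocr"],
        check=True,
    )
    with open(output_base + ".hocr", encoding="utf-8") as f:
        hocr = f.read()
    # heading_style="atx" mirrors the call added above
    return convert_to_markdown(hocr, heading_style="atx")
```

Note that the new code reads `enable_table_detection` out of `config_dict` but discards the value; presumably a placeholder for a table-detection path.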
kreuzberg/_types.py
CHANGED
```diff
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
 
 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
+ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]
 
 
 class ConfigDict:
```
```diff
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
         return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
+class ProcessingErrorDict(TypedDict):
+    feature: str
+    """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
+    error_type: str
+    """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
+    error_message: str
+    """Human-readable error message."""
+    traceback: str
+    """Full Python traceback for debugging."""
+
+
 class BoundingBox(TypedDict):
     left: int
     """X coordinate of the left edge."""
```
```diff
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
     """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
     token_reduction: NotRequired[dict[str, float]]
     """Token reduction statistics including reduction ratios and counts."""
+    processing_errors: NotRequired[list[ProcessingErrorDict]]
+    """List of processing errors that occurred during extraction."""
+    extraction_error: NotRequired[dict[str, Any]]
+    """Error information for critical extraction failures."""
 
 
 _VALID_METADATA_KEYS = {
```
```diff
@@ -756,6 +772,8 @@ _VALID_METADATA_KEYS = {
     "message",
     "attributes",
     "token_reduction",
+    "processing_errors",
+    "extraction_error",
 }
 
 
```
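`ProcessingErrorDict` plus the two new `Metadata` keys give callers a structured view of partial failures. A sketch of how downstream code might inspect them (hypothetical consumer code; `report.pdf` is a placeholder):

```python
from kreuzberg import extract_file_sync

result = extract_file_sync("report.pdf")

# Optional features that failed are now recorded rather than raised
for err in result.metadata.get("processing_errors", []):
    print(f"{err['feature']} failed: {err['error_type']}: {err['error_message']}")

# Set by the batch recovery path when full extraction failed but a basic pass succeeded
if "extraction_error" in result.metadata:
    print("recovered from:", result.metadata["extraction_error"]["error_message"])
```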
kreuzberg/cli.py
CHANGED
```diff
@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
         input_text = sys.stdin.read()
         input_bytes = input_text.encode("utf-8")
 
-
-
-
-
-
-
-
-
-
-
-
-
-        except ImportError:  # pragma: no cover
-            content_str = input_bytes.decode("utf-8", errors="ignore").lower()
-            mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
+        # Detect MIME type from content
+        content_str = input_bytes.decode("utf-8", errors="ignore").lower()
+        if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
+            mime_type = "text/html"
+        elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
+            content_str.strip().startswith("[") and content_str.strip().endswith("]")
+        ):
+            mime_type = "application/json"
+        elif content_str.strip().startswith("---") or ":" in content_str[:100]:
+            mime_type = "application/x-yaml"
+        else:
+            mime_type = "text/plain"
 
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task("Extracting text...", total=None)
+                return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
     else:
-
-
-
-
-
-
-
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task(f"Extracting text from {file.name}...", total=None)
+                return extract_file_sync(str(file), config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_file_sync(str(file), config=extraction_config)
 
 
```
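The stdin branch now sniffs a MIME type instead of relying on an optional detector import. The decision order reduces to the sketch below (a standalone restatement of the same heuristic; note the YAML check is deliberately loose: any colon in the first 100 characters qualifies):

```python
def sniff_stdin_mime(raw: bytes) -> str:
    """Standalone restatement of the CLI's new content-sniffing order."""
    text = raw.decode("utf-8", errors="ignore").lower()
    stripped = text.strip()
    if "<html" in text or "<!doctype html" in text or "<body" in text:
        return "text/html"
    if (stripped.startswith("{") and stripped.endswith("}")) or (
        stripped.startswith("[") and stripped.endswith("]")
    ):
        return "application/json"
    if stripped.startswith("---") or ":" in text[:100]:
        return "application/x-yaml"
    return "text/plain"
```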
kreuzberg/extraction.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import multiprocessing as mp
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import TYPE_CHECKING, Final, cast
```
```diff
@@ -10,6 +11,7 @@ import anyio
 from kreuzberg._chunker import get_chunker
 from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
```
```diff
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
-from kreuzberg.exceptions import ValidationError
+from kreuzberg.exceptions import KreuzbergError, ValidationError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
```
```diff
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult:
 def _validate_and_post_process_helper(
     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
 ) -> ExtractionResult:
+    if result.metadata is None:
+        result.metadata = {}
+
     if config.chunk_content:
-        result.chunks =
-
-
-
+        result.chunks = safe_feature_execution(
+            feature_name="chunking",
+            execution_func=lambda: _handle_chunk_content(
+                mime_type=result.mime_type,
+                config=config,
+                content=result.content,
+            ),
+            default_value=[],
+            result=result,
         )
 
     if config.extract_entities:
-
-
+        result.entities = safe_feature_execution(
+            feature_name="entity_extraction",
+            execution_func=lambda: extract_entities(
             result.content,
             custom_patterns=config.custom_entity_patterns,
-        )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )
 
     if config.extract_keywords:
-
-
+        result.keywords = safe_feature_execution(
+            feature_name="keyword_extraction",
+            execution_func=lambda: extract_keywords(
             result.content,
             keyword_count=config.keyword_count,
-        )
-
-        result
+            ),
+            default_value=None,
+            result=result,
+        )
 
     if config.auto_detect_language:
-        lang_config = config.language_detection_config
-        if lang_config is None:
-            from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
 
-
+        def _detect_language() -> list[str]:
+            lang_config = config.language_detection_config
+            if lang_config is None:
+                from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
+
+                lang_config = LanguageDetectionConfig(model=config.language_detection_model)
 
-
-
-
+            return detect_languages(result.content, config=lang_config) or []
+
+        result.detected_languages = safe_feature_execution(
+            feature_name="language_detection",
+            execution_func=_detect_language,
+            default_value=[],
+            result=result,
         )
 
     if config.auto_detect_document_type:
-        result =
+        result = safe_feature_execution(
+            feature_name="document_type_detection",
+            execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
+            default_value=result,
+            result=result,
+        )
 
     if config.token_reduction is not None and config.token_reduction.mode != "off":
-        original_content = result.content
 
-
-
-            language_hint = result.detected_languages[0]
+        def _apply_token_reduction() -> str:
+            original_content = result.content
 
-
-
-
-
+            language_hint = None
+            if result.detected_languages and len(result.detected_languages) > 0:
+                language_hint = result.detected_languages[0]
+
+            reduced_content = (
+                reduce_tokens(
+                    original_content,
+                    config=config.token_reduction,
+                    language=language_hint,
+                )
+                if config.token_reduction
+                else original_content
+            )
+            reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+            if result.metadata is not None:
+                result.metadata["token_reduction"] = {
+                    "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+                    "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+                    "original_characters": reduction_stats["original_characters"],
+                    "reduced_characters": reduction_stats["reduced_characters"],
+                    "original_tokens": reduction_stats["original_tokens"],
+                    "reduced_tokens": reduction_stats["reduced_tokens"],
+                }
+
+            return reduced_content
+
+        result.content = safe_feature_execution(
+            feature_name="token_reduction",
+            execution_func=_apply_token_reduction,
+            default_value=result.content,
+            result=result,
         )
-        reduction_stats = get_reduction_stats(original_content, reduced_content)
-
-        result.content = reduced_content
-        result.metadata["token_reduction"] = {
-            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
-            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
-            "original_characters": reduction_stats["original_characters"],
-            "reduced_characters": reduction_stats["reduced_characters"],
-            "original_tokens": reduction_stats["original_tokens"],
-            "reduced_tokens": reduction_stats["reduced_tokens"],
-        }
 
     return result
 
```
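`kreuzberg/_error_handling.py` is new in this release, but its body is not part of this excerpt. Judging purely from the call sites above, `safe_feature_execution` plausibly runs the feature, records a `ProcessingErrorDict` on failure, and returns the default; an inferred sketch, not the actual implementation:

```python
import traceback
from typing import Callable, TypeVar

T = TypeVar("T")

def safe_feature_execution(
    *, feature_name: str, execution_func: Callable[[], T], default_value: T, result
) -> T:
    """Inferred behavior: never let an optional feature take down the extraction."""
    try:
        return execution_func()
    except Exception as e:  # the real helper likely catches a narrower set
        if result.metadata is None:
            result.metadata = {}
        result.metadata.setdefault("processing_errors", []).append(
            {
                "feature": feature_name,
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
        )
        return default_value
```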
```diff
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(
 
     result = _validate_and_post_process_helper(result, config, file_path)
 
-    for post_processor in config.post_processing_hooks or []:
-
+    for i, post_processor in enumerate(config.post_processing_hooks or []):
+        try:
+            result = await run_maybe_sync(post_processor, result)
+        except (KreuzbergError, ValueError, RuntimeError, TypeError) as e:  # noqa: PERF203
+            if result.metadata is None:
+                result.metadata = {}
+            error_list = result.metadata.setdefault("processing_errors", [])
+            if isinstance(error_list, list):
+                error_list.append(
+                    {
+                        "feature": f"post_processing_hook_{i}",
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "traceback": traceback.format_exc(),
+                    }
+                )
 
     return result
 
```
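A failing hook no longer aborts post-processing: the error is appended to `metadata["processing_errors"]` under a `post_processing_hook_<i>` feature name and the remaining hooks still run. For example (hypothetical hook; assumes the public async `extract_file` accepts `config=`):

```python
import anyio
from kreuzberg import ExtractionConfig, extract_file

def flaky_hook(result):
    raise RuntimeError("boom")  # simulated hook failure; run_maybe_sync also accepts sync hooks

async def main() -> None:
    config = ExtractionConfig(post_processing_hooks=[flaky_hook])
    result = await extract_file("doc.txt", config=config)
    # Extraction still succeeds; the failure is recorded instead:
    print(result.metadata["processing_errors"][0]["feature"])  # post_processing_hook_0

anyio.run(main)
```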
```diff
@@ -260,22 +314,18 @@ async def batch_extract_file(
                 config,
             )
             results[index] = result
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                    index=index,
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(path),
             )
-            results[index] = error_result
+            results[index] = basic_result
 
     async with anyio.create_task_group() as tg:
         for i, path in enumerate(file_paths):
```
```diff
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
         try:
             result = await extract_bytes(content, mime_type, config)
             results[index] = result
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            results[index] = error_result
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            results[index] = basic_result
 
     async with anyio.create_task_group() as tg:
         for i, (content, mime_type) in enumerate(contents):
```
```diff
@@ -334,6 +373,125 @@ async def batch_extract_bytes(
     return results
 
 
+def _attempt_basic_extraction(
+    content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
+) -> ExtractionResult:
+    """Attempt basic extraction when full extraction fails, preserving as much as possible.
+
+    This function tries to extract at least basic text content even when advanced
+    features like OCR, entity extraction, etc. fail.
+
+    Args:
+        content: The raw content bytes (None for file extractions)
+        mime_type: The MIME type of the content (None if unknown)
+        original_error: The exception that caused the main extraction to fail
+        index: Index of this content in the batch
+        file_path: Optional file path for file-based extractions
+
+    Returns:
+        A basic ExtractionResult with whatever could be extracted
+    """
+    if (
+        isinstance(original_error, (ValueError, TypeError, ValidationError))
+        or "mock" in str(type(original_error)).lower()
+    ):
+        return ExtractionResult(
+            content=f"Error: {type(original_error).__name__}: {original_error!s}",
+            mime_type="text/plain",
+            metadata={
+                "error": f"{type(original_error).__name__}: {original_error!s}",
+                "error_context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content) if content else 0,
+                    file_path=file_path,
+                ),
+            },
+            chunks=[],
+            entities=[],
+            keywords=[],
+            detected_languages=[],
+            tables=[],
+            images=[],
+            image_ocr_results=[],
+        )
+
+    try:
+        if content is None:
+            return ExtractionResult(
+                content=f"Error: {type(original_error).__name__}: {original_error!s}",
+                mime_type="text/plain",
+                metadata={
+                    "error": f"{type(original_error).__name__}: {original_error!s}",
+                    "error_context": create_error_context(
+                        operation="batch_extract_file",
+                        error=original_error,
+                        index=index,
+                        file_path=file_path,
+                    ),
+                },
+                chunks=[],
+                entities=[],
+                keywords=[],
+                detected_languages=[],
+                tables=[],
+                images=[],
+                image_ocr_results=[],
+            )
+
+        mime_type = validate_mime_type(mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
+            basic_result = extractor.extract_bytes_sync(content)
+
+            if basic_result.metadata is None:
+                basic_result.metadata = {}
+
+            basic_result.metadata["extraction_error"] = {
+                "error_type": type(original_error).__name__,
+                "error_message": str(original_error),
+                "traceback": traceback.format_exc(),
+                "context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content),
+                    file_path=file_path,
+                ),
+                "recovery_mode": "basic_extraction",
+            }
+
+            return basic_result
+
+    except (KreuzbergError, ValueError, RuntimeError, TypeError):
+        pass
+
+    return ExtractionResult(
+        content=f"Error: {type(original_error).__name__}: {original_error!s}",
+        mime_type="text/plain",
+        metadata={
+            "error": f"{type(original_error).__name__}: {original_error!s}",
+            "error_context": create_error_context(
+                operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                error=original_error,
+                index=index,
+                mime_type=mime_type,
+                content_size=len(content) if content else 0,
+                file_path=file_path,
+            ),
+        },
+        chunks=[],
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
+
+
 def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
     """Synchronous version of extract_bytes.
 
```
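In practice the recovery path means a batch keeps its shape: a failed item comes back as a placeholder result at the same index, carrying `error` or `extraction_error` metadata, instead of raising or silently disappearing. A sketch, assuming the public batch API takes a list of `(bytes, mime_type)` pairs as the call sites below suggest:

```python
from kreuzberg import ExtractionConfig, batch_extract_bytes_sync

contents = [
    (b"not really a pdf", "application/pdf"),  # likely fails full extraction
    (b"plain text is fine", "text/plain"),
]
results = batch_extract_bytes_sync(contents, config=ExtractionConfig())

for i, r in enumerate(results):
    if "error" in r.metadata or "extraction_error" in r.metadata:
        print(f"item {i} degraded:", r.metadata.get("error", "recovered via basic extraction"))
    else:
        print(f"item {i} ok:", r.content[:40])
```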
```diff
@@ -444,21 +602,18 @@ def batch_extract_file_sync(
                 index,
                 extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
             )
-        except Exception as e:
-
-
-
-
-
-
-
-
-
-                ),
-            },
-            chunks=[],
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(
+                None,
+                None,
+                e,
+                index,
+                file_path=str(file_path),
             )
-            return (index, error_result)
+            return (index, basic_result)
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
```
```diff
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
         """Extract single content with index for ordering."""
         try:
             return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
-        except Exception as e:
-
-
-
-
-
-                "error_context": create_error_context(
-                    operation="batch_extract_bytes_sync",
-                    error=e,
-                    index=index,
-                    mime_type=mime_type,
-                    content_size=len(content),
-                ),
-            },
-            chunks=[],
-            )
-            return (index, error_result)
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            return (index, basic_result)
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {
```