kreuzberg 3.17.3__py3-none-any.whl → 3.19.0__py3-none-any.whl

@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
                 **run_config["remaining_kwargs"],
                 "language": run_config["language"],
                 "psm": run_config["psm"],
+                "tesseract_format": run_config["tesseract_format"],
+                "ext": run_config["ext"],
+                "output_format": run_config["output_format"],
+                "enable_table_detection": run_config["enable_table_detection"],
             }

         optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
     config_dict: dict[str, Any],
 ) -> dict[str, Any]:
     try:
-        with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
-            output_base = tmp_file.name.replace(".txt", "")
+        tesseract_format = config_dict.get("tesseract_format", "text")
+        ext = config_dict.get("ext", ".txt")
+        output_format = config_dict.get("output_format", "text")
+        config_dict.get("enable_table_detection", False)
+
+        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
+            output_base = tmp_file.name.replace(ext, "")

         try:
             language = config_dict.get("language", "eng")
             psm = config_dict.get("psm", 3)

+            # Convert PSM enum to integer value if needed
+            psm_value = psm.value if hasattr(psm, "value") else psm
+
             command = [
                 "tesseract",
                 image_path,
@@ -1236,13 +1248,16 @@ def _process_image_with_tesseract(
                 "-l",
                 language,
                 "--psm",
-                str(psm),
+                str(psm_value),
                 "--oem",
                 "1",
                 "--loglevel",
                 "OFF",
             ]

+            if tesseract_format != "text":
+                command.append(tesseract_format)
+
             boolean_options = [
                 "classify_use_pre_adapted_templates",
                 "language_model_ngram_on",
@@ -1275,10 +1290,17 @@ def _process_image_with_tesseract(
             if result.returncode != 0:
                 raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

-            output_file = output_base + ".txt"
+            output_file = output_base + ext
             with Path(output_file).open(encoding="utf-8") as f:
                 text = f.read()

+            # Process based on output format
+            if output_format == "markdown" and tesseract_format == "hocr":
+                # Import here to avoid circular dependency ~keep
+                from html_to_markdown import convert_to_markdown  # noqa: PLC0415
+
+                text = convert_to_markdown(text, heading_style="atx")
+
             text = normalize_spaces(text)

             return {
@@ -1289,8 +1311,8 @@ def _process_image_with_tesseract(
             }

         finally:
-            for ext in [".txt"]:
-                temp_file = output_base + ext
+            for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
+                temp_file = output_base + possible_ext
                 temp_path = Path(temp_file)
                 if temp_path.exists():
                     temp_path.unlink()
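
Net effect of the hunks above: four new run-config keys (tesseract_format, ext, output_format, enable_table_detection) are threaded through to the pooled worker, the temp file's suffix and cleanup follow the requested format, and hOCR output can be post-converted to markdown. A standalone sketch of that hOCR-to-markdown path, using the same CLI flags and the same html_to_markdown call the diff introduces (image path and language are placeholders):

```python
import subprocess
import tempfile
from pathlib import Path

from html_to_markdown import convert_to_markdown

# Reserve an output base; tesseract's "hocr" config writes <base>.hocr.
with tempfile.NamedTemporaryFile(suffix=".hocr", delete=False) as tmp:
    output_base = tmp.name.replace(".hocr", "")

subprocess.run(
    ["tesseract", "page.png", output_base, "-l", "eng", "--psm", "3", "--oem", "1", "--loglevel", "OFF", "hocr"],
    capture_output=True,
    text=True,
    check=True,  # the diff checks returncode manually instead
)

hocr_html = Path(output_base + ".hocr").read_text(encoding="utf-8")
markdown = convert_to_markdown(hocr_html, heading_style="atx")  # same call as the diff
```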
kreuzberg/_types.py CHANGED
@@ -32,6 +32,7 @@ if TYPE_CHECKING:

 OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
 OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
+ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]


 class ConfigDict:
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
         return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None


+class ProcessingErrorDict(TypedDict):
+    feature: str
+    """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
+    error_type: str
+    """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
+    error_message: str
+    """Human-readable error message."""
+    traceback: str
+    """Full Python traceback for debugging."""
+
+
 class BoundingBox(TypedDict):
     left: int
     """X coordinate of the left edge."""
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
     """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
     token_reduction: NotRequired[dict[str, float]]
     """Token reduction statistics including reduction ratios and counts."""
+    processing_errors: NotRequired[list[ProcessingErrorDict]]
+    """List of processing errors that occurred during extraction."""
+    extraction_error: NotRequired[dict[str, Any]]
+    """Error information for critical extraction failures."""


 _VALID_METADATA_KEYS = {
@@ -756,6 +772,8 @@ _VALID_METADATA_KEYS = {
     "message",
     "attributes",
     "token_reduction",
+    "processing_errors",
+    "extraction_error",
 }

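ProcessingErrorDict and the two new Metadata keys give partial failures a typed home: optional features record their errors under processing_errors instead of raising. A usage sketch (assumes extract_file_sync is re-exported at the package top level; "report.pdf" is a placeholder path):

```python
from kreuzberg import extract_file_sync

result = extract_file_sync("report.pdf")

# Each entry has the ProcessingErrorDict shape declared above.
for err in result.metadata.get("processing_errors", []):
    print(f"{err['feature']} failed: {err['error_type']}: {err['error_message']}")
```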
kreuzberg/cli.py CHANGED
@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
         input_text = sys.stdin.read()
         input_bytes = input_text.encode("utf-8")

-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-            transient=True,
-        ) as progress:
-            progress.add_task("Extracting text...", total=None)
-
-            try:
-                import magic  # type: ignore[import-not-found] # noqa: PLC0415
-
-                mime_type = magic.from_buffer(input_bytes, mime=True)
-            except ImportError:  # pragma: no cover
-                content_str = input_bytes.decode("utf-8", errors="ignore").lower()
-                mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
+        # Detect MIME type from content
+        content_str = input_bytes.decode("utf-8", errors="ignore").lower()
+        if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
+            mime_type = "text/html"
+        elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
+            content_str.strip().startswith("[") and content_str.strip().endswith("]")
+        ):
+            mime_type = "application/json"
+        elif content_str.strip().startswith("---") or ":" in content_str[:100]:
+            mime_type = "application/x-yaml"
+        else:
+            mime_type = "text/plain"

+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task("Extracting text...", total=None)
+                return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
     else:
-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            console=console,
-            transient=True,
-        ) as progress:
-            progress.add_task(f"Extracting text from {file.name}...", total=None)
+        # Use progress display if possible, fallback to simple extraction on Windows issues
+        try:
+            with Progress(
+                SpinnerColumn(),
+                TextColumn("[progress.description]{task.description}"),
+                console=console,
+                transient=True,
+            ) as progress:
+                progress.add_task(f"Extracting text from {file.name}...", total=None)
+                return extract_file_sync(str(file), config=extraction_config)
+        except (OSError, RuntimeError):  # pragma: no cover
+            # Fallback for Windows console issues
             return extract_file_sync(str(file), config=extraction_config)

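The stdin branch drops the optional python-magic dependency in favor of a pure-Python heuristic over the decoded bytes. The same decision tree as a standalone function (sniff_stdin_mime is a hypothetical name, for illustration only):

```python
def sniff_stdin_mime(data: bytes) -> str:
    text = data.decode("utf-8", errors="ignore").lower()
    stripped = text.strip()
    if "<html" in text or "<!doctype html" in text or "<body" in text:
        return "text/html"
    if (stripped.startswith("{") and stripped.endswith("}")) or (
        stripped.startswith("[") and stripped.endswith("]")
    ):
        return "application/json"
    if stripped.startswith("---") or ":" in text[:100]:
        return "application/x-yaml"
    return "text/plain"


assert sniff_stdin_mime(b'{"a": 1}') == "application/json"
assert sniff_stdin_mime(b"<!DOCTYPE html><html></html>") == "text/html"
assert sniff_stdin_mime(b"plain words, no structure") == "text/plain"
```

Note that the YAML branch is permissive: any colon in the first 100 characters routes otherwise-plain prose to application/x-yaml, which is why it only runs after the HTML and JSON checks.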
kreuzberg/extraction.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import multiprocessing as mp
+import traceback
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import TYPE_CHECKING, Final, cast
@@ -10,6 +11,7 @@ import anyio
 from kreuzberg._chunker import get_chunker
 from kreuzberg._document_classification import auto_detect_document_type
 from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
 from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
-from kreuzberg.exceptions import ValidationError
+from kreuzberg.exceptions import KreuzbergError, ValidationError

 if TYPE_CHECKING:
     from collections.abc import Sequence
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
 def _validate_and_post_process_helper(
     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
 ) -> ExtractionResult:
+    if result.metadata is None:
+        result.metadata = {}
+
     if config.chunk_content:
-        result.chunks = _handle_chunk_content(
-            mime_type=result.mime_type,
-            config=config,
-            content=result.content,
+        result.chunks = safe_feature_execution(
+            feature_name="chunking",
+            execution_func=lambda: _handle_chunk_content(
+                mime_type=result.mime_type,
+                config=config,
+                content=result.content,
+            ),
+            default_value=[],
+            result=result,
         )

     if config.extract_entities:
-        try:
-            result.entities = extract_entities(
+        result.entities = safe_feature_execution(
+            feature_name="entity_extraction",
+            execution_func=lambda: extract_entities(
                 result.content,
                 custom_patterns=config.custom_entity_patterns,
-            )
-        except RuntimeError:
-            result.entities = None
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.extract_keywords:
-        try:
-            result.keywords = extract_keywords(
+        result.keywords = safe_feature_execution(
+            feature_name="keyword_extraction",
+            execution_func=lambda: extract_keywords(
                 result.content,
                 keyword_count=config.keyword_count,
-            )
-        except RuntimeError:
-            result.keywords = None
+            ),
+            default_value=None,
+            result=result,
+        )

     if config.auto_detect_language:
-        lang_config = config.language_detection_config
-        if lang_config is None:
-            from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415

-            lang_config = LanguageDetectionConfig(model=config.language_detection_model)
+        def _detect_language() -> list[str]:
+            lang_config = config.language_detection_config
+            if lang_config is None:
+                from kreuzberg._types import LanguageDetectionConfig  # noqa: PLC0415
+
+                lang_config = LanguageDetectionConfig(model=config.language_detection_model)

-        result.detected_languages = detect_languages(
-            result.content,
-            config=lang_config,
+            return detect_languages(result.content, config=lang_config) or []
+
+        result.detected_languages = safe_feature_execution(
+            feature_name="language_detection",
+            execution_func=_detect_language,
+            default_value=[],
+            result=result,
         )

     if config.auto_detect_document_type:
-        result = auto_detect_document_type(result, config, file_path=file_path)
+        result = safe_feature_execution(
+            feature_name="document_type_detection",
+            execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
+            default_value=result,
+            result=result,
+        )

     if config.token_reduction is not None and config.token_reduction.mode != "off":
-        original_content = result.content

-        language_hint = None
-        if result.detected_languages and len(result.detected_languages) > 0:
-            language_hint = result.detected_languages[0]
+        def _apply_token_reduction() -> str:
+            original_content = result.content

-        reduced_content = reduce_tokens(
-            original_content,
-            config=config.token_reduction,
-            language=language_hint,
+            language_hint = None
+            if result.detected_languages and len(result.detected_languages) > 0:
+                language_hint = result.detected_languages[0]
+
+            reduced_content = (
+                reduce_tokens(
+                    original_content,
+                    config=config.token_reduction,
+                    language=language_hint,
+                )
+                if config.token_reduction
+                else original_content
+            )
+            reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+            if result.metadata is not None:
+                result.metadata["token_reduction"] = {
+                    "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+                    "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+                    "original_characters": reduction_stats["original_characters"],
+                    "reduced_characters": reduction_stats["reduced_characters"],
+                    "original_tokens": reduction_stats["original_tokens"],
+                    "reduced_tokens": reduction_stats["reduced_tokens"],
+                }
+
+            return reduced_content
+
+        result.content = safe_feature_execution(
+            feature_name="token_reduction",
+            execution_func=_apply_token_reduction,
+            default_value=result.content,
+            result=result,
         )
-        reduction_stats = get_reduction_stats(original_content, reduced_content)
-
-        result.content = reduced_content
-        result.metadata["token_reduction"] = {
-            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
-            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
-            "original_characters": reduction_stats["original_characters"],
-            "reduced_characters": reduction_stats["reduced_characters"],
-            "original_tokens": reduction_stats["original_tokens"],
-            "reduced_tokens": reduction_stats["reduced_tokens"],
-        }

     return result

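safe_feature_execution lives in the new kreuzberg._error_handling module, whose source is not part of this diff. Inferring from the call sites above and the ProcessingErrorDict shape, it plausibly looks like the following sketch (signature reconstructed from usage; the real body may differ):

```python
import traceback
from typing import Any, Callable, TypeVar

T = TypeVar("T")


def safe_feature_execution(
    *,
    feature_name: str,
    execution_func: Callable[[], T],
    default_value: T,
    result: Any,  # an ExtractionResult; typed loosely to keep the sketch self-contained
) -> T:
    """Run an optional feature; on failure, record a ProcessingErrorDict and return the default."""
    try:
        return execution_func()
    except Exception as e:
        if result.metadata is None:
            result.metadata = {}
        result.metadata.setdefault("processing_errors", []).append(
            {
                "feature": feature_name,
                "error_type": type(e).__name__,
                "error_message": str(e),
                "traceback": traceback.format_exc(),
            }
        )
        return default_value
```

Whatever the exact body, the effect on the helper is clear: chunking, entity extraction, keyword extraction, language detection, document-type detection, and token reduction now degrade to a default value and record into processing_errors instead of aborting the extraction.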
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(

     result = _validate_and_post_process_helper(result, config, file_path)

-    for post_processor in config.post_processing_hooks or []:
-        result = await run_maybe_sync(post_processor, result)
+    for i, post_processor in enumerate(config.post_processing_hooks or []):
+        try:
+            result = await run_maybe_sync(post_processor, result)
+        except (KreuzbergError, ValueError, RuntimeError, TypeError) as e:  # noqa: PERF203
+            if result.metadata is None:
+                result.metadata = {}
+            error_list = result.metadata.setdefault("processing_errors", [])
+            if isinstance(error_list, list):
+                error_list.append(
+                    {
+                        "feature": f"post_processing_hook_{i}",
+                        "error_type": type(e).__name__,
+                        "error_message": str(e),
+                        "traceback": traceback.format_exc(),
+                    }
+                )

     return result

@@ -260,22 +314,18 @@ async def batch_extract_file(
                     config,
                 )
                 results[index] = result
-            except Exception as e:  # noqa: BLE001
-                error_result = ExtractionResult(
-                    content=f"Error: {type(e).__name__}: {e!s}",
-                    mime_type="text/plain",
-                    metadata={
-                        "error": f"{type(e).__name__}: {e!s}",
-                        "error_context": create_error_context(
-                            operation="batch_extract_file",
-                            file_path=str(path),
-                            error=e,
-                            index=index,
-                        ),
-                    },
-                    chunks=[],
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(
+                    None,
+                    None,
+                    e,
+                    index,
+                    file_path=str(path),
                 )
-                results[index] = error_result
+                results[index] = basic_result

     async with anyio.create_task_group() as tg:
         for i, path in enumerate(file_paths):
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
         try:
             result = await extract_bytes(content, mime_type, config)
             results[index] = result
-        except Exception as e:  # noqa: BLE001
-            error_result = ExtractionResult(
-                content=f"Error: {type(e).__name__}: {e!s}",
-                mime_type="text/plain",
-                metadata={
-                    "error": f"{type(e).__name__}: {e!s}",
-                    "error_context": create_error_context(
-                        operation="batch_extract_bytes",
-                        error=e,
-                        index=index,
-                        mime_type=mime_type,
-                        content_size=len(content),
-                    ),
-                },
-                chunks=[],
-            )
-            results[index] = error_result
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            results[index] = basic_result

     async with anyio.create_task_group() as tg:
         for i, (content, mime_type) in enumerate(contents):
@@ -334,6 +373,125 @@
     return results


+def _attempt_basic_extraction(
+    content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
+) -> ExtractionResult:
+    """Attempt basic extraction when full extraction fails, preserving as much as possible.
+
+    This function tries to extract at least basic text content even when advanced
+    features like OCR, entity extraction, etc. fail.
+
+    Args:
+        content: The raw content bytes (None for file extractions)
+        mime_type: The MIME type of the content (None if unknown)
+        original_error: The exception that caused the main extraction to fail
+        index: Index of this content in the batch
+        file_path: Optional file path for file-based extractions
+
+    Returns:
+        A basic ExtractionResult with whatever could be extracted
+    """
+    if (
+        isinstance(original_error, (ValueError, TypeError, ValidationError))
+        or "mock" in str(type(original_error)).lower()
+    ):
+        return ExtractionResult(
+            content=f"Error: {type(original_error).__name__}: {original_error!s}",
+            mime_type="text/plain",
+            metadata={
+                "error": f"{type(original_error).__name__}: {original_error!s}",
+                "error_context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content) if content else 0,
+                    file_path=file_path,
+                ),
+            },
+            chunks=[],
+            entities=[],
+            keywords=[],
+            detected_languages=[],
+            tables=[],
+            images=[],
+            image_ocr_results=[],
+        )
+
+    try:
+        if content is None:
+            return ExtractionResult(
+                content=f"Error: {type(original_error).__name__}: {original_error!s}",
+                mime_type="text/plain",
+                metadata={
+                    "error": f"{type(original_error).__name__}: {original_error!s}",
+                    "error_context": create_error_context(
+                        operation="batch_extract_file",
+                        error=original_error,
+                        index=index,
+                        file_path=file_path,
+                    ),
+                },
+                chunks=[],
+                entities=[],
+                keywords=[],
+                detected_languages=[],
+                tables=[],
+                images=[],
+                image_ocr_results=[],
+            )
+
+        mime_type = validate_mime_type(mime_type=mime_type)
+        if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
+            basic_result = extractor.extract_bytes_sync(content)
+
+            if basic_result.metadata is None:
+                basic_result.metadata = {}
+
+            basic_result.metadata["extraction_error"] = {
+                "error_type": type(original_error).__name__,
+                "error_message": str(original_error),
+                "traceback": traceback.format_exc(),
+                "context": create_error_context(
+                    operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                    error=original_error,
+                    index=index,
+                    mime_type=mime_type,
+                    content_size=len(content),
+                    file_path=file_path,
+                ),
+                "recovery_mode": "basic_extraction",
+            }
+
+            return basic_result
+
+    except (KreuzbergError, ValueError, RuntimeError, TypeError):
+        pass
+
+    return ExtractionResult(
+        content=f"Error: {type(original_error).__name__}: {original_error!s}",
+        mime_type="text/plain",
+        metadata={
+            "error": f"{type(original_error).__name__}: {original_error!s}",
+            "error_context": create_error_context(
+                operation="batch_extract_file" if file_path else "batch_extract_bytes",
+                error=original_error,
+                index=index,
+                mime_type=mime_type,
+                content_size=len(content) if content else 0,
+                file_path=file_path,
+            ),
+        },
+        chunks=[],
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
+
+
 def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
     """Synchronous version of extract_bytes.

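should_exception_bubble_up is also imported from kreuzberg._error_handling and not shown in this diff. Its second argument lines up with the ErrorContextType literal added to _types.py, suggesting a policy function roughly like this (a guess based on usage, not the shipped code):

```python
from typing import Literal

ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]


def should_exception_bubble_up(error: Exception, context: ErrorContextType) -> bool:
    # Interpreter-level and resource-exhaustion failures should never be swallowed.
    if isinstance(error, (MemoryError, SystemError)):
        return True
    # Batch processing converts per-item failures into degraded results;
    # other contexts let the exception propagate to the caller.
    return context != "batch_processing"
```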
@@ -444,21 +602,18 @@ def batch_extract_file_sync(
                     index,
                     extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
                 )
-            except Exception as e:  # noqa: BLE001
-                error_result = ExtractionResult(
-                    content=f"Error: {type(e).__name__}: {e!s}",
-                    mime_type="text/plain",
-                    metadata={
-                        "error": f"{type(e).__name__}: {e!s}",
-                        "error_context": create_error_context(
-                            operation="batch_extract_file_sync",
-                            file_path=str(file_path),
-                            error=e,
-                        ),
-                    },
-                    chunks=[],
+            except Exception as e:
+                if should_exception_bubble_up(e, "batch_processing"):
+                    raise
+
+                basic_result = _attempt_basic_extraction(
+                    None,
+                    None,
+                    e,
+                    index,
+                    file_path=str(file_path),
                 )
-                return (index, error_result)
+                return (index, basic_result)

     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
         """Extract single content with index for ordering."""
         try:
             return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
-        except Exception as e:  # noqa: BLE001
-            error_result = ExtractionResult(
-                content=f"Error: {type(e).__name__}: {e!s}",
-                mime_type="text/plain",
-                metadata={
-                    "error": f"{type(e).__name__}: {e!s}",
-                    "error_context": create_error_context(
-                        operation="batch_extract_bytes_sync",
-                        error=e,
-                        index=index,
-                        mime_type=mime_type,
-                        content_size=len(content),
-                    ),
-                },
-                chunks=[],
-            )
-            return (index, error_result)
+        except Exception as e:
+            if should_exception_bubble_up(e, "batch_processing"):
+                raise
+
+            basic_result = _attempt_basic_extraction(content, mime_type, e, index)
+            return (index, basic_result)

     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {
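
With all four batch entry points rewritten, a failing item is first retried through _attempt_basic_extraction, so callers can receive real (if degraded) content with an extraction_error metadata entry, and only fall back to the old error-string placeholder when even basic extraction fails. An illustrative check (file names are placeholders; batch_extract_file_sync assumed re-exported at the package top level):

```python
from kreuzberg import batch_extract_file_sync

results = batch_extract_file_sync(["good.pdf", "corrupt.pdf"])
for result in results:
    if "extraction_error" in result.metadata:
        info = result.metadata["extraction_error"]
        print(f"recovered via {info['recovery_mode']}: {info['error_type']}")
    elif "error" in result.metadata:
        print("basic extraction also failed:", result.metadata["error"])
```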