kreuzberg 3.18.0__py3-none-any.whl → 3.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -110,10 +110,9 @@ def _get_max_upload_size() -> int:
110
110
  Environment Variables:
111
111
  KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
112
  """
113
- default_size = 1024 * 1024 * 1024 # 1GB
113
+ default_size = 1024 * 1024 * 1024
114
114
  try:
115
115
  size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
- # Return default if negative
117
116
  return size if size >= 0 else default_size
118
117
  except ValueError:
119
118
  return default_size
@@ -311,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
311
310
  """
312
311
  static_config = discover_config_cached()
313
312
 
313
+ if not data:
314
+ raise ValidationError("No files provided for extraction", context={"file_count": 0})
315
+
314
316
  min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
315
317
  max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
316
318
 
@@ -144,10 +144,9 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
144
144
  try:
145
145
  nlp = spacy.load(model_name)
146
146
  except OSError:
147
- # Try to download the model automatically
147
+
148
148
  async def install_model() -> tuple[bool, str | None]:
149
149
  """Install model and return success status and error message."""
150
- # First try spaCy's built-in download
151
150
  try:
152
151
  success = await install_spacy_model_with_spacy(model_name)
153
152
  if success:
@@ -157,7 +156,6 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
157
156
  else:
158
157
  spacy_error = "spaCy download failed"
159
158
 
160
- # If spaCy download failed and uv is available, try uv as fallback
161
159
  if is_uv_available():
162
160
  try:
163
161
  result = await install_spacy_model_with_uv(model_name)
@@ -167,14 +165,12 @@ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig)
167
165
 
168
166
  return False, spacy_error
169
167
 
170
- # Run the async installation in a sync context
171
168
  try:
172
169
  success, error_details = anyio.run(install_model)
173
- except (OSError, RuntimeError) as e:
174
- success, error_details = False, str(e)
170
+ except SystemExit as e:
171
+ success, error_details = False, f"spaCy CLI exit code: {e.code}"
175
172
 
176
173
  if not success:
177
- # Generate appropriate error message based on available tools
178
174
  if is_uv_available():
179
175
  model_url = get_spacy_model_url(model_name)
180
176
  manual_install_cmd = f"uv pip install {model_url}"
@@ -234,7 +230,7 @@ def extract_keywords(
234
230
  kw_model = KeyBERT()
235
231
  keywords = kw_model.extract_keywords(text, top_n=keyword_count)
236
232
  return [(kw, float(score)) for kw, score in keywords]
237
- except (RuntimeError, OSError, ValueError):
233
+ except ValueError:
238
234
  return []
239
235
  except ImportError as e: # pragma: no cover
240
236
  raise MissingDependencyError.create_for_package(
@@ -0,0 +1,182 @@
1
+ """Type-safe error handling utilities for extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+
11
+ from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
13
+
14
+
15
+ def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
16
+ """Determine if an exception should bubble up or be handled gracefully.
17
+
18
+ Args:
19
+ exception: The exception to classify
20
+ context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
21
+
22
+ Returns:
23
+ True if the exception should bubble up, False if it should be handled gracefully
24
+ """
25
+ if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
26
+ return True
27
+
28
+ if isinstance(exception, MissingDependencyError):
29
+ return True
30
+
31
+ if isinstance(exception, ValidationError):
32
+ if context == "batch_processing":
33
+ return False
34
+
35
+ return context != "optional_feature"
36
+
37
+ if isinstance(exception, KreuzbergError) and context == "optional_feature":
38
+ return False
39
+
40
+ if context == "batch_processing":
41
+ return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
42
+
43
+ return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
44
+
45
+
46
+ class FeatureProcessingError:
47
+ """Type-safe processing error for extraction features."""
48
+
49
+ def __init__(self, feature: str, error: Exception) -> None:
50
+ self._feature = feature
51
+ self._error = error
52
+ self._traceback = traceback.format_exc()
53
+
54
+ @property
55
+ def feature(self) -> str:
56
+ return self._feature
57
+
58
+ @property
59
+ def error_type(self) -> str:
60
+ return type(self._error).__name__
61
+
62
+ @property
63
+ def error_message(self) -> str:
64
+ return str(self._error)
65
+
66
+ @property
67
+ def traceback(self) -> str:
68
+ return self._traceback
69
+
70
+ def to_dict(self) -> ProcessingErrorDict:
71
+ return {
72
+ "feature": self.feature,
73
+ "error_type": self.error_type,
74
+ "error_message": self.error_message,
75
+ "traceback": self.traceback,
76
+ }
77
+
78
+
79
+ def safe_feature_execution(
80
+ feature_name: str,
81
+ execution_func: Callable[[], Any],
82
+ default_value: Any,
83
+ result: ExtractionResult,
84
+ context: ErrorContextType = "optional_feature",
85
+ ) -> Any:
86
+ """Safely execute a feature extraction function with proper error handling.
87
+
88
+ Args:
89
+ feature_name: Name of the feature being executed
90
+ execution_func: Function to execute that may raise exceptions
91
+ default_value: Default value to return if execution fails
92
+ result: ExtractionResult to update with error information
93
+ context: The context for exception handling decisions
94
+
95
+ Returns:
96
+ Either the successful result or the default value
97
+ """
98
+ try:
99
+ return execution_func()
100
+ except Exception as e:
101
+ if should_exception_bubble_up(e, context):
102
+ raise
103
+
104
+ _add_processing_error(result, FeatureProcessingError(feature_name, e))
105
+ return default_value
106
+
107
+
108
+ def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
109
+ """Add a processing error to the result metadata in a type-safe way."""
110
+ if result.metadata is None:
111
+ result.metadata = {}
112
+
113
+ if "processing_errors" not in result.metadata:
114
+ result.metadata["processing_errors"] = []
115
+
116
+ errors_list = result.metadata["processing_errors"]
117
+ if isinstance(errors_list, list):
118
+ errors_list.append(error.to_dict())
119
+ else:
120
+ result.metadata["processing_errors"] = [error.to_dict()]
121
+
122
+
123
+ def preserve_result_with_errors(
124
+ result: ExtractionResult,
125
+ errors: list[FeatureProcessingError],
126
+ ) -> ExtractionResult:
127
+ """Preserve a successful extraction result while adding error information.
128
+
129
+ This is used when core extraction succeeds but optional features fail.
130
+
131
+ Args:
132
+ result: The successful extraction result
133
+ errors: List of errors that occurred during optional processing
134
+
135
+ Returns:
136
+ The result with error information added to metadata
137
+ """
138
+ for error in errors:
139
+ _add_processing_error(result, error)
140
+
141
+ return result
142
+
143
+
144
+ def create_error_result(
145
+ content: str,
146
+ mime_type: str,
147
+ errors: list[FeatureProcessingError],
148
+ **metadata_kwargs: Any,
149
+ ) -> ExtractionResult:
150
+ """Create an error result with proper type safety.
151
+
152
+ Args:
153
+ content: Error content to include
154
+ mime_type: MIME type of the result
155
+ errors: List of errors that occurred
156
+ **metadata_kwargs: Additional metadata to include
157
+
158
+ Returns:
159
+ An ExtractionResult with error information
160
+ """
161
+ metadata: Metadata = {
162
+ "error": f"Multiple processing errors occurred: {len(errors)} errors",
163
+ "error_context": {
164
+ "error_count": len(errors),
165
+ "errors": [error.to_dict() for error in errors],
166
+ **metadata_kwargs,
167
+ },
168
+ "processing_errors": [error.to_dict() for error in errors],
169
+ }
170
+
171
+ return ExtractionResult(
172
+ content=content,
173
+ chunks=[],
174
+ mime_type=mime_type,
175
+ metadata=metadata,
176
+ entities=[],
177
+ keywords=[],
178
+ detected_languages=[],
179
+ tables=[],
180
+ images=[],
181
+ image_ocr_results=[],
182
+ )
@@ -230,13 +230,13 @@ class Extractor(ABC):
230
230
  confidence_score=None,
231
231
  processing_time=duration,
232
232
  )
233
- except (OSError, ValueError) as e: # pragma: no cover
233
+ except ValueError as e: # pragma: no cover
234
234
  return ImageOCRResult(
235
235
  image=target,
236
236
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
237
237
  skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
238
238
  )
239
- except (RuntimeError, TypeError) as e: # pragma: no cover
239
+ except TypeError as e: # pragma: no cover
240
240
  return ImageOCRResult(
241
241
  image=target,
242
242
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
@@ -75,7 +75,7 @@ class HTMLExtractor(Extractor):
75
75
  soup = BeautifulSoup(html_content, "xml")
76
76
 
77
77
  for img in soup.find_all("img"):
78
- src_val = img.get("src") # type: ignore[union-attr]
78
+ src_val = img.get("src")
79
79
  if isinstance(src_val, str) and src_val.startswith("data:image/"):
80
80
  try:
81
81
  header, data = src_val.split(",", 1)
@@ -105,7 +105,7 @@ class HTMLExtractor(Extractor):
105
105
  except (OSError, ValueError) as e: # pragma: no cover
106
106
  logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
107
107
 
108
- alt_val = img.get("alt") # type: ignore[union-attr]
108
+ alt_val = img.get("alt")
109
109
  desc = alt_val if isinstance(alt_val, str) else None
110
110
  images.append(
111
111
  ExtractedImage(
@@ -6,7 +6,6 @@ import logging
6
6
  import os
7
7
  import tempfile
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from dataclasses import asdict
10
9
  from itertools import count
11
10
  from multiprocessing import cpu_count
12
11
  from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
27
26
  from kreuzberg._ocr import get_ocr_backend
28
27
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
29
28
  from kreuzberg._types import (
30
- EasyOCRConfig,
31
29
  ExtractedImage,
32
30
  ExtractionResult,
33
31
  ImageOCRResult,
34
32
  Metadata,
35
33
  OcrBackendType,
36
- PaddleOCRConfig,
37
- TesseractConfig,
38
34
  )
39
35
  from kreuzberg._utils._errors import create_error_context, should_retry
40
36
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
134
130
  def extract_path_sync(self, path: Path) -> ExtractionResult:
135
131
  content_bytes = path.read_bytes()
136
132
 
133
+ result: ExtractionResult | None = None
134
+
137
135
  document: Document | None = None
138
136
  if self.config.extract_images or self.config.extract_tables:
139
137
  document = self._parse_with_password_attempts(content_bytes)
140
138
 
141
- try:
142
- text = self._extract_pdf_searchable_text_sync(path)
143
- except ParsingError:
144
- text = ""
139
+ if not self.config.force_ocr:
140
+ try:
141
+ content = self._extract_pdf_searchable_text_sync(path)
142
+ if self._validate_extracted_text(content):
143
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
144
+ except ParsingError:
145
+ pass
145
146
 
146
- if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
147
- text = self._extract_pdf_with_ocr_sync(path)
147
+ if not result and self.config.ocr_backend is not None:
148
+ result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
149
+
150
+ if not result:
151
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
152
+
153
+ metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
154
+ result.metadata = metadata
148
155
 
149
- tables = []
150
156
  if self.config.extract_tables:
151
157
  # GMFT is optional dependency ~keep
152
158
  try:
153
159
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
154
160
 
155
161
  tables = extract_tables_sync(path)
162
+ result.tables = tables
156
163
  except ImportError: # pragma: no cover
157
- tables = []
158
-
159
- if not self.config.force_ocr and self._validate_extracted_text(text):
160
- text = self._extract_with_playa_sync(path, fallback_text=text)
161
-
162
- text = normalize_spaces(text)
163
-
164
- result = ExtractionResult(
165
- content=text,
166
- mime_type=PLAIN_TEXT_MIME_TYPE,
167
- metadata={},
168
- tables=list(tables),
169
- )
164
+ result.tables = []
170
165
 
171
- if tables:
172
- table_summary = generate_table_summary(tables)
173
- result.metadata = result.metadata | {
174
- "table_count": table_summary["table_count"],
175
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
176
- f"across {table_summary['pages_with_tables']} pages with "
177
- f"{table_summary['total_rows']} total rows",
178
- }
166
+ if result.tables:
167
+ table_summary = generate_table_summary(result.tables)
168
+ result.metadata = result.metadata | {
169
+ "table_count": table_summary["table_count"],
170
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
171
+ f"across {table_summary['pages_with_tables']} pages with "
172
+ f"{table_summary['total_rows']} total rows",
173
+ }
179
174
 
180
175
  if self.config.extract_images and document:
181
176
  images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
405
400
  except Exception as e:
406
401
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
407
402
 
408
- def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
403
+ def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
409
404
  temp_files: list[Path] = []
410
405
  try:
411
406
  with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
443
438
  with pdf_resources_sync(bitmap, page):
444
439
  pil_image.close()
445
440
 
446
- return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
441
+ content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
442
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
447
443
 
448
444
  except Exception as e:
449
445
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
452
448
  with contextlib.suppress(OSError):
453
449
  p.unlink()
454
450
 
455
- def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
456
- backend = get_ocr_backend(self.config.ocr_backend)
451
+ def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
452
+ backend = get_ocr_backend(ocr_backend)
457
453
  paths = [Path(p) for p in image_paths]
458
454
 
459
- match self.config.ocr_backend:
460
- case "tesseract":
461
- config = (
462
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
463
- )
464
- results = backend.process_batch_sync(paths, **asdict(config))
465
- case "paddleocr":
466
- paddle_config = (
467
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
468
- )
469
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
470
- case "easyocr":
471
- easy_config = (
472
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
473
- )
474
- results = backend.process_batch_sync(paths, **asdict(easy_config))
475
- case _:
476
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
455
+ results = backend.process_batch_sync(paths, **self.config.get_config_dict())
477
456
 
478
457
  return "\n\n".join(result.content for result in results)
479
458
 
@@ -14,7 +14,7 @@ else: # pragma: no cover
14
14
  try:
15
15
  import yaml
16
16
  except ImportError: # pragma: no cover
17
- yaml = None
17
+ yaml = None # type: ignore[assignment]
18
18
 
19
19
 
20
20
  from anyio import Path as AsyncPath
@@ -31,5 +31,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
31
31
  langs = [result["lang"].lower() for result in results if result.get("lang")]
32
32
  return langs if langs else None
33
33
  return None
34
+ except (RuntimeError, OSError, MemoryError):
35
+ raise
34
36
  except Exception: # noqa: BLE001
35
37
  return None
@@ -1113,6 +1113,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1113
1113
  **run_config["remaining_kwargs"],
1114
1114
  "language": run_config["language"],
1115
1115
  "psm": run_config["psm"],
1116
+ "tesseract_format": run_config["tesseract_format"],
1117
+ "ext": run_config["ext"],
1118
+ "output_format": run_config["output_format"],
1119
+ "enable_table_detection": run_config["enable_table_detection"],
1116
1120
  }
1117
1121
 
1118
1122
  optimal_workers = get_optimal_worker_count(len(paths), cpu_intensive=True)
@@ -1222,13 +1226,21 @@ def _process_image_with_tesseract(
1222
1226
  config_dict: dict[str, Any],
1223
1227
  ) -> dict[str, Any]:
1224
1228
  try:
1225
- with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
1226
- output_base = tmp_file.name.replace(".txt", "")
1229
+ tesseract_format = config_dict.get("tesseract_format", "text")
1230
+ ext = config_dict.get("ext", ".txt")
1231
+ output_format = config_dict.get("output_format", "text")
1232
+ config_dict.get("enable_table_detection", False)
1233
+
1234
+ with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp_file:
1235
+ output_base = tmp_file.name.replace(ext, "")
1227
1236
 
1228
1237
  try:
1229
1238
  language = config_dict.get("language", "eng")
1230
1239
  psm = config_dict.get("psm", 3)
1231
1240
 
1241
+ # Convert PSM enum to integer value if needed
1242
+ psm_value = psm.value if hasattr(psm, "value") else psm
1243
+
1232
1244
  command = [
1233
1245
  "tesseract",
1234
1246
  image_path,
@@ -1236,13 +1248,16 @@ def _process_image_with_tesseract(
1236
1248
  "-l",
1237
1249
  language,
1238
1250
  "--psm",
1239
- str(psm),
1251
+ str(psm_value),
1240
1252
  "--oem",
1241
1253
  "1",
1242
1254
  "--loglevel",
1243
1255
  "OFF",
1244
1256
  ]
1245
1257
 
1258
+ if tesseract_format != "text":
1259
+ command.append(tesseract_format)
1260
+
1246
1261
  boolean_options = [
1247
1262
  "classify_use_pre_adapted_templates",
1248
1263
  "language_model_ngram_on",
@@ -1275,10 +1290,17 @@ def _process_image_with_tesseract(
1275
1290
  if result.returncode != 0:
1276
1291
  raise Exception(f"Tesseract failed with return code {result.returncode}: {result.stderr}")
1277
1292
 
1278
- output_file = output_base + ".txt"
1293
+ output_file = output_base + ext
1279
1294
  with Path(output_file).open(encoding="utf-8") as f:
1280
1295
  text = f.read()
1281
1296
 
1297
+ # Process based on output format
1298
+ if output_format == "markdown" and tesseract_format == "hocr":
1299
+ # Import here to avoid circular dependency ~keep
1300
+ from html_to_markdown import convert_to_markdown # noqa: PLC0415
1301
+
1302
+ text = convert_to_markdown(text, heading_style="atx")
1303
+
1282
1304
  text = normalize_spaces(text)
1283
1305
 
1284
1306
  return {
@@ -1289,8 +1311,8 @@ def _process_image_with_tesseract(
1289
1311
  }
1290
1312
 
1291
1313
  finally:
1292
- for ext in [".txt"]:
1293
- temp_file = output_base + ext
1314
+ for possible_ext in [ext, ".txt", ".hocr", ".tsv"]:
1315
+ temp_file = output_base + possible_ext
1294
1316
  temp_path = Path(temp_file)
1295
1317
  if temp_path.exists():
1296
1318
  temp_path.unlink()
kreuzberg/_types.py CHANGED
@@ -32,6 +32,7 @@ if TYPE_CHECKING:
32
32
 
33
33
  OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
34
34
  OutputFormatType = Literal["text", "tsv", "hocr", "markdown"]
35
+ ErrorContextType = Literal["batch_processing", "optional_feature", "single_extraction", "unknown"]
35
36
 
36
37
 
37
38
  class ConfigDict:
@@ -503,6 +504,17 @@ class SpacyEntityExtractionConfig(ConfigDict):
503
504
  return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
504
505
 
505
506
 
507
+ class ProcessingErrorDict(TypedDict):
508
+ feature: str
509
+ """Name of the feature that failed (e.g., 'chunking', 'entity_extraction', 'keyword_extraction')."""
510
+ error_type: str
511
+ """Type of the exception that occurred (e.g., 'RuntimeError', 'ValidationError')."""
512
+ error_message: str
513
+ """Human-readable error message."""
514
+ traceback: str
515
+ """Full Python traceback for debugging."""
516
+
517
+
506
518
  class BoundingBox(TypedDict):
507
519
  left: int
508
520
  """X coordinate of the left edge."""
@@ -701,6 +713,10 @@ class Metadata(TypedDict, total=False):
701
713
  """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
702
714
  token_reduction: NotRequired[dict[str, float]]
703
715
  """Token reduction statistics including reduction ratios and counts."""
716
+ processing_errors: NotRequired[list[ProcessingErrorDict]]
717
+ """List of processing errors that occurred during extraction."""
718
+ extraction_error: NotRequired[dict[str, Any]]
719
+ """Error information for critical extraction failures."""
704
720
 
705
721
 
706
722
  _VALID_METADATA_KEYS = {
@@ -756,6 +772,8 @@ _VALID_METADATA_KEYS = {
756
772
  "message",
757
773
  "attributes",
758
774
  "token_reduction",
775
+ "processing_errors",
776
+ "extraction_error",
759
777
  }
760
778
 
761
779
 
kreuzberg/cli.py CHANGED
@@ -168,31 +168,45 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
168
168
  input_text = sys.stdin.read()
169
169
  input_bytes = input_text.encode("utf-8")
170
170
 
171
- with Progress(
172
- SpinnerColumn(),
173
- TextColumn("[progress.description]{task.description}"),
174
- console=console,
175
- transient=True,
176
- ) as progress:
177
- progress.add_task("Extracting text...", total=None)
178
-
179
- try:
180
- import magic # type: ignore[import-not-found] # noqa: PLC0415
181
-
182
- mime_type = magic.from_buffer(input_bytes, mime=True)
183
- except ImportError: # pragma: no cover
184
- content_str = input_bytes.decode("utf-8", errors="ignore").lower()
185
- mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
171
+ # Detect MIME type from content
172
+ content_str = input_bytes.decode("utf-8", errors="ignore").lower()
173
+ if "<html" in content_str or "<!doctype html" in content_str or "<body" in content_str:
174
+ mime_type = "text/html"
175
+ elif (content_str.strip().startswith("{") and content_str.strip().endswith("}")) or (
176
+ content_str.strip().startswith("[") and content_str.strip().endswith("]")
177
+ ):
178
+ mime_type = "application/json"
179
+ elif content_str.strip().startswith("---") or ":" in content_str[:100]:
180
+ mime_type = "application/x-yaml"
181
+ else:
182
+ mime_type = "text/plain"
186
183
 
184
+ # Use progress display if possible, fallback to simple extraction on Windows issues
185
+ try:
186
+ with Progress(
187
+ SpinnerColumn(),
188
+ TextColumn("[progress.description]{task.description}"),
189
+ console=console,
190
+ transient=True,
191
+ ) as progress:
192
+ progress.add_task("Extracting text...", total=None)
193
+ return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
194
+ except (OSError, RuntimeError): # pragma: no cover
195
+ # Fallback for Windows console issues
187
196
  return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
188
197
  else:
189
- with Progress(
190
- SpinnerColumn(),
191
- TextColumn("[progress.description]{task.description}"),
192
- console=console,
193
- transient=True,
194
- ) as progress:
195
- progress.add_task(f"Extracting text from {file.name}...", total=None)
198
+ # Use progress display if possible, fallback to simple extraction on Windows issues
199
+ try:
200
+ with Progress(
201
+ SpinnerColumn(),
202
+ TextColumn("[progress.description]{task.description}"),
203
+ console=console,
204
+ transient=True,
205
+ ) as progress:
206
+ progress.add_task(f"Extracting text from {file.name}...", total=None)
207
+ return extract_file_sync(str(file), config=extraction_config)
208
+ except (OSError, RuntimeError): # pragma: no cover
209
+ # Fallback for Windows console issues
196
210
  return extract_file_sync(str(file), config=extraction_config)
197
211
 
198
212
 
kreuzberg/extraction.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import multiprocessing as mp
4
+ import traceback
4
5
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
6
  from pathlib import Path
6
7
  from typing import TYPE_CHECKING, Final, cast
@@ -10,6 +11,7 @@ import anyio
10
11
  from kreuzberg._chunker import get_chunker
11
12
  from kreuzberg._document_classification import auto_detect_document_type
12
13
  from kreuzberg._entity_extraction import extract_entities, extract_keywords
14
+ from kreuzberg._error_handling import safe_feature_execution, should_exception_bubble_up
13
15
  from kreuzberg._language_detection import detect_languages
14
16
  from kreuzberg._mime_types import (
15
17
  validate_mime_type,
@@ -21,7 +23,7 @@ from kreuzberg._utils._document_cache import get_document_cache
21
23
  from kreuzberg._utils._errors import create_error_context
22
24
  from kreuzberg._utils._string import safe_decode
23
25
  from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
24
- from kreuzberg.exceptions import ValidationError
26
+ from kreuzberg.exceptions import KreuzbergError, ValidationError
25
27
 
26
28
  if TYPE_CHECKING:
27
29
  from collections.abc import Sequence
@@ -50,69 +52,107 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
50
52
  def _validate_and_post_process_helper(
51
53
  result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
52
54
  ) -> ExtractionResult:
55
+ if result.metadata is None:
56
+ result.metadata = {}
57
+
53
58
  if config.chunk_content:
54
- result.chunks = _handle_chunk_content(
55
- mime_type=result.mime_type,
56
- config=config,
57
- content=result.content,
59
+ result.chunks = safe_feature_execution(
60
+ feature_name="chunking",
61
+ execution_func=lambda: _handle_chunk_content(
62
+ mime_type=result.mime_type,
63
+ config=config,
64
+ content=result.content,
65
+ ),
66
+ default_value=[],
67
+ result=result,
58
68
  )
59
69
 
60
70
  if config.extract_entities:
61
- try:
62
- result.entities = extract_entities(
71
+ result.entities = safe_feature_execution(
72
+ feature_name="entity_extraction",
73
+ execution_func=lambda: extract_entities(
63
74
  result.content,
64
75
  custom_patterns=config.custom_entity_patterns,
65
- )
66
- except RuntimeError:
67
- result.entities = None
76
+ ),
77
+ default_value=None,
78
+ result=result,
79
+ )
68
80
 
69
81
  if config.extract_keywords:
70
- try:
71
- result.keywords = extract_keywords(
82
+ result.keywords = safe_feature_execution(
83
+ feature_name="keyword_extraction",
84
+ execution_func=lambda: extract_keywords(
72
85
  result.content,
73
86
  keyword_count=config.keyword_count,
74
- )
75
- except RuntimeError:
76
- result.keywords = None
87
+ ),
88
+ default_value=None,
89
+ result=result,
90
+ )
77
91
 
78
92
  if config.auto_detect_language:
79
- lang_config = config.language_detection_config
80
- if lang_config is None:
81
- from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
82
93
 
83
- lang_config = LanguageDetectionConfig(model=config.language_detection_model)
94
+ def _detect_language() -> list[str]:
95
+ lang_config = config.language_detection_config
96
+ if lang_config is None:
97
+ from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
98
+
99
+ lang_config = LanguageDetectionConfig(model=config.language_detection_model)
84
100
 
85
- result.detected_languages = detect_languages(
86
- result.content,
87
- config=lang_config,
101
+ return detect_languages(result.content, config=lang_config) or []
102
+
103
+ result.detected_languages = safe_feature_execution(
104
+ feature_name="language_detection",
105
+ execution_func=_detect_language,
106
+ default_value=[],
107
+ result=result,
88
108
  )
89
109
 
90
110
  if config.auto_detect_document_type:
91
- result = auto_detect_document_type(result, config, file_path=file_path)
111
+ result = safe_feature_execution(
112
+ feature_name="document_type_detection",
113
+ execution_func=lambda: auto_detect_document_type(result, config, file_path=file_path),
114
+ default_value=result,
115
+ result=result,
116
+ )
92
117
 
93
118
  if config.token_reduction is not None and config.token_reduction.mode != "off":
94
- original_content = result.content
95
119
 
96
- language_hint = None
97
- if result.detected_languages and len(result.detected_languages) > 0:
98
- language_hint = result.detected_languages[0]
120
+ def _apply_token_reduction() -> str:
121
+ original_content = result.content
99
122
 
100
- reduced_content = reduce_tokens(
101
- original_content,
102
- config=config.token_reduction,
103
- language=language_hint,
123
+ language_hint = None
124
+ if result.detected_languages and len(result.detected_languages) > 0:
125
+ language_hint = result.detected_languages[0]
126
+
127
+ reduced_content = (
128
+ reduce_tokens(
129
+ original_content,
130
+ config=config.token_reduction,
131
+ language=language_hint,
132
+ )
133
+ if config.token_reduction
134
+ else original_content
135
+ )
136
+ reduction_stats = get_reduction_stats(original_content, reduced_content)
137
+
138
+ if result.metadata is not None:
139
+ result.metadata["token_reduction"] = {
140
+ "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
141
+ "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
142
+ "original_characters": reduction_stats["original_characters"],
143
+ "reduced_characters": reduction_stats["reduced_characters"],
144
+ "original_tokens": reduction_stats["original_tokens"],
145
+ "reduced_tokens": reduction_stats["reduced_tokens"],
146
+ }
147
+
148
+ return reduced_content
149
+
150
+ result.content = safe_feature_execution(
151
+ feature_name="token_reduction",
152
+ execution_func=_apply_token_reduction,
153
+ default_value=result.content,
154
+ result=result,
104
155
  )
105
- reduction_stats = get_reduction_stats(original_content, reduced_content)
106
-
107
- result.content = reduced_content
108
- result.metadata["token_reduction"] = {
109
- "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
110
- "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
111
- "original_characters": reduction_stats["original_characters"],
112
- "reduced_characters": reduction_stats["reduced_characters"],
113
- "original_tokens": reduction_stats["original_tokens"],
114
- "reduced_tokens": reduction_stats["reduced_tokens"],
115
- }
116
156
 
117
157
  return result
118
158
 
@@ -125,8 +165,22 @@ async def _validate_and_post_process_async(
125
165
 
126
166
  result = _validate_and_post_process_helper(result, config, file_path)
127
167
 
128
- for post_processor in config.post_processing_hooks or []:
129
- result = await run_maybe_sync(post_processor, result)
168
+ for i, post_processor in enumerate(config.post_processing_hooks or []):
169
+ try:
170
+ result = await run_maybe_sync(post_processor, result)
171
+ except (KreuzbergError, ValueError, RuntimeError, TypeError) as e: # noqa: PERF203
172
+ if result.metadata is None:
173
+ result.metadata = {}
174
+ error_list = result.metadata.setdefault("processing_errors", [])
175
+ if isinstance(error_list, list):
176
+ error_list.append(
177
+ {
178
+ "feature": f"post_processing_hook_{i}",
179
+ "error_type": type(e).__name__,
180
+ "error_message": str(e),
181
+ "traceback": traceback.format_exc(),
182
+ }
183
+ )
130
184
 
131
185
  return result
132
186
 
@@ -260,22 +314,18 @@ async def batch_extract_file(
260
314
  config,
261
315
  )
262
316
  results[index] = result
263
- except Exception as e: # noqa: BLE001
264
- error_result = ExtractionResult(
265
- content=f"Error: {type(e).__name__}: {e!s}",
266
- mime_type="text/plain",
267
- metadata={
268
- "error": f"{type(e).__name__}: {e!s}",
269
- "error_context": create_error_context(
270
- operation="batch_extract_file",
271
- file_path=str(path),
272
- error=e,
273
- index=index,
274
- ),
275
- },
276
- chunks=[],
317
+ except Exception as e:
318
+ if should_exception_bubble_up(e, "batch_processing"):
319
+ raise
320
+
321
+ basic_result = _attempt_basic_extraction(
322
+ None,
323
+ None,
324
+ e,
325
+ index,
326
+ file_path=str(path),
277
327
  )
278
- results[index] = error_result
328
+ results[index] = basic_result
279
329
 
280
330
  async with anyio.create_task_group() as tg:
281
331
  for i, path in enumerate(file_paths):
@@ -309,23 +359,12 @@ async def batch_extract_bytes(
309
359
  try:
310
360
  result = await extract_bytes(content, mime_type, config)
311
361
  results[index] = result
312
- except Exception as e: # noqa: BLE001
313
- error_result = ExtractionResult(
314
- content=f"Error: {type(e).__name__}: {e!s}",
315
- mime_type="text/plain",
316
- metadata={
317
- "error": f"{type(e).__name__}: {e!s}",
318
- "error_context": create_error_context(
319
- operation="batch_extract_bytes",
320
- error=e,
321
- index=index,
322
- mime_type=mime_type,
323
- content_size=len(content),
324
- ),
325
- },
326
- chunks=[],
327
- )
328
- results[index] = error_result
362
+ except Exception as e:
363
+ if should_exception_bubble_up(e, "batch_processing"):
364
+ raise
365
+
366
+ basic_result = _attempt_basic_extraction(content, mime_type, e, index)
367
+ results[index] = basic_result
329
368
 
330
369
  async with anyio.create_task_group() as tg:
331
370
  for i, (content, mime_type) in enumerate(contents):
@@ -334,6 +373,125 @@ async def batch_extract_bytes(
334
373
  return results
335
374
 
336
375
 
376
+ def _attempt_basic_extraction(
377
+ content: bytes | None, mime_type: str | None, original_error: Exception, index: int, *, file_path: str | None = None
378
+ ) -> ExtractionResult:
379
+ """Attempt basic extraction when full extraction fails, preserving as much as possible.
380
+
381
+ This function tries to extract at least basic text content even when advanced
382
+ features like OCR, entity extraction, etc. fail.
383
+
384
+ Args:
385
+ content: The raw content bytes (None for file extractions)
386
+ mime_type: The MIME type of the content (None if unknown)
387
+ original_error: The exception that caused the main extraction to fail
388
+ index: Index of this content in the batch
389
+ file_path: Optional file path for file-based extractions
390
+
391
+ Returns:
392
+ A basic ExtractionResult with whatever could be extracted
393
+ """
394
+ if (
395
+ isinstance(original_error, (ValueError, TypeError, ValidationError))
396
+ or "mock" in str(type(original_error)).lower()
397
+ ):
398
+ return ExtractionResult(
399
+ content=f"Error: {type(original_error).__name__}: {original_error!s}",
400
+ mime_type="text/plain",
401
+ metadata={
402
+ "error": f"{type(original_error).__name__}: {original_error!s}",
403
+ "error_context": create_error_context(
404
+ operation="batch_extract_file" if file_path else "batch_extract_bytes",
405
+ error=original_error,
406
+ index=index,
407
+ mime_type=mime_type,
408
+ content_size=len(content) if content else 0,
409
+ file_path=file_path,
410
+ ),
411
+ },
412
+ chunks=[],
413
+ entities=[],
414
+ keywords=[],
415
+ detected_languages=[],
416
+ tables=[],
417
+ images=[],
418
+ image_ocr_results=[],
419
+ )
420
+
421
+ try:
422
+ if content is None:
423
+ return ExtractionResult(
424
+ content=f"Error: {type(original_error).__name__}: {original_error!s}",
425
+ mime_type="text/plain",
426
+ metadata={
427
+ "error": f"{type(original_error).__name__}: {original_error!s}",
428
+ "error_context": create_error_context(
429
+ operation="batch_extract_file",
430
+ error=original_error,
431
+ index=index,
432
+ file_path=file_path,
433
+ ),
434
+ },
435
+ chunks=[],
436
+ entities=[],
437
+ keywords=[],
438
+ detected_languages=[],
439
+ tables=[],
440
+ images=[],
441
+ image_ocr_results=[],
442
+ )
443
+
444
+ mime_type = validate_mime_type(mime_type=mime_type)
445
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=ExtractionConfig()):
446
+ basic_result = extractor.extract_bytes_sync(content)
447
+
448
+ if basic_result.metadata is None:
449
+ basic_result.metadata = {}
450
+
451
+ basic_result.metadata["extraction_error"] = {
452
+ "error_type": type(original_error).__name__,
453
+ "error_message": str(original_error),
454
+ "traceback": traceback.format_exc(),
455
+ "context": create_error_context(
456
+ operation="batch_extract_file" if file_path else "batch_extract_bytes",
457
+ error=original_error,
458
+ index=index,
459
+ mime_type=mime_type,
460
+ content_size=len(content),
461
+ file_path=file_path,
462
+ ),
463
+ "recovery_mode": "basic_extraction",
464
+ }
465
+
466
+ return basic_result
467
+
468
+ except (KreuzbergError, ValueError, RuntimeError, TypeError):
469
+ pass
470
+
471
+ return ExtractionResult(
472
+ content=f"Error: {type(original_error).__name__}: {original_error!s}",
473
+ mime_type="text/plain",
474
+ metadata={
475
+ "error": f"{type(original_error).__name__}: {original_error!s}",
476
+ "error_context": create_error_context(
477
+ operation="batch_extract_file" if file_path else "batch_extract_bytes",
478
+ error=original_error,
479
+ index=index,
480
+ mime_type=mime_type,
481
+ content_size=len(content) if content else 0,
482
+ file_path=file_path,
483
+ ),
484
+ },
485
+ chunks=[],
486
+ entities=[],
487
+ keywords=[],
488
+ detected_languages=[],
489
+ tables=[],
490
+ images=[],
491
+ image_ocr_results=[],
492
+ )
493
+
494
+
337
495
  def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
338
496
  """Synchronous version of extract_bytes.
339
497
 
@@ -444,21 +602,18 @@ def batch_extract_file_sync(
444
602
  index,
445
603
  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
446
604
  )
447
- except Exception as e: # noqa: BLE001
448
- error_result = ExtractionResult(
449
- content=f"Error: {type(e).__name__}: {e!s}",
450
- mime_type="text/plain",
451
- metadata={
452
- "error": f"{type(e).__name__}: {e!s}",
453
- "error_context": create_error_context(
454
- operation="batch_extract_file_sync",
455
- file_path=str(file_path),
456
- error=e,
457
- ),
458
- },
459
- chunks=[],
605
+ except Exception as e:
606
+ if should_exception_bubble_up(e, "batch_processing"):
607
+ raise
608
+
609
+ basic_result = _attempt_basic_extraction(
610
+ None,
611
+ None,
612
+ e,
613
+ index,
614
+ file_path=str(file_path),
460
615
  )
461
- return (index, error_result)
616
+ return (index, basic_result)
462
617
 
463
618
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
464
619
  future_to_index = {executor.submit(extract_single, i, fp): i for i, fp in enumerate(file_paths)}
@@ -494,23 +649,12 @@ def batch_extract_bytes_sync(
494
649
  """Extract single content with index for ordering."""
495
650
  try:
496
651
  return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
497
- except Exception as e: # noqa: BLE001
498
- error_result = ExtractionResult(
499
- content=f"Error: {type(e).__name__}: {e!s}",
500
- mime_type="text/plain",
501
- metadata={
502
- "error": f"{type(e).__name__}: {e!s}",
503
- "error_context": create_error_context(
504
- operation="batch_extract_bytes_sync",
505
- error=e,
506
- index=index,
507
- mime_type=mime_type,
508
- content_size=len(content),
509
- ),
510
- },
511
- chunks=[],
512
- )
513
- return (index, error_result)
652
+ except Exception as e:
653
+ if should_exception_bubble_up(e, "batch_processing"):
654
+ raise
655
+
656
+ basic_result = _attempt_basic_extraction(content, mime_type, e, index)
657
+ return (index, basic_result)
514
658
 
515
659
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
516
660
  future_to_index = {
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.18.0
3
+ Version: 3.19.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -42,6 +42,7 @@ Requires-Dist: psutil>=7.1.0
42
42
  Requires-Dist: pypdfium2==4.30.0
43
43
  Requires-Dist: python-calamine>=0.5.3
44
44
  Requires-Dist: python-pptx>=1.0.2
45
+ Requires-Dist: transformers>=4.30.0
45
46
  Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
46
47
  Provides-Extra: additional-extensions
47
48
  Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
@@ -63,6 +64,7 @@ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
63
64
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
64
65
  Requires-Dist: spacy>=3.8.7; extra == 'all'
65
66
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
67
+ Requires-Dist: transformers>=4.25.0; extra == 'all'
66
68
  Provides-Extra: api
67
69
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
68
70
  Provides-Extra: chunking
@@ -82,6 +84,7 @@ Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
82
84
  Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
83
85
  Provides-Extra: gmft
84
86
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
87
+ Requires-Dist: transformers>=4.25.0; extra == 'gmft'
85
88
  Provides-Extra: langdetect
86
89
  Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
87
90
  Provides-Extra: paddleocr
@@ -4,30 +4,31 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
4
  kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
5
5
  kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
6
  kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
- kreuzberg/_entity_extraction.py,sha256=zbwgvS_2M4JibmVVnclkmie0nmZQtyHtT_ucdbQc6nU,7837
7
+ kreuzberg/_entity_extraction.py,sha256=Ks-1gZIYDqgg2uJerd0FH_lYhjIwS0f0bMVhR9M59jA,7518
8
+ kreuzberg/_error_handling.py,sha256=Isr9yrY4JRKOmUVaUOky_LZ7tGVZAm8jxRD3qGbkc1g,5604
8
9
  kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
9
- kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
10
+ kreuzberg/_language_detection.py,sha256=4JzQldcDIVZRWUzRFc9AOFiq6Wfl9858mip1ZnrD2Ks,1143
10
11
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
11
12
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
13
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
- kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
14
- kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
14
+ kreuzberg/_types.py,sha256=6oBsmUUihVr4hJJrYeuWoUVzCP_-eciCrBVvGQHQTDI,49920
15
+ kreuzberg/cli.py,sha256=P_dqOHbGh-fFYZ4WErjngTKq7wbqaUmTD1Gjw2lIsDI,15242
15
16
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
16
- kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
17
+ kreuzberg/extraction.py,sha256=jMsomvg7SPnuXLGZKQl0YH64D0AhczSNDM4CKORd9d0,24185
17
18
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
19
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
20
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
20
- kreuzberg/_api/main.py,sha256=5LiqgyeHJy0GLLa-ehB0bq8ftEUYfM1Pt6f0j_a0dso,15190
21
+ kreuzberg/_api/main.py,sha256=tmg1fICU4wshq0XXhGOk22oivfXjELtsEgOumdkZNI4,15257
21
22
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
- kreuzberg/_extractors/_base.py,sha256=4MRBXdLsgdtdrTuupWb2IT9YpRSnNPpWWviS2mfeOXg,9961
23
+ kreuzberg/_extractors/_base.py,sha256=99r-CUZcAp72c0mqkj-E41lj0SyzNaTb_w2EtKgfGJ8,9934
23
24
  kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
24
- kreuzberg/_extractors/_html.py,sha256=TXXgwQZuEvnrny5HdBpn8oikGktyxgY9jvgZmnFtnqY,6371
25
+ kreuzberg/_extractors/_html.py,sha256=vNAgBrfok-16SOkhhsy10unqVwAczlTL_2KEn2X6S98,6315
25
26
  kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
26
27
  kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
27
- kreuzberg/_extractors/_pdf.py,sha256=GFy7xHUH09i48E5Xixy6nReF_uBu9646UTjywKoH-Rs,23304
28
+ kreuzberg/_extractors/_pdf.py,sha256=_MPtO_8BCpyAXyIWusmfqOaEsPMDxucjTQKz3cTaj8o,22663
28
29
  kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
29
30
  kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
30
- kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
31
+ kreuzberg/_extractors/_structured.py,sha256=thpXhsBnvaHzGQX4sy6eVHowFv0yaYxLGHwxx4DouCI,8947
31
32
  kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
32
33
  kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
33
34
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
@@ -35,7 +36,7 @@ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
35
36
  kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
36
37
  kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
37
38
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
38
- kreuzberg/_ocr/_tesseract.py,sha256=1SEfrX_JvU6KIeWt31GsRWnNmjaAh3xgQaRMPvoZLJA,51349
39
+ kreuzberg/_ocr/_tesseract.py,sha256=Uu6H1LMh1WSC1OmKhPx-miG98r9KEfc0GF7b8isS33E,52420
39
40
  kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
40
41
  kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
41
42
  kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
@@ -121,8 +122,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
121
122
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
122
123
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
123
124
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
124
- kreuzberg-3.18.0.dist-info/METADATA,sha256=Z54em4GwMd18BmlIWmq1AHtCdFStstMV5RAXaB4x3_0,12351
125
- kreuzberg-3.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
- kreuzberg-3.18.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
- kreuzberg-3.18.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
- kreuzberg-3.18.0.dist-info/RECORD,,
125
+ kreuzberg-3.19.0.dist-info/METADATA,sha256=fV1j2iWA2-rcZodFFV3kmSsuBJhoDsW6OuyIu9Myf4A,12492
126
+ kreuzberg-3.19.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
127
+ kreuzberg-3.19.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
128
+ kreuzberg-3.19.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
129
+ kreuzberg-3.19.0.dist-info/RECORD,,