kreuzberg 3.17.3__py3-none-any.whl → 3.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,35 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ return size if size >= 0 else default_size
117
+ except ValueError:
118
+ return default_size
119
+
120
+
121
+ def _is_opentelemetry_enabled() -> bool:
122
+ """Check if OpenTelemetry should be enabled.
123
+
124
+ Returns:
125
+ True if OpenTelemetry should be enabled, False otherwise.
126
+
127
+ Environment Variables:
128
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
129
+ """
130
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
131
+
132
+
103
133
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
134
  error_type = type(exception).__name__
105
135
  error_message = str(exception)
@@ -242,7 +272,7 @@ async def handle_files_upload( # noqa: PLR0913
242
272
  - Language detection (if enabled)
243
273
 
244
274
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
275
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
276
 
247
277
  Args:
248
278
  request: The HTTP request object
@@ -280,6 +310,9 @@ async def handle_files_upload( # noqa: PLR0913
280
310
  """
281
311
  static_config = discover_config_cached()
282
312
 
313
+ if not data:
314
+ raise ValidationError("No files provided for extraction", context={"file_count": 0})
315
+
283
316
  min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
284
317
  max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
285
318
 
@@ -379,9 +412,18 @@ type_encoders = {
379
412
  Image.Image: _pil_image_encoder,
380
413
  }
381
414
 
415
+
416
+ def _get_plugins() -> list[Any]:
417
+ """Get configured plugins based on environment variables."""
418
+ plugins = []
419
+ if _is_opentelemetry_enabled():
420
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
421
+ return plugins
422
+
423
+
382
424
  app = Litestar(
383
425
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
426
+ plugins=_get_plugins(),
385
427
  logging_config=StructLoggingConfig(),
386
428
  openapi_config=openapi_config,
387
429
  exception_handlers={
@@ -389,5 +431,5 @@ app = Litestar(
389
431
  Exception: general_exception_handler,
390
432
  },
391
433
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
434
+ request_max_body_size=_get_max_upload_size(),
393
435
  )
@@ -2,19 +2,77 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import re
5
+ import shutil
5
6
  import subprocess
6
- import sys
7
7
  from functools import lru_cache
8
8
  from itertools import chain
9
9
  from typing import TYPE_CHECKING, Any
10
10
 
11
+ import anyio
12
+
11
13
  from kreuzberg._types import Entity, SpacyEntityExtractionConfig
14
+ from kreuzberg._utils._sync import run_sync
12
15
  from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
13
16
 
14
17
  if TYPE_CHECKING:
15
18
  from collections.abc import Sequence
16
19
 
17
20
 
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
18
76
  def extract_entities(
19
77
  text: str,
20
78
  entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
@@ -46,11 +104,11 @@ def extract_entities(
46
104
  functionality="Entity Extraction",
47
105
  ) from e
48
106
 
49
- model_name = _select_spacy_model(languages, spacy_config)
107
+ model_name = select_spacy_model(languages, spacy_config)
50
108
  if not model_name:
51
109
  return entities
52
110
 
53
- nlp = _load_spacy_model(model_name, spacy_config)
111
+ nlp = load_spacy_model(model_name, spacy_config)
54
112
 
55
113
  if len(text) > spacy_config.max_doc_length:
56
114
  text = text[: spacy_config.max_doc_length]
@@ -74,7 +132,7 @@ def extract_entities(
74
132
 
75
133
 
76
134
  @lru_cache(maxsize=32)
77
- def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
78
136
  try:
79
137
  import spacy # noqa: PLC0415
80
138
  except ImportError:
@@ -86,22 +144,54 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
86
144
  try:
87
145
  nlp = spacy.load(model_name)
88
146
  except OSError:
89
- result = subprocess.run(
90
- [sys.executable, "-m", "spacy", "download", model_name],
91
- capture_output=True,
92
- text=True,
93
- check=False,
94
- )
95
147
 
96
- if result.returncode != 0:
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ try:
151
+ success = await install_spacy_model_with_spacy(model_name)
152
+ if success:
153
+ return True, None
154
+ except (ImportError, OSError, RuntimeError) as e:
155
+ spacy_error = str(e)
156
+ else:
157
+ spacy_error = "spaCy download failed"
158
+
159
+ if is_uv_available():
160
+ try:
161
+ result = await install_spacy_model_with_uv(model_name)
162
+ return result.returncode == 0, result.stderr
163
+ except (OSError, subprocess.SubprocessError) as e:
164
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
165
+
166
+ return False, spacy_error
167
+
168
+ try:
169
+ success, error_details = anyio.run(install_model)
170
+ except SystemExit as e:
171
+ success, error_details = False, f"spaCy CLI exit code: {e.code}"
172
+
173
+ if not success:
174
+ if is_uv_available():
175
+ model_url = get_spacy_model_url(model_name)
176
+ manual_install_cmd = f"uv pip install {model_url}"
177
+ else:
178
+ manual_install_cmd = f"python -m spacy download {model_name}"
179
+
97
180
  error_msg = (
98
- f"Failed to download spaCy model '{model_name}'. "
99
- f"Please install it manually with: python -m spacy download {model_name}"
181
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
100
182
  )
101
- if result.stderr:
102
- error_msg += f"\nError details: {result.stderr}"
183
+
184
+ if error_details:
185
+ error_msg += f"\nError details: {error_details}"
186
+
103
187
  raise KreuzbergError(
104
- error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
188
+ error_msg,
189
+ context={
190
+ "model": model_name,
191
+ "manual_install_cmd": manual_install_cmd,
192
+ "error_details": error_details,
193
+ "uv_available": is_uv_available(),
194
+ },
105
195
  ) from None
106
196
 
107
197
  try:
@@ -118,7 +208,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
118
208
  return nlp
119
209
 
120
210
 
121
- def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
211
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
122
212
  if not languages:
123
213
  return spacy_config.get_model_for_language("en")
124
214
 
@@ -140,7 +230,7 @@ def extract_keywords(
140
230
  kw_model = KeyBERT()
141
231
  keywords = kw_model.extract_keywords(text, top_n=keyword_count)
142
232
  return [(kw, float(score)) for kw, score in keywords]
143
- except (RuntimeError, OSError, ValueError):
233
+ except ValueError:
144
234
  return []
145
235
  except ImportError as e: # pragma: no cover
146
236
  raise MissingDependencyError.create_for_package(
@@ -0,0 +1,182 @@
1
+ """Type-safe error handling utilities for extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import traceback
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ if TYPE_CHECKING:
9
+ from collections.abc import Callable
10
+
11
+ from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
13
+
14
+
15
+ def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
16
+ """Determine if an exception should bubble up or be handled gracefully.
17
+
18
+ Args:
19
+ exception: The exception to classify
20
+ context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
21
+
22
+ Returns:
23
+ True if the exception should bubble up, False if it should be handled gracefully
24
+ """
25
+ if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
26
+ return True
27
+
28
+ if isinstance(exception, MissingDependencyError):
29
+ return True
30
+
31
+ if isinstance(exception, ValidationError):
32
+ if context == "batch_processing":
33
+ return False
34
+
35
+ return context != "optional_feature"
36
+
37
+ if isinstance(exception, KreuzbergError) and context == "optional_feature":
38
+ return False
39
+
40
+ if context == "batch_processing":
41
+ return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
42
+
43
+ return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
44
+
45
+
46
+ class FeatureProcessingError:
47
+ """Type-safe processing error for extraction features."""
48
+
49
+ def __init__(self, feature: str, error: Exception) -> None:
50
+ self._feature = feature
51
+ self._error = error
52
+ self._traceback = traceback.format_exc()
53
+
54
+ @property
55
+ def feature(self) -> str:
56
+ return self._feature
57
+
58
+ @property
59
+ def error_type(self) -> str:
60
+ return type(self._error).__name__
61
+
62
+ @property
63
+ def error_message(self) -> str:
64
+ return str(self._error)
65
+
66
+ @property
67
+ def traceback(self) -> str:
68
+ return self._traceback
69
+
70
+ def to_dict(self) -> ProcessingErrorDict:
71
+ return {
72
+ "feature": self.feature,
73
+ "error_type": self.error_type,
74
+ "error_message": self.error_message,
75
+ "traceback": self.traceback,
76
+ }
77
+
78
+
79
+ def safe_feature_execution(
80
+ feature_name: str,
81
+ execution_func: Callable[[], Any],
82
+ default_value: Any,
83
+ result: ExtractionResult,
84
+ context: ErrorContextType = "optional_feature",
85
+ ) -> Any:
86
+ """Safely execute a feature extraction function with proper error handling.
87
+
88
+ Args:
89
+ feature_name: Name of the feature being executed
90
+ execution_func: Function to execute that may raise exceptions
91
+ default_value: Default value to return if execution fails
92
+ result: ExtractionResult to update with error information
93
+ context: The context for exception handling decisions
94
+
95
+ Returns:
96
+ Either the successful result or the default value
97
+ """
98
+ try:
99
+ return execution_func()
100
+ except Exception as e:
101
+ if should_exception_bubble_up(e, context):
102
+ raise
103
+
104
+ _add_processing_error(result, FeatureProcessingError(feature_name, e))
105
+ return default_value
106
+
107
+
108
+ def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
109
+ """Add a processing error to the result metadata in a type-safe way."""
110
+ if result.metadata is None:
111
+ result.metadata = {}
112
+
113
+ if "processing_errors" not in result.metadata:
114
+ result.metadata["processing_errors"] = []
115
+
116
+ errors_list = result.metadata["processing_errors"]
117
+ if isinstance(errors_list, list):
118
+ errors_list.append(error.to_dict())
119
+ else:
120
+ result.metadata["processing_errors"] = [error.to_dict()]
121
+
122
+
123
+ def preserve_result_with_errors(
124
+ result: ExtractionResult,
125
+ errors: list[FeatureProcessingError],
126
+ ) -> ExtractionResult:
127
+ """Preserve a successful extraction result while adding error information.
128
+
129
+ This is used when core extraction succeeds but optional features fail.
130
+
131
+ Args:
132
+ result: The successful extraction result
133
+ errors: List of errors that occurred during optional processing
134
+
135
+ Returns:
136
+ The result with error information added to metadata
137
+ """
138
+ for error in errors:
139
+ _add_processing_error(result, error)
140
+
141
+ return result
142
+
143
+
144
+ def create_error_result(
145
+ content: str,
146
+ mime_type: str,
147
+ errors: list[FeatureProcessingError],
148
+ **metadata_kwargs: Any,
149
+ ) -> ExtractionResult:
150
+ """Create an error result with proper type safety.
151
+
152
+ Args:
153
+ content: Error content to include
154
+ mime_type: MIME type of the result
155
+ errors: List of errors that occurred
156
+ **metadata_kwargs: Additional metadata to include
157
+
158
+ Returns:
159
+ An ExtractionResult with error information
160
+ """
161
+ metadata: Metadata = {
162
+ "error": f"Multiple processing errors occurred: {len(errors)} errors",
163
+ "error_context": {
164
+ "error_count": len(errors),
165
+ "errors": [error.to_dict() for error in errors],
166
+ **metadata_kwargs,
167
+ },
168
+ "processing_errors": [error.to_dict() for error in errors],
169
+ }
170
+
171
+ return ExtractionResult(
172
+ content=content,
173
+ chunks=[],
174
+ mime_type=mime_type,
175
+ metadata=metadata,
176
+ entities=[],
177
+ keywords=[],
178
+ detected_languages=[],
179
+ tables=[],
180
+ images=[],
181
+ image_ocr_results=[],
182
+ )
@@ -230,13 +230,13 @@ class Extractor(ABC):
230
230
  confidence_score=None,
231
231
  processing_time=duration,
232
232
  )
233
- except (OSError, ValueError) as e: # pragma: no cover
233
+ except ValueError as e: # pragma: no cover
234
234
  return ImageOCRResult(
235
235
  image=target,
236
236
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
237
237
  skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
238
238
  )
239
- except (RuntimeError, TypeError) as e: # pragma: no cover
239
+ except TypeError as e: # pragma: no cover
240
240
  return ImageOCRResult(
241
241
  image=target,
242
242
  ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
@@ -75,7 +75,7 @@ class HTMLExtractor(Extractor):
75
75
  soup = BeautifulSoup(html_content, "xml")
76
76
 
77
77
  for img in soup.find_all("img"):
78
- src_val = img.get("src") # type: ignore[union-attr]
78
+ src_val = img.get("src")
79
79
  if isinstance(src_val, str) and src_val.startswith("data:image/"):
80
80
  try:
81
81
  header, data = src_val.split(",", 1)
@@ -105,7 +105,7 @@ class HTMLExtractor(Extractor):
105
105
  except (OSError, ValueError) as e: # pragma: no cover
106
106
  logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
107
107
 
108
- alt_val = img.get("alt") # type: ignore[union-attr]
108
+ alt_val = img.get("alt")
109
109
  desc = alt_val if isinstance(alt_val, str) else None
110
110
  images.append(
111
111
  ExtractedImage(
@@ -6,7 +6,6 @@ import logging
6
6
  import os
7
7
  import tempfile
8
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
9
- from dataclasses import asdict
10
9
  from itertools import count
11
10
  from multiprocessing import cpu_count
12
11
  from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
27
26
  from kreuzberg._ocr import get_ocr_backend
28
27
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
29
28
  from kreuzberg._types import (
30
- EasyOCRConfig,
31
29
  ExtractedImage,
32
30
  ExtractionResult,
33
31
  ImageOCRResult,
34
32
  Metadata,
35
33
  OcrBackendType,
36
- PaddleOCRConfig,
37
- TesseractConfig,
38
34
  )
39
35
  from kreuzberg._utils._errors import create_error_context, should_retry
40
36
  from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
134
130
  def extract_path_sync(self, path: Path) -> ExtractionResult:
135
131
  content_bytes = path.read_bytes()
136
132
 
133
+ result: ExtractionResult | None = None
134
+
137
135
  document: Document | None = None
138
136
  if self.config.extract_images or self.config.extract_tables:
139
137
  document = self._parse_with_password_attempts(content_bytes)
140
138
 
141
- try:
142
- text = self._extract_pdf_searchable_text_sync(path)
143
- except ParsingError:
144
- text = ""
139
+ if not self.config.force_ocr:
140
+ try:
141
+ content = self._extract_pdf_searchable_text_sync(path)
142
+ if self._validate_extracted_text(content):
143
+ result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
144
+ except ParsingError:
145
+ pass
145
146
 
146
- if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
147
- text = self._extract_pdf_with_ocr_sync(path)
147
+ if not result and self.config.ocr_backend is not None:
148
+ result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
149
+
150
+ if not result:
151
+ result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
152
+
153
+ metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
154
+ result.metadata = metadata
148
155
 
149
- tables = []
150
156
  if self.config.extract_tables:
151
157
  # GMFT is optional dependency ~keep
152
158
  try:
153
159
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
154
160
 
155
161
  tables = extract_tables_sync(path)
162
+ result.tables = tables
156
163
  except ImportError: # pragma: no cover
157
- tables = []
158
-
159
- if not self.config.force_ocr and self._validate_extracted_text(text):
160
- text = self._extract_with_playa_sync(path, fallback_text=text)
161
-
162
- text = normalize_spaces(text)
163
-
164
- result = ExtractionResult(
165
- content=text,
166
- mime_type=PLAIN_TEXT_MIME_TYPE,
167
- metadata={},
168
- tables=list(tables),
169
- )
164
+ result.tables = []
170
165
 
171
- if tables:
172
- table_summary = generate_table_summary(tables)
173
- result.metadata = result.metadata | {
174
- "table_count": table_summary["table_count"],
175
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
176
- f"across {table_summary['pages_with_tables']} pages with "
177
- f"{table_summary['total_rows']} total rows",
178
- }
166
+ if result.tables:
167
+ table_summary = generate_table_summary(result.tables)
168
+ result.metadata = result.metadata | {
169
+ "table_count": table_summary["table_count"],
170
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
171
+ f"across {table_summary['pages_with_tables']} pages with "
172
+ f"{table_summary['total_rows']} total rows",
173
+ }
179
174
 
180
175
  if self.config.extract_images and document:
181
176
  images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
405
400
  except Exception as e:
406
401
  raise ParsingError(f"Failed to extract PDF text: {e}") from e
407
402
 
408
- def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
403
+ def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
409
404
  temp_files: list[Path] = []
410
405
  try:
411
406
  with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
443
438
  with pdf_resources_sync(bitmap, page):
444
439
  pil_image.close()
445
440
 
446
- return self._process_pdf_images_with_ocr([str(p) for p in temp_files])
441
+ content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
442
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
447
443
 
448
444
  except Exception as e:
449
445
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
452
448
  with contextlib.suppress(OSError):
453
449
  p.unlink()
454
450
 
455
- def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
456
- backend = get_ocr_backend(self.config.ocr_backend)
451
+ def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
452
+ backend = get_ocr_backend(ocr_backend)
457
453
  paths = [Path(p) for p in image_paths]
458
454
 
459
- match self.config.ocr_backend:
460
- case "tesseract":
461
- config = (
462
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
463
- )
464
- results = backend.process_batch_sync(paths, **asdict(config))
465
- case "paddleocr":
466
- paddle_config = (
467
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
468
- )
469
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
470
- case "easyocr":
471
- easy_config = (
472
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
473
- )
474
- results = backend.process_batch_sync(paths, **asdict(easy_config))
475
- case _:
476
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
455
+ results = backend.process_batch_sync(paths, **self.config.get_config_dict())
477
456
 
478
457
  return "\n\n".join(result.content for result in results)
479
458
 
@@ -14,7 +14,7 @@ else: # pragma: no cover
14
14
  try:
15
15
  import yaml
16
16
  except ImportError: # pragma: no cover
17
- yaml = None
17
+ yaml = None # type: ignore[assignment]
18
18
 
19
19
 
20
20
  from anyio import Path as AsyncPath
@@ -31,5 +31,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
31
31
  langs = [result["lang"].lower() for result in results if result.get("lang")]
32
32
  return langs if langs else None
33
33
  return None
34
+ except (RuntimeError, OSError, MemoryError):
35
+ raise
34
36
  except Exception: # noqa: BLE001
35
37
  return None