kreuzberg 3.17.3__py3-none-any.whl → 3.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +45 -3
- kreuzberg/_entity_extraction.py +108 -18
- kreuzberg/_error_handling.py +182 -0
- kreuzberg/_extractors/_base.py +2 -2
- kreuzberg/_extractors/_html.py +2 -2
- kreuzberg/_extractors/_pdf.py +33 -54
- kreuzberg/_extractors/_structured.py +1 -1
- kreuzberg/_language_detection.py +2 -0
- kreuzberg/_ocr/_tesseract.py +28 -6
- kreuzberg/_types.py +18 -0
- kreuzberg/cli.py +36 -22
- kreuzberg/extraction.py +251 -107
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/METADATA +7 -4
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/RECORD +17 -16
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.19.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import base64
 import io
+import os
 import traceback
 from json import dumps
 from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,35 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
     )
 
 
+def _get_max_upload_size() -> int:
+    """Get the maximum upload size from environment variable.
+
+    Returns:
+        Maximum upload size in bytes. Defaults to 1GB if not set.
+
+    Environment Variables:
+        KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
+    """
+    default_size = 1024 * 1024 * 1024
+    try:
+        size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
+        return size if size >= 0 else default_size
+    except ValueError:
+        return default_size
+
+
+def _is_opentelemetry_enabled() -> bool:
+    """Check if OpenTelemetry should be enabled.
+
+    Returns:
+        True if OpenTelemetry should be enabled, False otherwise.
+
+    Environment Variables:
+        KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
+    """
+    return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
+
+
 def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
     error_type = type(exception).__name__
     error_message = str(exception)
@@ -242,7 +272,7 @@ async def handle_files_upload(  # noqa: PLR0913
     - Language detection (if enabled)
 
     Supports various file formats including PDF, Office documents, images, and more.
-    Maximum file size: 1GB per file.
+    Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
 
     Args:
         request: The HTTP request object
@@ -280,6 +310,9 @@ async def handle_files_upload(  # noqa: PLR0913
     """
     static_config = discover_config_cached()
 
+    if not data:
+        raise ValidationError("No files provided for extraction", context={"file_count": 0})
+
     min_dims = _create_dimension_tuple(image_ocr_min_width, image_ocr_min_height)
     max_dims = _create_dimension_tuple(image_ocr_max_width, image_ocr_max_height)
 
@@ -379,9 +412,18 @@ type_encoders = {
     Image.Image: _pil_image_encoder,
 }
 
+
+def _get_plugins() -> list[Any]:
+    """Get configured plugins based on environment variables."""
+    plugins = []
+    if _is_opentelemetry_enabled():
+        plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
+    return plugins
+
+
 app = Litestar(
     route_handlers=[handle_files_upload, health_check, get_configuration],
-    plugins=
+    plugins=_get_plugins(),
     logging_config=StructLoggingConfig(),
     openapi_config=openapi_config,
     exception_handlers={
@@ -389,5 +431,5 @@ app = Litestar(
         Exception: general_exception_handler,
     },
     type_encoders=type_encoders,
-    request_max_body_size=
+    request_max_body_size=_get_max_upload_size(),
 )
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,19 +2,77 @@ from __future__ import annotations
 
 import os
 import re
+import shutil
 import subprocess
-import sys
 from functools import lru_cache
 from itertools import chain
 from typing import TYPE_CHECKING, Any
 
+import anyio
+
 from kreuzberg._types import Entity, SpacyEntityExtractionConfig
+from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
 
 
+def is_uv_available() -> bool:
+    """Check if uv is available in the environment."""
+    return shutil.which("uv") is not None
+
+
+def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
+    """Get the direct download URL for a spaCy model.
+
+    Args:
+        model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
+        version: Model version to download (default: 3.8.0)
+
+    Returns:
+        Direct download URL for the model
+    """
+    return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
+
+
+async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
+    """Install spaCy model using uv.
+
+    Args:
+        model_name: Name of the spaCy model to install
+
+    Returns:
+        Completed process result
+    """
+    model_url = get_spacy_model_url(model_name)
+    return await run_sync(
+        subprocess.run,
+        ["uv", "pip", "install", model_url],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+
+
+async def install_spacy_model_with_spacy(model_name: str) -> bool:
+    """Install spaCy model using spacy download function.
+
+    Args:
+        model_name: Name of the spaCy model to install
+
+    Returns:
+        True if successful, False otherwise
+    """
+    try:
+        import spacy.cli.download  # noqa: PLC0415
+
+        await run_sync(spacy.cli.download, model_name)  # type: ignore[attr-defined]
+        return True
+    except (ImportError, OSError, RuntimeError):
+        return False
+
+
 def extract_entities(
     text: str,
     entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
@@ -46,11 +104,11 @@ def extract_entities(
             functionality="Entity Extraction",
         ) from e
 
-    model_name =
+    model_name = select_spacy_model(languages, spacy_config)
     if not model_name:
         return entities
 
-    nlp =
+    nlp = load_spacy_model(model_name, spacy_config)
 
     if len(text) > spacy_config.max_doc_length:
         text = text[: spacy_config.max_doc_length]
@@ -74,7 +132,7 @@ def extract_entities(
 
 
 @lru_cache(maxsize=32)
-def
+def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
     try:
         import spacy  # noqa: PLC0415
     except ImportError:
@@ -86,22 +144,54 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
     try:
         nlp = spacy.load(model_name)
     except OSError:
-        result = subprocess.run(
-            [sys.executable, "-m", "spacy", "download", model_name],
-            capture_output=True,
-            text=True,
-            check=False,
-        )
 
-
+        async def install_model() -> tuple[bool, str | None]:
+            """Install model and return success status and error message."""
+            try:
+                success = await install_spacy_model_with_spacy(model_name)
+                if success:
+                    return True, None
+            except (ImportError, OSError, RuntimeError) as e:
+                spacy_error = str(e)
+            else:
+                spacy_error = "spaCy download failed"
+
+            if is_uv_available():
+                try:
+                    result = await install_spacy_model_with_uv(model_name)
+                    return result.returncode == 0, result.stderr
+                except (OSError, subprocess.SubprocessError) as e:
+                    return False, f"spaCy: {spacy_error}, uv: {e!s}"
+
+            return False, spacy_error
+
+        try:
+            success, error_details = anyio.run(install_model)
+        except SystemExit as e:
+            success, error_details = False, f"spaCy CLI exit code: {e.code}"
+
+        if not success:
+            if is_uv_available():
+                model_url = get_spacy_model_url(model_name)
+                manual_install_cmd = f"uv pip install {model_url}"
+            else:
+                manual_install_cmd = f"python -m spacy download {model_name}"
+
             error_msg = (
-                f"Failed to download spaCy model '{model_name}'. "
-                f"Please install it manually with: python -m spacy download {model_name}"
+                f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
            )
-
-
+
+            if error_details:
+                error_msg += f"\nError details: {error_details}"
+
             raise KreuzbergError(
-                error_msg,
+                error_msg,
+                context={
+                    "model": model_name,
+                    "manual_install_cmd": manual_install_cmd,
+                    "error_details": error_details,
+                    "uv_available": is_uv_available(),
+                },
             ) from None
 
     try:
@@ -118,7 +208,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
     return nlp
 
 
-def
+def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
     if not languages:
         return spacy_config.get_model_for_language("en")
 
@@ -140,7 +230,7 @@ def extract_keywords(
         kw_model = KeyBERT()
         keywords = kw_model.extract_keywords(text, top_n=keyword_count)
         return [(kw, float(score)) for kw, score in keywords]
-    except
+    except ValueError:
        return []
    except ImportError as e:  # pragma: no cover
        raise MissingDependencyError.create_for_package(
kreuzberg/_error_handling.py
ADDED
@@ -0,0 +1,182 @@
+"""Type-safe error handling utilities for extraction pipeline."""
+
+from __future__ import annotations
+
+import traceback
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+from kreuzberg._types import ErrorContextType, ExtractionResult, Metadata, ProcessingErrorDict
+from kreuzberg.exceptions import KreuzbergError, MissingDependencyError, ValidationError
+
+
+def should_exception_bubble_up(exception: Exception, context: ErrorContextType = "unknown") -> bool:
+    """Determine if an exception should bubble up or be handled gracefully.
+
+    Args:
+        exception: The exception to classify
+        context: The context where the exception occurred (e.g., "batch_processing", "single_extraction", "optional_feature")
+
+    Returns:
+        True if the exception should bubble up, False if it should be handled gracefully
+    """
+    if isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError)):
+        return True
+
+    if isinstance(exception, MissingDependencyError):
+        return True
+
+    if isinstance(exception, ValidationError):
+        if context == "batch_processing":
+            return False
+
+        return context != "optional_feature"
+
+    if isinstance(exception, KreuzbergError) and context == "optional_feature":
+        return False
+
+    if context == "batch_processing":
+        return isinstance(exception, (SystemExit, KeyboardInterrupt, MemoryError, OSError, RuntimeError))
+
+    return not (context == "optional_feature" and isinstance(exception, (IOError, ImportError)))
+
+
+class FeatureProcessingError:
+    """Type-safe processing error for extraction features."""
+
+    def __init__(self, feature: str, error: Exception) -> None:
+        self._feature = feature
+        self._error = error
+        self._traceback = traceback.format_exc()
+
+    @property
+    def feature(self) -> str:
+        return self._feature
+
+    @property
+    def error_type(self) -> str:
+        return type(self._error).__name__
+
+    @property
+    def error_message(self) -> str:
+        return str(self._error)
+
+    @property
+    def traceback(self) -> str:
+        return self._traceback
+
+    def to_dict(self) -> ProcessingErrorDict:
+        return {
+            "feature": self.feature,
+            "error_type": self.error_type,
+            "error_message": self.error_message,
+            "traceback": self.traceback,
+        }
+
+
+def safe_feature_execution(
+    feature_name: str,
+    execution_func: Callable[[], Any],
+    default_value: Any,
+    result: ExtractionResult,
+    context: ErrorContextType = "optional_feature",
+) -> Any:
+    """Safely execute a feature extraction function with proper error handling.
+
+    Args:
+        feature_name: Name of the feature being executed
+        execution_func: Function to execute that may raise exceptions
+        default_value: Default value to return if execution fails
+        result: ExtractionResult to update with error information
+        context: The context for exception handling decisions
+
+    Returns:
+        Either the successful result or the default value
+    """
+    try:
+        return execution_func()
+    except Exception as e:
+        if should_exception_bubble_up(e, context):
+            raise
+
+        _add_processing_error(result, FeatureProcessingError(feature_name, e))
+        return default_value
+
+
+def _add_processing_error(result: ExtractionResult, error: FeatureProcessingError) -> None:
+    """Add a processing error to the result metadata in a type-safe way."""
+    if result.metadata is None:
+        result.metadata = {}
+
+    if "processing_errors" not in result.metadata:
+        result.metadata["processing_errors"] = []
+
+    errors_list = result.metadata["processing_errors"]
+    if isinstance(errors_list, list):
+        errors_list.append(error.to_dict())
+    else:
+        result.metadata["processing_errors"] = [error.to_dict()]
+
+
+def preserve_result_with_errors(
+    result: ExtractionResult,
+    errors: list[FeatureProcessingError],
+) -> ExtractionResult:
+    """Preserve a successful extraction result while adding error information.
+
+    This is used when core extraction succeeds but optional features fail.
+
+    Args:
+        result: The successful extraction result
+        errors: List of errors that occurred during optional processing
+
+    Returns:
+        The result with error information added to metadata
+    """
+    for error in errors:
+        _add_processing_error(result, error)
+
+    return result
+
+
+def create_error_result(
+    content: str,
+    mime_type: str,
+    errors: list[FeatureProcessingError],
+    **metadata_kwargs: Any,
+) -> ExtractionResult:
+    """Create an error result with proper type safety.
+
+    Args:
+        content: Error content to include
+        mime_type: MIME type of the result
+        errors: List of errors that occurred
+        **metadata_kwargs: Additional metadata to include
+
+    Returns:
+        An ExtractionResult with error information
+    """
+    metadata: Metadata = {
+        "error": f"Multiple processing errors occurred: {len(errors)} errors",
+        "error_context": {
+            "error_count": len(errors),
+            "errors": [error.to_dict() for error in errors],
+            **metadata_kwargs,
+        },
+        "processing_errors": [error.to_dict() for error in errors],
+    }
+
+    return ExtractionResult(
+        content=content,
+        chunks=[],
+        mime_type=mime_type,
+        metadata=metadata,
+        entities=[],
+        keywords=[],
+        detected_languages=[],
+        tables=[],
+        images=[],
+        image_ocr_results=[],
+    )
kreuzberg/_extractors/_base.py
CHANGED
@@ -230,13 +230,13 @@ class Extractor(ABC):
                 confidence_score=None,
                 processing_time=duration,
             )
-        except
+        except ValueError as e:  # pragma: no cover
             return ImageOCRResult(
                 image=target,
                 ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
                 skipped_reason=f"OCR failed: {type(e).__name__}: {e}",
             )
-        except
+        except TypeError as e:  # pragma: no cover
             return ImageOCRResult(
                 image=target,
                 ocr_result=ExtractionResult(content="", mime_type="text/plain", metadata={}),
kreuzberg/_extractors/_html.py
CHANGED
@@ -75,7 +75,7 @@ class HTMLExtractor(Extractor):
         soup = BeautifulSoup(html_content, "xml")
 
         for img in soup.find_all("img"):
-            src_val = img.get("src")
+            src_val = img.get("src")
             if isinstance(src_val, str) and src_val.startswith("data:image/"):
                 try:
                     header, data = src_val.split(",", 1)
@@ -105,7 +105,7 @@ class HTMLExtractor(Extractor):
                 except (OSError, ValueError) as e:  # pragma: no cover
                     logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
 
-            alt_val = img.get("alt")
+            alt_val = img.get("alt")
             desc = alt_val if isinstance(alt_val, str) else None
             images.append(
                 ExtractedImage(
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -6,7 +6,6 @@ import logging
 import os
 import tempfile
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from dataclasses import asdict
 from itertools import count
 from multiprocessing import cpu_count
 from pathlib import Path
@@ -27,14 +26,11 @@ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import (
-    EasyOCRConfig,
     ExtractedImage,
     ExtractionResult,
     ImageOCRResult,
     Metadata,
     OcrBackendType,
-    PaddleOCRConfig,
-    TesseractConfig,
 )
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
@@ -134,48 +130,47 @@ class PDFExtractor(Extractor):
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content_bytes = path.read_bytes()
 
+        result: ExtractionResult | None = None
+
         document: Document | None = None
         if self.config.extract_images or self.config.extract_tables:
             document = self._parse_with_password_attempts(content_bytes)
 
-
-
-
-
+        if not self.config.force_ocr:
+            try:
+                content = self._extract_pdf_searchable_text_sync(path)
+                if self._validate_extracted_text(content):
+                    result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+            except ParsingError:
+                pass
 
-        if
-
+        if not result and self.config.ocr_backend is not None:
+            result = self._extract_pdf_text_with_ocr_sync(path, self.config.ocr_backend)
+
+        if not result:
+            result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
+
+        metadata = self._extract_metadata_with_password_attempts_sync(content_bytes)
+        result.metadata = metadata
 
-        tables = []
         if self.config.extract_tables:
             # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
 
                 tables = extract_tables_sync(path)
+                result.tables = tables
             except ImportError:  # pragma: no cover
-                tables = []
-
-        if not self.config.force_ocr and self._validate_extracted_text(text):
-            text = self._extract_with_playa_sync(path, fallback_text=text)
-
-        text = normalize_spaces(text)
-
-        result = ExtractionResult(
-            content=text,
-            mime_type=PLAIN_TEXT_MIME_TYPE,
-            metadata={},
-            tables=list(tables),
-        )
+                result.tables = []
 
-
-
-
-
-
-
-
-
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata = result.metadata | {
+                "table_count": table_summary["table_count"],
+                "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                f"across {table_summary['pages_with_tables']} pages with "
+                f"{table_summary['total_rows']} total rows",
+            }
 
         if self.config.extract_images and document:
             images = self._extract_images_from_playa_sync(document)
@@ -405,7 +400,7 @@ class PDFExtractor(Extractor):
         except Exception as e:
             raise ParsingError(f"Failed to extract PDF text: {e}") from e
 
-    def
+    def _extract_pdf_text_with_ocr_sync(self, path: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
         temp_files: list[Path] = []
         try:
             with pdf_document_sync(path) as pdf:
@@ -443,7 +438,8 @@ class PDFExtractor(Extractor):
                     with pdf_resources_sync(bitmap, page):
                         pil_image.close()
 
-
+            content = self._process_pdf_images_with_ocr([str(p) for p in temp_files], ocr_backend)
+            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
 
         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -452,28 +448,11 @@ class PDFExtractor(Extractor):
             with contextlib.suppress(OSError):
                 p.unlink()
 
-    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        backend = get_ocr_backend(
+    def _process_pdf_images_with_ocr(self, image_paths: list[str], ocr_backend: OcrBackendType) -> str:
+        backend = get_ocr_backend(ocr_backend)
         paths = [Path(p) for p in image_paths]
 
-
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                results = backend.process_batch_sync(paths, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        results = backend.process_batch_sync(paths, **self.config.get_config_dict())
 
         return "\n\n".join(result.content for result in results)
 
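Note: the per-backend match block is gone; _process_pdf_images_with_ocr now forwards self.config.get_config_dict(), a helper added to kreuzberg/_types.py in this release whose body is not shown in this diff. Purely as an illustration of the shape such a helper would need, and assuming the real implementation may differ, something like:

from __future__ import annotations

from dataclasses import asdict
from typing import Any

from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

# Hypothetical stand-in for the unseen ExtractionConfig.get_config_dict().
_DEFAULT_OCR_CONFIGS = {
    "tesseract": TesseractConfig,
    "paddleocr": PaddleOCRConfig,
    "easyocr": EasyOCRConfig,
}

def get_config_dict(ocr_backend: str | None, ocr_config: Any) -> dict[str, Any]:
    """Return process_batch_sync kwargs for the selected backend, falling back to defaults."""
    if ocr_backend is None:
        return {}
    default_cls = _DEFAULT_OCR_CONFIGS[ocr_backend]
    config = ocr_config if isinstance(ocr_config, default_cls) else default_cls()
    return asdict(config)  # the removed match statement did exactly this per backend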
kreuzberg/_language_detection.py
CHANGED
@@ -31,5 +31,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
             langs = [result["lang"].lower() for result in results if result.get("lang")]
             return langs if langs else None
         return None
+    except (RuntimeError, OSError, MemoryError):
+        raise
     except Exception:  # noqa: BLE001
         return None