kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar

@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None

+
 from anyio import Path as AsyncPath

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync

@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }

+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)

@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content =
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)

-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}

+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]

-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))

             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):

         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key

             if isinstance(value, str) and value.strip():
-
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

                 key_lower = key.lower()
-
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value

             elif isinstance(value, (int, float, bool)):
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

             elif isinstance(value, dict):
-
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))

             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))

             elif value is not None:
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")

         return text_parts

@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"

             if isinstance(item, str) and item.strip():
-
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")

             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))

             elif item is not None:
-
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")

         return text_parts
kreuzberg/_gmft.py
CHANGED
@@ -99,7 +99,7 @@ async def extract_tables(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,
@@ -215,7 +215,7 @@ def extract_tables_sync(
             "size": stat.st_size,
             "mtime": stat.st_mtime,
         }
-    except OSError:
+    except OSError:  # pragma: no cover
         file_info = {
             "path": str(path),
             "size": 0,
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
     from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
     from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)

     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
kreuzberg/_mcp/server.py
CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (

 mcp = FastMCP("Kreuzberg Text Extraction")

-# Security and performance limits
 MAX_BATCH_SIZE = 100


@@ -40,13 +39,12 @@ def _validate_file_path(file_path: str) -> Path:
     """
     try:
         path = Path(file_path).resolve()
-    except (OSError, ValueError) as e:
+    except (OSError, ValueError) as e:  # pragma: no cover
         raise ValidationError(
             f"Invalid file path: {file_path}",
             context={"file_path": file_path, "error": str(e)},
         ) from e

-    # Check for path traversal attempts
     if ".." in file_path and not file_path.startswith("/"):
         raise ValidationError(
             "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
     try:
         return _validate_file_path(file_path)
     except ValidationError as e:
-        # Add context about which file in the batch failed
         e.context = e.context or {}
         e.context["batch_index"] = index
         e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
             context={"context": context_info},
         )

-    # Check for whitespace-only content
     if not content_base64.strip():
         raise ValidationError(
             "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()

-    # Extract Tesseract-specific parameters from kwargs first
     tesseract_lang = kwargs.pop("tesseract_lang", None)
     tesseract_psm = kwargs.pop("tesseract_psm", None)
     tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
     config_dict = config_dict | kwargs

-    # Handle Tesseract OCR configuration
     ocr_backend = config_dict.get("ocr_backend")
     if ocr_backend == "tesseract" and (
         tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
         tesseract_config_dict["enable_table_detection"] = True

     if tesseract_config_dict:
-        # Merge with existing tesseract config if present
         existing_ocr_config = config_dict.get("ocr_config")
         if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
-            # Convert existing config to dict, merge, and recreate
             existing_dict = existing_ocr_config.to_dict()
             merged_dict = existing_dict | tesseract_config_dict
             config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate batch size
     if len(file_paths) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document(  # noqa: PLR0913
             context={"file_paths": file_paths},
         )

-    # Validate all file paths for security
     validated_paths = []
     for i, file_path in enumerate(file_paths):
         validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate input
     if not content_items:
         raise ValidationError("content_items cannot be empty", context={"content_items": content_items})

@@ -355,7 +343,6 @@ def batch_extract_bytes(  # noqa: PLR0913
             "content_items must be a list", context={"content_items_type": type(content_items).__name__}
         )

-    # Validate batch size
     if len(content_items) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes(  # noqa: PLR0913
         enable_table_detection=enable_table_detection,
     )

-    # Convert list of dicts to list of tuples (bytes, mime_type)
     contents = []
     for i, item in enumerate(content_items):
-        # Validate item structure
         if not isinstance(item, dict):
             raise ValidationError(
                 f"Item at index {i} must be a dictionary",
                 context={"item_index": i, "item_type": type(item).__name__, "item": item},
             )

-        # Check for required keys
         if "content_base64" not in item:
             raise ValidationError(
                 f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes(  # noqa: PLR0913
         content_base64 = item["content_base64"]
         mime_type = item["mime_type"]

-        # Validate base64 content
         try:
             content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
         except ValidationError as e:
-            # Add batch-specific context
             e.context = e.context or {}
             e.context["item_index"] = i
             e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
     result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:

 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())

@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:

 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         extract_entities=True,
kreuzberg/_mime_types.py
CHANGED
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,37 @@ except ImportError: # pragma: no cover

 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    if easyocr is not None:
+        return easyocr, torch
+
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None

-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415

+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:  # pragma: no cover
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
-
-
-
-        np: Any = None
-        torch: Any = None
+        return easyocr, torch
+    except ImportError:  # pragma: no cover
+        return None, None


 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +157,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None

     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:  # pragma: no cover
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)

         cache_kwargs = None
@@ -292,7 +312,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):

     @classmethod
     def _is_gpu_available(cls) -> bool:
-        if
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())

@@ -301,13 +321,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return

-
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )

-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")

@@ -318,7 +339,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):

         try:
             cls._reader = await run_sync(
-
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +403,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages

     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:  # pragma: no cover
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)

         cache_kwargs = None
@@ -453,13 +479,14 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return

-
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )

-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")

@@ -469,7 +496,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")

         try:
-            cls._reader =
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,