kreuzberg 3.16.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +2 -0
- kreuzberg/_config.py +8 -9
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_html.py +1 -1
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +4 -4
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mcp/server.py +1 -1
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +4 -9
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +15 -25
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +35 -3
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +25 -9
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +4 -3
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.16.0.dist-info/RECORD +0 -61
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -19,6 +19,7 @@ from ._types import (
|
|
19
19
|
SpacyEntityExtractionConfig,
|
20
20
|
TableData,
|
21
21
|
TesseractConfig,
|
22
|
+
TokenReductionConfig,
|
22
23
|
)
|
23
24
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
24
25
|
from .extraction import (
|
@@ -57,6 +58,7 @@ __all__ = [
|
|
57
58
|
"SpacyEntityExtractionConfig",
|
58
59
|
"TableData",
|
59
60
|
"TesseractConfig",
|
61
|
+
"TokenReductionConfig",
|
60
62
|
"ValidationError",
|
61
63
|
"__version__",
|
62
64
|
"batch_extract_bytes",
|
kreuzberg/_config.py
CHANGED
@@ -69,12 +69,11 @@ def _build_ocr_config_from_cli(
|
|
69
69
|
try:
|
70
70
|
match ocr_backend:
|
71
71
|
case "tesseract":
|
72
|
-
# Handle PSM mode conversion from int to enum
|
73
72
|
processed_args = backend_args.copy()
|
74
73
|
if "psm" in processed_args and isinstance(processed_args["psm"], int):
|
75
74
|
try:
|
76
75
|
processed_args["psm"] = PSMMode(processed_args["psm"])
|
77
|
-
except ValueError as e:
|
76
|
+
except ValueError as e: # pragma: no cover
|
78
77
|
raise ValidationError(
|
79
78
|
f"Invalid PSM mode value: {processed_args['psm']}",
|
80
79
|
context={"psm_value": processed_args["psm"], "error": str(e)},
|
@@ -84,7 +83,7 @@ def _build_ocr_config_from_cli(
|
|
84
83
|
return EasyOCRConfig(**backend_args)
|
85
84
|
case "paddleocr":
|
86
85
|
return PaddleOCRConfig(**backend_args)
|
87
|
-
case _:
|
86
|
+
case _: # pragma: no cover
|
88
87
|
return None
|
89
88
|
except (TypeError, ValueError) as e:
|
90
89
|
raise ValidationError(
|
@@ -122,7 +121,7 @@ def _configure_gmft(
|
|
122
121
|
try:
|
123
122
|
if cli_args.get("gmft_config"):
|
124
123
|
gmft_config = GMFTConfig(**cli_args["gmft_config"])
|
125
|
-
elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
|
124
|
+
elif "gmft" in file_config and isinstance(file_config["gmft"], dict): # pragma: no cover
|
126
125
|
gmft_config = GMFTConfig(**file_config["gmft"])
|
127
126
|
except (TypeError, ValueError) as e:
|
128
127
|
raise ValidationError(
|
@@ -130,7 +129,7 @@ def _configure_gmft(
|
|
130
129
|
context={"gmft_config": cli_args.get("gmft_config") or file_config.get("gmft"), "error": str(e)},
|
131
130
|
) from e
|
132
131
|
|
133
|
-
if gmft_config:
|
132
|
+
if gmft_config: # pragma: no cover
|
134
133
|
config_dict["gmft_config"] = gmft_config
|
135
134
|
|
136
135
|
|
@@ -161,7 +160,7 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
161
160
|
try:
|
162
161
|
with config_path.open("rb") as f:
|
163
162
|
data = tomllib.load(f)
|
164
|
-
except FileNotFoundError as e:
|
163
|
+
except FileNotFoundError as e: # pragma: no cover
|
165
164
|
raise ValidationError(f"Configuration file not found: {config_path}") from e
|
166
165
|
except tomllib.TOMLDecodeError as e:
|
167
166
|
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
@@ -247,7 +246,7 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
|
|
247
246
|
|
248
247
|
try:
|
249
248
|
return ExtractionConfig(**extraction_config)
|
250
|
-
except (TypeError, ValueError) as e:
|
249
|
+
except (TypeError, ValueError) as e: # pragma: no cover
|
251
250
|
raise ValidationError(
|
252
251
|
f"Invalid extraction configuration: {e}",
|
253
252
|
context={"config": extraction_config, "error": str(e)},
|
@@ -271,7 +270,7 @@ def build_extraction_config(
|
|
271
270
|
|
272
271
|
try:
|
273
272
|
return ExtractionConfig(**config_dict)
|
274
|
-
except (TypeError, ValueError) as e:
|
273
|
+
except (TypeError, ValueError) as e: # pragma: no cover
|
275
274
|
raise ValidationError(
|
276
275
|
f"Invalid extraction configuration: {e}",
|
277
276
|
context={"config": config_dict, "error": str(e)},
|
@@ -293,7 +292,7 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
|
|
293
292
|
data = tomllib.load(f)
|
294
293
|
if "tool" in data and "kreuzberg" in data["tool"]:
|
295
294
|
return pyproject_toml
|
296
|
-
except OSError as e:
|
295
|
+
except OSError as e: # pragma: no cover
|
297
296
|
raise ValidationError(
|
298
297
|
f"Failed to read pyproject.toml: {e}",
|
299
298
|
context={"file": str(pyproject_toml), "error": str(e)},
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -96,7 +96,6 @@ class Extractor(ABC):
|
|
96
96
|
)
|
97
97
|
|
98
98
|
def _check_image_memory_limits(self, images: list[ExtractedImage]) -> list[ExtractedImage]:
|
99
|
-
"""Filter images based on memory safety limits."""
|
100
99
|
if not images:
|
101
100
|
return []
|
102
101
|
|
@@ -142,17 +141,6 @@ class Extractor(ABC):
|
|
142
141
|
_HASH_SAMPLE_SIZE = 512
|
143
142
|
|
144
143
|
def _compute_image_hash(self, img: ExtractedImage) -> int:
|
145
|
-
"""Compute hash for image deduplication using progressive hashing.
|
146
|
-
|
147
|
-
For small images (<1KB), hash the entire content.
|
148
|
-
For larger images, use size + first/last bytes for quick comparison.
|
149
|
-
|
150
|
-
Args:
|
151
|
-
img: Image to hash
|
152
|
-
|
153
|
-
Returns:
|
154
|
-
Hash value for deduplication
|
155
|
-
"""
|
156
144
|
data_len = len(img.data)
|
157
145
|
|
158
146
|
if data_len < self._SMALL_IMAGE_THRESHOLD:
|
@@ -189,14 +177,6 @@ class Extractor(ABC):
|
|
189
177
|
return unique_images
|
190
178
|
|
191
179
|
def _prepare_ocr_config(self, backend_name: str) -> dict[str, Any]:
|
192
|
-
"""Prepare OCR configuration for the specified backend.
|
193
|
-
|
194
|
-
Args:
|
195
|
-
backend_name: Name of the OCR backend
|
196
|
-
|
197
|
-
Returns:
|
198
|
-
Configuration dictionary for the backend
|
199
|
-
"""
|
200
180
|
default_config: TesseractConfig | EasyOCRConfig | PaddleOCRConfig
|
201
181
|
config_class: type[TesseractConfig | EasyOCRConfig | PaddleOCRConfig]
|
202
182
|
|
@@ -222,14 +202,6 @@ class Extractor(ABC):
|
|
222
202
|
return cfg
|
223
203
|
|
224
204
|
def _validate_image_for_ocr(self, img: ExtractedImage) -> str | None:
|
225
|
-
"""Validate if an image is suitable for OCR processing.
|
226
|
-
|
227
|
-
Args:
|
228
|
-
img: Image to validate
|
229
|
-
|
230
|
-
Returns:
|
231
|
-
Reason for skipping if invalid, None if valid
|
232
|
-
"""
|
233
205
|
fmt = img.format.lower()
|
234
206
|
if fmt not in self.config.image_ocr_formats:
|
235
207
|
return f"Unsupported format: {img.format}"
|
@@ -247,16 +219,6 @@ class Extractor(ABC):
|
|
247
219
|
return None
|
248
220
|
|
249
221
|
async def _ocr_single_image(self, target: ExtractedImage, backend: Any, cfg: dict[str, Any]) -> ImageOCRResult:
|
250
|
-
"""Process a single image with OCR.
|
251
|
-
|
252
|
-
Args:
|
253
|
-
target: Image to process
|
254
|
-
backend: OCR backend instance
|
255
|
-
cfg: Configuration for the backend
|
256
|
-
|
257
|
-
Returns:
|
258
|
-
OCR result for the image
|
259
|
-
"""
|
260
222
|
try:
|
261
223
|
start = time.time()
|
262
224
|
pil_img = Image.open(io.BytesIO(target.data))
|
@@ -284,14 +246,6 @@ class Extractor(ABC):
|
|
284
246
|
async def _process_images_with_ocr(
|
285
247
|
self, images: tuple[ExtractedImage, ...] | list[ExtractedImage]
|
286
248
|
) -> list[ImageOCRResult]:
|
287
|
-
"""Process multiple images with OCR.
|
288
|
-
|
289
|
-
Args:
|
290
|
-
images: Tuple or list of images to process
|
291
|
-
|
292
|
-
Returns:
|
293
|
-
List of OCR results
|
294
|
-
"""
|
295
249
|
if not images or not self.config.ocr_extracted_images:
|
296
250
|
return []
|
297
251
|
|
kreuzberg/_extractors/_html.py
CHANGED
@@ -102,7 +102,7 @@ class HTMLExtractor(Extractor):
|
|
102
102
|
try:
|
103
103
|
with Image.open(io.BytesIO(image_data)) as pil_img:
|
104
104
|
dimensions = pil_img.size
|
105
|
-
except (OSError, ValueError) as e:
|
105
|
+
except (OSError, ValueError) as e: # pragma: no cover
|
106
106
|
logger.debug("Could not determine image dimensions for %s: %s", format_name, e)
|
107
107
|
|
108
108
|
alt_val = img.get("alt") # type: ignore[union-attr]
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -253,7 +253,7 @@ class PandocExtractor(Extractor):
|
|
253
253
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
254
254
|
)
|
255
255
|
|
256
|
-
except FileNotFoundError as e:
|
256
|
+
except FileNotFoundError as e: # pragma: no cover
|
257
257
|
raise MissingDependencyError(
|
258
258
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
259
259
|
) from e
|
@@ -491,7 +491,7 @@ class PandocExtractor(Extractor):
|
|
491
491
|
"Please install it on your system and make sure its available in $PATH."
|
492
492
|
)
|
493
493
|
|
494
|
-
except (subprocess.SubprocessError, FileNotFoundError) as e:
|
494
|
+
except (subprocess.SubprocessError, FileNotFoundError) as e: # pragma: no cover
|
495
495
|
raise MissingDependencyError(
|
496
496
|
"Pandoc version 2 or above is a required system dependency. "
|
497
497
|
"Please install it on your system and make sure its available in $PATH."
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -153,7 +153,7 @@ class PDFExtractor(Extractor):
|
|
153
153
|
from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
|
154
154
|
|
155
155
|
tables = extract_tables_sync(path)
|
156
|
-
except ImportError:
|
156
|
+
except ImportError: # pragma: no cover
|
157
157
|
tables = []
|
158
158
|
|
159
159
|
if not self.config.force_ocr and self._validate_extracted_text(text):
|
@@ -500,7 +500,7 @@ class PDFExtractor(Extractor):
|
|
500
500
|
except (ValueError, TypeError, KeyError, RuntimeError) as e: # noqa: PERF203
|
501
501
|
last_exception = e
|
502
502
|
continue
|
503
|
-
except OSError as e:
|
503
|
+
except OSError as e: # pragma: no cover
|
504
504
|
raise ParsingError(f"Failed to parse PDF: {e}") from e
|
505
505
|
|
506
506
|
if last_exception:
|
@@ -520,7 +520,7 @@ class PDFExtractor(Extractor):
|
|
520
520
|
for password in passwords:
|
521
521
|
try:
|
522
522
|
return await extract_pdf_metadata(content, password=password)
|
523
|
-
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
|
523
|
+
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
|
524
524
|
last_exception = e
|
525
525
|
continue
|
526
526
|
|
@@ -538,7 +538,7 @@ class PDFExtractor(Extractor):
|
|
538
538
|
for password in passwords:
|
539
539
|
try:
|
540
540
|
return extract_pdf_metadata_sync(content, password=password)
|
541
|
-
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203
|
541
|
+
except (ParsingError, ValueError, TypeError, OSError) as e: # noqa: PERF203 # pragma: no cover
|
542
542
|
last_exception = e
|
543
543
|
continue
|
544
544
|
|
kreuzberg/_gmft.py
CHANGED
@@ -99,7 +99,7 @@ async def extract_tables(
|
|
99
99
|
"size": stat.st_size,
|
100
100
|
"mtime": stat.st_mtime,
|
101
101
|
}
|
102
|
-
except OSError:
|
102
|
+
except OSError: # pragma: no cover
|
103
103
|
file_info = {
|
104
104
|
"path": str(path),
|
105
105
|
"size": 0,
|
@@ -215,7 +215,7 @@ def extract_tables_sync(
|
|
215
215
|
"size": stat.st_size,
|
216
216
|
"mtime": stat.st_mtime,
|
217
217
|
}
|
218
|
-
except OSError:
|
218
|
+
except OSError: # pragma: no cover
|
219
219
|
file_info = {
|
220
220
|
"path": str(path),
|
221
221
|
"size": 0,
|
kreuzberg/_mcp/server.py
CHANGED
@@ -39,7 +39,7 @@ def _validate_file_path(file_path: str) -> Path:
|
|
39
39
|
"""
|
40
40
|
try:
|
41
41
|
path = Path(file_path).resolve()
|
42
|
-
except (OSError, ValueError) as e:
|
42
|
+
except (OSError, ValueError) as e: # pragma: no cover
|
43
43
|
raise ValidationError(
|
44
44
|
f"Invalid file path: {file_path}",
|
45
45
|
context={"file_path": file_path, "error": str(e)},
|
kreuzberg/_mime_types.py
CHANGED
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -44,11 +44,9 @@ HAS_EASYOCR: bool = False
|
|
44
44
|
def _import_easyocr() -> tuple[Any, Any]:
|
45
45
|
global HAS_EASYOCR, easyocr, torch
|
46
46
|
|
47
|
-
# If easyocr is already set (either real module or mock), return it
|
48
47
|
if easyocr is not None:
|
49
48
|
return easyocr, torch
|
50
49
|
|
51
|
-
# If explicitly disabled for testing
|
52
50
|
if not HAS_EASYOCR and easyocr is None:
|
53
51
|
return None, None
|
54
52
|
|
@@ -57,14 +55,14 @@ def _import_easyocr() -> tuple[Any, Any]:
|
|
57
55
|
|
58
56
|
try:
|
59
57
|
import torch as _torch # noqa: PLC0415
|
60
|
-
except ImportError:
|
58
|
+
except ImportError: # pragma: no cover
|
61
59
|
_torch = None # type: ignore[assignment]
|
62
60
|
|
63
61
|
easyocr = _easyocr
|
64
62
|
torch = _torch
|
65
63
|
HAS_EASYOCR = True
|
66
64
|
return easyocr, torch
|
67
|
-
except ImportError:
|
65
|
+
except ImportError: # pragma: no cover
|
68
66
|
return None, None
|
69
67
|
|
70
68
|
|
@@ -161,7 +159,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
161
159
|
async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
162
160
|
try:
|
163
161
|
import numpy as np # noqa: PLC0415
|
164
|
-
except ImportError as e:
|
162
|
+
except ImportError as e: # pragma: no cover
|
165
163
|
raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
|
166
164
|
|
167
165
|
use_cache = kwargs.pop("use_cache", True)
|
@@ -314,7 +312,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
314
312
|
|
315
313
|
@classmethod
|
316
314
|
def _is_gpu_available(cls) -> bool:
|
317
|
-
# Use the module-level torch variable directly to respect patches
|
318
315
|
if torch is None:
|
319
316
|
return False
|
320
317
|
return bool(torch.cuda.is_available())
|
@@ -324,7 +321,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
324
321
|
if cls._reader is not None:
|
325
322
|
return
|
326
323
|
|
327
|
-
# Validate language first before attempting import
|
328
324
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
329
325
|
|
330
326
|
easyocr_module, _ = _import_easyocr()
|
@@ -409,7 +405,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
409
405
|
def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
|
410
406
|
try:
|
411
407
|
import numpy as np # noqa: PLC0415
|
412
|
-
except ImportError as e:
|
408
|
+
except ImportError as e: # pragma: no cover
|
413
409
|
raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
|
414
410
|
|
415
411
|
use_cache = kwargs.pop("use_cache", True)
|
@@ -483,7 +479,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
483
479
|
if cls._reader is not None:
|
484
480
|
return
|
485
481
|
|
486
|
-
# Validate language first before attempting import
|
487
482
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
488
483
|
|
489
484
|
easyocr_module, _ = _import_easyocr()
|
kreuzberg/_ocr/_paddleocr.py
CHANGED
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -215,7 +215,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
215
215
|
|
216
216
|
try:
|
217
217
|
await run_sync(save_image.save, str(image_path), format="PNG")
|
218
|
-
except OSError as e:
|
218
|
+
except OSError as e: # pragma: no cover
|
219
219
|
if "cannot write mode" not in str(e):
|
220
220
|
raise
|
221
221
|
save_image = image.convert("RGB")
|
@@ -357,7 +357,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
357
357
|
try:
|
358
358
|
stat = path.stat()
|
359
359
|
file_info = {"path": str(path.resolve()), "size": stat.st_size, "mtime": stat.st_mtime}
|
360
|
-
except OSError:
|
360
|
+
except OSError: # pragma: no cover
|
361
361
|
file_info = {"path": str(path), "size": 0, "mtime": 0}
|
362
362
|
|
363
363
|
cache_kwargs = {
|
@@ -399,7 +399,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
399
399
|
await ocr_cache.aset(extraction_result, **final_cache_kwargs)
|
400
400
|
|
401
401
|
return extraction_result
|
402
|
-
except (RuntimeError, OSError) as e:
|
402
|
+
except (RuntimeError, OSError) as e: # pragma: no cover
|
403
403
|
raise OCRError(f"Failed to OCR using tesseract: {e}") from e
|
404
404
|
finally:
|
405
405
|
await unlink()
|
@@ -432,7 +432,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
432
432
|
|
433
433
|
try:
|
434
434
|
df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
|
435
|
-
except (ImportError, IndexError):
|
435
|
+
except (ImportError, IndexError): # pragma: no cover
|
436
436
|
df = None
|
437
437
|
|
438
438
|
table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
|
@@ -444,7 +444,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
444
444
|
tables=[table],
|
445
445
|
chunks=text_result.chunks,
|
446
446
|
)
|
447
|
-
except (ValueError, KeyError, ImportError):
|
447
|
+
except (ValueError, KeyError, ImportError): # pragma: no cover
|
448
448
|
pass
|
449
449
|
|
450
450
|
return text_result
|
@@ -507,12 +507,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
507
507
|
table_min_confidence: float = 30.0,
|
508
508
|
**_kwargs: Any,
|
509
509
|
) -> ExtractionResult:
|
510
|
-
config = html_to_markdown_config or HTMLToMarkdownConfig(
|
511
|
-
escape_asterisks=False,
|
512
|
-
escape_underscores=False,
|
513
|
-
extract_metadata=False,
|
514
|
-
strip=["meta", "title"],
|
515
|
-
)
|
510
|
+
config = html_to_markdown_config or HTMLToMarkdownConfig()
|
516
511
|
|
517
512
|
tables: list[TableData] = []
|
518
513
|
if enable_table_detection:
|
@@ -678,10 +673,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
678
673
|
|
679
674
|
html_config = HTMLToMarkdownConfig(
|
680
675
|
custom_converters=converters,
|
681
|
-
escape_asterisks=False,
|
682
|
-
escape_underscores=False,
|
683
|
-
extract_metadata=False,
|
684
|
-
strip=["meta", "title"],
|
685
676
|
)
|
686
677
|
|
687
678
|
config_dict = html_config.to_dict()
|
@@ -761,7 +752,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
761
752
|
|
762
753
|
try:
|
763
754
|
df = pl.DataFrame(table_data[1:], schema=table_data[0])
|
764
|
-
except (ImportError, IndexError):
|
755
|
+
except (ImportError, IndexError): # pragma: no cover
|
765
756
|
df = None
|
766
757
|
|
767
758
|
table: TableData = {"text": markdown, "df": df, "page_number": 1, "cropped_image": None} # type: ignore[typeddict-item]
|
@@ -773,7 +764,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
773
764
|
tables=[table],
|
774
765
|
chunks=text_result.chunks,
|
775
766
|
)
|
776
|
-
except (ValueError, KeyError, ImportError):
|
767
|
+
except (ValueError, KeyError, ImportError): # pragma: no cover
|
777
768
|
pass
|
778
769
|
|
779
770
|
return text_result
|
@@ -810,7 +801,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
810
801
|
|
811
802
|
try:
|
812
803
|
df = await run_sync(pl.DataFrame, table_data[1:], schema=table_data[0])
|
813
|
-
except (ImportError, IndexError):
|
804
|
+
except (ImportError, IndexError): # pragma: no cover
|
814
805
|
df = None
|
815
806
|
|
816
807
|
dummy_image = Image.new("RGB", (1, 1), "white")
|
@@ -823,7 +814,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
823
814
|
"metadata": {"bbox": (min_x, min_y, max_x, max_y)},
|
824
815
|
} # type: ignore[typeddict-unknown-key]
|
825
816
|
tables.append(table)
|
826
|
-
except (ValueError, KeyError, ImportError):
|
817
|
+
except (ValueError, KeyError, ImportError): # pragma: no cover
|
827
818
|
pass
|
828
819
|
|
829
820
|
return tables
|
@@ -879,7 +870,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
879
870
|
env = {"OMP_THREAD_LIMIT": "1"} if sys.platform.startswith("linux") else None
|
880
871
|
try:
|
881
872
|
result = await run_process(command, env=env)
|
882
|
-
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
873
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
|
883
874
|
raise MissingDependencyError(
|
884
875
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
885
876
|
) from e
|
@@ -890,7 +881,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
890
881
|
)
|
891
882
|
|
892
883
|
cls._version_checked = True
|
893
|
-
except FileNotFoundError as e:
|
884
|
+
except FileNotFoundError as e: # pragma: no cover
|
894
885
|
raise MissingDependencyError(
|
895
886
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
896
887
|
) from e
|
@@ -1087,7 +1078,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1087
1078
|
"size": stat.st_size,
|
1088
1079
|
"mtime": stat.st_mtime,
|
1089
1080
|
}
|
1090
|
-
except OSError:
|
1081
|
+
except OSError: # pragma: no cover
|
1091
1082
|
return {
|
1092
1083
|
"path": str(path),
|
1093
1084
|
"size": 0,
|
@@ -1095,7 +1086,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1095
1086
|
}
|
1096
1087
|
|
1097
1088
|
def _result_from_dict(self, result_dict: dict[str, Any]) -> ExtractionResult:
|
1098
|
-
"""Convert a worker result dict to ExtractionResult."""
|
1099
1089
|
if result_dict.get("success"):
|
1100
1090
|
return ExtractionResult(
|
1101
1091
|
content=str(result_dict.get("text", "")),
|
@@ -1189,7 +1179,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1189
1179
|
command = ["tesseract", "--version"]
|
1190
1180
|
try:
|
1191
1181
|
result = subprocess.run(command, capture_output=True, text=True, check=True, encoding="utf-8")
|
1192
|
-
except (subprocess.CalledProcessError, FileNotFoundError) as e:
|
1182
|
+
except (subprocess.CalledProcessError, FileNotFoundError) as e: # pragma: no cover
|
1193
1183
|
raise MissingDependencyError(
|
1194
1184
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
1195
1185
|
) from e
|
@@ -1200,7 +1190,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1200
1190
|
)
|
1201
1191
|
|
1202
1192
|
cls._version_checked = True
|
1203
|
-
except FileNotFoundError as e:
|
1193
|
+
except FileNotFoundError as e: # pragma: no cover
|
1204
1194
|
raise MissingDependencyError(
|
1205
1195
|
"Tesseract version 5 is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
1206
1196
|
) from e
|
kreuzberg/_token_reduction/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from kreuzberg._token_reduction._reducer import ReductionStats, get_reduction_stats, reduce_tokens
|
4
|
+
from kreuzberg._token_reduction._stopwords import StopwordsManager
|
5
|
+
|
6
|
+
__all__ = [
|
7
|
+
"ReductionStats",
|
8
|
+
"StopwordsManager",
|
9
|
+
"get_reduction_stats",
|
10
|
+
"reduce_tokens",
|
11
|
+
]
|