kreuzberg 3.4.2__py3-none-any.whl → 3.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +2 -0
- kreuzberg/_extractors/_image.py +21 -1
- kreuzberg/_extractors/_pdf.py +44 -14
- kreuzberg/_extractors/_spread_sheet.py +2 -2
- kreuzberg/_gmft.py +4 -4
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +7 -3
- kreuzberg/_types.py +11 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_process_pool.py +2 -2
- kreuzberg/_utils/_sync.py +1 -5
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/extraction.py +10 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.5.0.dist-info}/METADATA +3 -1
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.5.0.dist-info}/RECORD +23 -20
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.5.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.5.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.5.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
from importlib.metadata import version
|
2
2
|
|
3
3
|
from kreuzberg._gmft import GMFTConfig
|
4
|
+
from kreuzberg._language_detection import LanguageDetectionConfig
|
4
5
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
5
6
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
6
7
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
@@ -29,6 +30,7 @@ __all__ = [
|
|
29
30
|
"ExtractorRegistry",
|
30
31
|
"GMFTConfig",
|
31
32
|
"KreuzbergError",
|
33
|
+
"LanguageDetectionConfig",
|
32
34
|
"Metadata",
|
33
35
|
"MissingDependencyError",
|
34
36
|
"OCRError",
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
|
|
80
80
|
if self.config.ocr_backend is None:
|
81
81
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
82
82
|
|
83
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
84
83
|
from kreuzberg._types import ExtractionResult
|
85
84
|
|
86
85
|
if self.config.ocr_backend == "tesseract":
|
87
86
|
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
87
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
88
88
|
|
89
89
|
if isinstance(self.config.ocr_config, TesseractConfig):
|
90
90
|
config = self.config.ocr_config
|
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
|
|
96
96
|
return results[0]
|
97
97
|
return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
|
98
98
|
|
99
|
+
if self.config.ocr_backend == "paddleocr":
|
100
|
+
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
101
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
102
|
+
|
103
|
+
paddle_config = (
|
104
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
105
|
+
)
|
106
|
+
|
107
|
+
return paddle_process(path, paddle_config)
|
108
|
+
|
109
|
+
if self.config.ocr_backend == "easyocr":
|
110
|
+
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
111
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
112
|
+
|
113
|
+
easy_config = (
|
114
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
115
|
+
)
|
116
|
+
|
117
|
+
return easy_process(path, easy_config)
|
118
|
+
|
99
119
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
100
120
|
|
101
121
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
|
|
299
299
|
"""Extract text from PDF using OCR (sync version)."""
|
300
300
|
pdf = None
|
301
301
|
try:
|
302
|
-
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
303
|
-
|
304
302
|
images = []
|
305
303
|
with pypdfium_file_lock(path):
|
306
304
|
pdf = pypdfium2.PdfDocument(str(path))
|
@@ -325,18 +323,7 @@ class PDFExtractor(Extractor):
|
|
325
323
|
os.close(fd)
|
326
324
|
image_paths.append(temp_path)
|
327
325
|
|
328
|
-
|
329
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
330
|
-
|
331
|
-
if isinstance(self.config.ocr_config, TesseractConfig):
|
332
|
-
config = self.config.ocr_config
|
333
|
-
else:
|
334
|
-
config = TesseractConfig()
|
335
|
-
results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
|
336
|
-
text_parts = [r.content for r in results]
|
337
|
-
return "\n\n".join(text_parts)
|
338
|
-
|
339
|
-
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
326
|
+
return self._process_pdf_images_with_ocr(image_paths)
|
340
327
|
|
341
328
|
finally:
|
342
329
|
for _, temp_path in temp_files:
|
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
|
|
349
336
|
if pdf:
|
350
337
|
with pypdfium_file_lock(path), contextlib.suppress(Exception):
|
351
338
|
pdf.close()
|
339
|
+
|
340
|
+
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
341
|
+
"""Process PDF images with the configured OCR backend."""
|
342
|
+
if self.config.ocr_backend == "tesseract":
|
343
|
+
from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
|
344
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
345
|
+
|
346
|
+
tesseract_config = (
|
347
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
348
|
+
)
|
349
|
+
results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
|
350
|
+
text_parts = [r.content for r in results]
|
351
|
+
return "\n\n".join(text_parts)
|
352
|
+
|
353
|
+
if self.config.ocr_backend == "paddleocr":
|
354
|
+
from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
|
355
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
356
|
+
|
357
|
+
paddle_config = (
|
358
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
359
|
+
)
|
360
|
+
|
361
|
+
text_parts = []
|
362
|
+
for image_path in image_paths:
|
363
|
+
result = paddle_process(Path(image_path), paddle_config)
|
364
|
+
text_parts.append(result.content)
|
365
|
+
return "\n\n".join(text_parts)
|
366
|
+
|
367
|
+
if self.config.ocr_backend == "easyocr":
|
368
|
+
from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
|
369
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
370
|
+
|
371
|
+
easy_config = (
|
372
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
373
|
+
)
|
374
|
+
|
375
|
+
text_parts = []
|
376
|
+
for image_path in image_paths:
|
377
|
+
result = easy_process(Path(image_path), easy_config)
|
378
|
+
text_parts.append(result.content)
|
379
|
+
return "\n\n".join(text_parts)
|
380
|
+
|
381
|
+
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -6,7 +6,7 @@ import sys
|
|
6
6
|
from datetime import date, datetime, time, timedelta
|
7
7
|
from io import StringIO
|
8
8
|
from pathlib import Path
|
9
|
-
from typing import Any
|
9
|
+
from typing import Any
|
10
10
|
|
11
11
|
from anyio import Path as AsyncPath
|
12
12
|
from python_calamine import CalamineWorkbook
|
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11): # pragma: no cover
|
|
23
23
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
24
24
|
|
25
25
|
|
26
|
-
CellValue =
|
26
|
+
CellValue = int | float | str | bool | time | date | datetime | timedelta
|
27
27
|
|
28
28
|
|
29
29
|
class SpreadSheetExtractor(Extractor):
|
kreuzberg/_gmft.py
CHANGED
@@ -210,7 +210,7 @@ async def extract_tables( # noqa: PLR0915
|
|
210
210
|
from gmft.formatters.tatr import TATRFormatConfig
|
211
211
|
from gmft.pdf_bindings.pdfium import PyPDFium2Document
|
212
212
|
|
213
|
-
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
|
213
|
+
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
|
214
214
|
config=TATRFormatConfig(
|
215
215
|
verbosity=config.verbosity,
|
216
216
|
formatter_base_threshold=config.formatter_base_threshold,
|
@@ -226,7 +226,7 @@ async def extract_tables( # noqa: PLR0915
|
|
226
226
|
force_large_table_assumption=config.force_large_table_assumption,
|
227
227
|
)
|
228
228
|
)
|
229
|
-
detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
|
229
|
+
detector: Any = AutoTableDetector( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
|
230
230
|
config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
|
231
231
|
)
|
232
232
|
doc = await run_sync(PyPDFium2Document, str(file_path))
|
@@ -247,7 +247,7 @@ async def extract_tables( # noqa: PLR0915
|
|
247
247
|
text=data_frame.to_markdown(),
|
248
248
|
df=data_frame,
|
249
249
|
)
|
250
|
-
for data_frame, cropped_table in zip(dataframes, cropped_tables)
|
250
|
+
for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
|
251
251
|
]
|
252
252
|
|
253
253
|
await table_cache.aset(result, **cache_kwargs)
|
@@ -365,7 +365,7 @@ def extract_tables_sync(
|
|
365
365
|
text=data_frame.to_markdown(),
|
366
366
|
df=data_frame,
|
367
367
|
)
|
368
|
-
for data_frame, cropped_table in zip(dataframes, cropped_tables)
|
368
|
+
for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
|
369
369
|
]
|
370
370
|
|
371
371
|
table_cache.set(result, **cache_kwargs)
|
kreuzberg/_language_detection.py
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from functools import lru_cache
|
5
|
+
from typing import TYPE_CHECKING, Any
|
6
|
+
|
7
|
+
from kreuzberg.exceptions import MissingDependencyError
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from fast_langdetect import LangDetectConfig as FastLangDetectConfig
|
11
|
+
|
12
|
+
try:
|
13
|
+
from fast_langdetect import LangDetectConfig as FastLangDetectConfig
|
14
|
+
from fast_langdetect import detect, detect_multilingual
|
15
|
+
|
16
|
+
HAS_FAST_LANGDETECT = True
|
17
|
+
except ImportError:
|
18
|
+
HAS_FAST_LANGDETECT = False
|
19
|
+
detect = None
|
20
|
+
detect_multilingual = None
|
21
|
+
FastLangDetectConfig = None
|
22
|
+
|
23
|
+
_CACHE_SIZE = 128
|
24
|
+
|
25
|
+
|
26
|
+
@dataclass(frozen=True)
|
27
|
+
class LanguageDetectionConfig:
|
28
|
+
"""Configuration for language detection.
|
29
|
+
|
30
|
+
Attributes:
|
31
|
+
low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
|
32
|
+
Defaults to True for better memory efficiency.
|
33
|
+
top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
|
34
|
+
multilingual: If True, uses multilingual detection to handle mixed-language text.
|
35
|
+
If False, uses single language detection. Defaults to False.
|
36
|
+
cache_dir: Custom directory for model cache. If None, uses system default.
|
37
|
+
allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
|
38
|
+
"""
|
39
|
+
|
40
|
+
low_memory: bool = True
|
41
|
+
top_k: int = 3
|
42
|
+
multilingual: bool = False
|
43
|
+
cache_dir: str | None = None
|
44
|
+
allow_fallback: bool = True
|
45
|
+
|
46
|
+
|
47
|
+
def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
|
48
|
+
"""Create FastLangDetectConfig from our config."""
|
49
|
+
if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
|
50
|
+
return None
|
51
|
+
|
52
|
+
kwargs: dict[str, Any] = {
|
53
|
+
"allow_fallback": config.allow_fallback,
|
54
|
+
}
|
55
|
+
if config.cache_dir is not None:
|
56
|
+
kwargs["cache_dir"] = config.cache_dir
|
57
|
+
|
58
|
+
return FastLangDetectConfig(**kwargs)
|
59
|
+
|
60
|
+
|
61
|
+
@lru_cache(maxsize=_CACHE_SIZE)
|
62
|
+
def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
|
63
|
+
"""Detect the most probable languages in the given text using fast-langdetect.
|
64
|
+
|
65
|
+
Args:
|
66
|
+
text: The text to analyze.
|
67
|
+
config: Configuration for language detection. If None, uses defaults.
|
68
|
+
|
69
|
+
Returns:
|
70
|
+
A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
|
71
|
+
or None if detection fails.
|
72
|
+
|
73
|
+
Raises:
|
74
|
+
MissingDependencyError: If fast-langdetect is not installed.
|
75
|
+
"""
|
76
|
+
if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
|
77
|
+
raise MissingDependencyError.create_for_package(
|
78
|
+
dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
|
79
|
+
)
|
80
|
+
|
81
|
+
if config is None:
|
82
|
+
config = LanguageDetectionConfig()
|
83
|
+
|
84
|
+
try:
|
85
|
+
if config.multilingual:
|
86
|
+
results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
|
87
|
+
|
88
|
+
return [result["lang"].lower() for result in results if result.get("lang")]
|
89
|
+
|
90
|
+
result = detect(text, low_memory=config.low_memory)
|
91
|
+
if result and result.get("lang"):
|
92
|
+
return [result["lang"].lower()]
|
93
|
+
return None
|
94
|
+
except Exception: # noqa: BLE001
|
95
|
+
return None
|
kreuzberg/_multiprocessing/gmft_isolated.py
CHANGED
@@ -56,9 +56,7 @@ def _extract_tables_in_process(
|
|
56
56
|
force_large_table_assumption=config.force_large_table_assumption,
|
57
57
|
)
|
58
58
|
)
|
59
|
-
detector = AutoTableDetector( # type: ignore[no-untyped-call]
|
60
|
-
config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
|
61
|
-
)
|
59
|
+
detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)) # type: ignore[no-untyped-call]
|
62
60
|
|
63
61
|
doc = PyPDFium2Document(str(file_path))
|
64
62
|
cropped_tables = []
|
@@ -73,7 +71,7 @@ def _extract_tables_in_process(
|
|
73
71
|
dataframes.append(formatted_table.df())
|
74
72
|
|
75
73
|
results = []
|
76
|
-
for data_frame, cropped_table in zip(dataframes, cropped_tables):
|
74
|
+
for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
|
77
75
|
import io
|
78
76
|
|
79
77
|
img_bytes = io.BytesIO()
|
kreuzberg/_multiprocessing/process_manager.py
CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
4
4
|
|
5
5
|
import multiprocessing as mp
|
6
6
|
from concurrent.futures import ProcessPoolExecutor
|
7
|
-
from typing import TYPE_CHECKING, Any,
|
7
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
8
8
|
|
9
9
|
import anyio
|
10
10
|
import psutil
|
@@ -12,6 +12,7 @@ from typing_extensions import Self
|
|
12
12
|
|
13
13
|
if TYPE_CHECKING:
|
14
14
|
import types
|
15
|
+
from collections.abc import Callable
|
15
16
|
|
16
17
|
T = TypeVar("T")
|
17
18
|
|
kreuzberg/_multiprocessing/sync_easyocr.py
ADDED
@@ -0,0 +1,235 @@
|
|
1
|
+
"""Pure synchronous EasyOCR without any async overhead."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import tempfile
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Any
|
8
|
+
|
9
|
+
from PIL import Image
|
10
|
+
|
11
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
13
|
+
from kreuzberg._types import ExtractionResult
|
14
|
+
from kreuzberg._utils._string import normalize_spaces
|
15
|
+
from kreuzberg.exceptions import MissingDependencyError, OCRError
|
16
|
+
|
17
|
+
|
18
|
+
def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
|
19
|
+
"""Get an EasyOCR Reader instance with the given configuration."""
|
20
|
+
try:
|
21
|
+
import easyocr
|
22
|
+
except ImportError as e:
|
23
|
+
raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
|
24
|
+
|
25
|
+
gpu = False
|
26
|
+
if hasattr(config, "device"):
|
27
|
+
if config.device and config.device.lower() != "cpu":
|
28
|
+
gpu = True
|
29
|
+
elif hasattr(config, "use_gpu"):
|
30
|
+
gpu = config.use_gpu
|
31
|
+
|
32
|
+
language = config.language if hasattr(config, "language") else "en"
|
33
|
+
if isinstance(language, str):
|
34
|
+
lang_list = [lang.strip().lower() for lang in language.split(",")]
|
35
|
+
else:
|
36
|
+
lang_list = [lang.lower() for lang in language]
|
37
|
+
|
38
|
+
kwargs = {
|
39
|
+
"lang_list": lang_list,
|
40
|
+
"gpu": gpu,
|
41
|
+
"model_storage_directory": getattr(config, "model_storage_directory", None),
|
42
|
+
"user_network_directory": getattr(config, "user_network_directory", None),
|
43
|
+
"recog_network": getattr(config, "recog_network", None),
|
44
|
+
"detector": getattr(config, "detector", None),
|
45
|
+
"recognizer": getattr(config, "recognizer", None),
|
46
|
+
"verbose": False,
|
47
|
+
"quantize": getattr(config, "quantize", None),
|
48
|
+
"cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
|
49
|
+
}
|
50
|
+
|
51
|
+
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
52
|
+
|
53
|
+
return easyocr.Reader(**kwargs)
|
54
|
+
|
55
|
+
|
56
|
+
def process_image_sync_pure(
|
57
|
+
image_path: str | Path,
|
58
|
+
config: EasyOCRConfig | None = None,
|
59
|
+
) -> ExtractionResult:
|
60
|
+
"""Process an image with EasyOCR using pure sync implementation.
|
61
|
+
|
62
|
+
This bypasses all async overhead and calls EasyOCR directly.
|
63
|
+
|
64
|
+
Args:
|
65
|
+
image_path: Path to the image file.
|
66
|
+
config: EasyOCR configuration.
|
67
|
+
|
68
|
+
Returns:
|
69
|
+
Extraction result.
|
70
|
+
"""
|
71
|
+
cfg = config or EasyOCRConfig()
|
72
|
+
|
73
|
+
try:
|
74
|
+
reader = _get_easyocr_instance(cfg)
|
75
|
+
|
76
|
+
readtext_kwargs = {
|
77
|
+
"decoder": cfg.decoder,
|
78
|
+
"beamWidth": cfg.beam_width,
|
79
|
+
"batch_size": getattr(cfg, "batch_size", 1),
|
80
|
+
"workers": getattr(cfg, "workers", 0),
|
81
|
+
"allowlist": getattr(cfg, "allowlist", None),
|
82
|
+
"blocklist": getattr(cfg, "blocklist", None),
|
83
|
+
"detail": getattr(cfg, "detail", 1),
|
84
|
+
"rotation_info": cfg.rotation_info,
|
85
|
+
"paragraph": getattr(cfg, "paragraph", False),
|
86
|
+
"min_size": cfg.min_size,
|
87
|
+
"text_threshold": cfg.text_threshold,
|
88
|
+
"low_text": cfg.low_text,
|
89
|
+
"link_threshold": cfg.link_threshold,
|
90
|
+
"canvas_size": cfg.canvas_size,
|
91
|
+
"mag_ratio": cfg.mag_ratio,
|
92
|
+
"slope_ths": cfg.slope_ths,
|
93
|
+
"ycenter_ths": cfg.ycenter_ths,
|
94
|
+
"height_ths": cfg.height_ths,
|
95
|
+
"width_ths": cfg.width_ths,
|
96
|
+
"add_margin": cfg.add_margin,
|
97
|
+
"x_ths": cfg.x_ths,
|
98
|
+
"y_ths": cfg.y_ths,
|
99
|
+
}
|
100
|
+
|
101
|
+
readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
|
102
|
+
|
103
|
+
results = reader.readtext(str(image_path), **readtext_kwargs)
|
104
|
+
|
105
|
+
if not results:
|
106
|
+
return ExtractionResult(
|
107
|
+
content="",
|
108
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
109
|
+
metadata={},
|
110
|
+
chunks=[],
|
111
|
+
)
|
112
|
+
|
113
|
+
texts = []
|
114
|
+
confidences = []
|
115
|
+
|
116
|
+
detail_value = getattr(cfg, "detail", 1)
|
117
|
+
if detail_value:
|
118
|
+
for result in results:
|
119
|
+
min_result_length = 2
|
120
|
+
max_confidence_index = 2
|
121
|
+
if len(result) >= min_result_length:
|
122
|
+
_bbox, text = result[0], result[1]
|
123
|
+
confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
|
124
|
+
texts.append(text)
|
125
|
+
confidences.append(confidence)
|
126
|
+
else:
|
127
|
+
texts = results
|
128
|
+
confidences = [1.0] * len(texts)
|
129
|
+
|
130
|
+
content = "\n".join(texts)
|
131
|
+
content = normalize_spaces(content)
|
132
|
+
|
133
|
+
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
134
|
+
|
135
|
+
metadata = {"confidence": avg_confidence} if confidences else {}
|
136
|
+
|
137
|
+
return ExtractionResult(
|
138
|
+
content=content,
|
139
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
140
|
+
metadata=metadata, # type: ignore[arg-type]
|
141
|
+
chunks=[],
|
142
|
+
)
|
143
|
+
|
144
|
+
except Exception as e:
|
145
|
+
raise OCRError(f"EasyOCR processing failed: {e}") from e
|
146
|
+
|
147
|
+
|
148
|
+
def process_image_bytes_sync_pure(
|
149
|
+
image_bytes: bytes,
|
150
|
+
config: EasyOCRConfig | None = None,
|
151
|
+
) -> ExtractionResult:
|
152
|
+
"""Process image bytes with EasyOCR using pure sync implementation.
|
153
|
+
|
154
|
+
Args:
|
155
|
+
image_bytes: Image data as bytes.
|
156
|
+
config: EasyOCR configuration.
|
157
|
+
|
158
|
+
Returns:
|
159
|
+
Extraction result.
|
160
|
+
"""
|
161
|
+
import io
|
162
|
+
|
163
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
|
164
|
+
with Image.open(io.BytesIO(image_bytes)) as image:
|
165
|
+
image.save(tmp_image.name, format="PNG")
|
166
|
+
image_path = tmp_image.name
|
167
|
+
|
168
|
+
try:
|
169
|
+
return process_image_sync_pure(image_path, config)
|
170
|
+
finally:
|
171
|
+
image_file = Path(image_path)
|
172
|
+
if image_file.exists():
|
173
|
+
image_file.unlink()
|
174
|
+
|
175
|
+
|
176
|
+
def process_batch_images_sync_pure(
|
177
|
+
image_paths: list[str | Path],
|
178
|
+
config: EasyOCRConfig | None = None,
|
179
|
+
) -> list[ExtractionResult]:
|
180
|
+
"""Process a batch of images sequentially with pure sync implementation.
|
181
|
+
|
182
|
+
Args:
|
183
|
+
image_paths: List of image file paths.
|
184
|
+
config: EasyOCR configuration.
|
185
|
+
|
186
|
+
Returns:
|
187
|
+
List of extraction results.
|
188
|
+
"""
|
189
|
+
results = []
|
190
|
+
for image_path in image_paths:
|
191
|
+
result = process_image_sync_pure(image_path, config)
|
192
|
+
results.append(result)
|
193
|
+
return results
|
194
|
+
|
195
|
+
|
196
|
+
def process_batch_images_threaded(
|
197
|
+
image_paths: list[str | Path],
|
198
|
+
config: EasyOCRConfig | None = None,
|
199
|
+
max_workers: int | None = None,
|
200
|
+
) -> list[ExtractionResult]:
|
201
|
+
"""Process a batch of images using threading.
|
202
|
+
|
203
|
+
Args:
|
204
|
+
image_paths: List of image file paths.
|
205
|
+
config: EasyOCR configuration.
|
206
|
+
max_workers: Maximum number of threads.
|
207
|
+
|
208
|
+
Returns:
|
209
|
+
List of extraction results in same order as input.
|
210
|
+
"""
|
211
|
+
import multiprocessing as mp
|
212
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
213
|
+
|
214
|
+
if max_workers is None:
|
215
|
+
max_workers = min(len(image_paths), mp.cpu_count())
|
216
|
+
|
217
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
218
|
+
future_to_index = {
|
219
|
+
executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
|
220
|
+
}
|
221
|
+
|
222
|
+
results: list[ExtractionResult] = [None] * len(image_paths) # type: ignore[list-item]
|
223
|
+
for future in as_completed(future_to_index):
|
224
|
+
index = future_to_index[future]
|
225
|
+
try:
|
226
|
+
results[index] = future.result()
|
227
|
+
except Exception as e: # noqa: BLE001
|
228
|
+
results[index] = ExtractionResult(
|
229
|
+
content=f"Error: {e}",
|
230
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
231
|
+
metadata={"error": str(e)}, # type: ignore[typeddict-unknown-key]
|
232
|
+
chunks=[],
|
233
|
+
)
|
234
|
+
|
235
|
+
return results
|
kreuzberg/_multiprocessing/sync_paddleocr.py
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
"""Pure synchronous PaddleOCR without any async overhead."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import tempfile
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import Any
|
8
|
+
|
9
|
+
from PIL import Image
|
10
|
+
|
11
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
13
|
+
from kreuzberg._types import ExtractionResult
|
14
|
+
from kreuzberg._utils._string import normalize_spaces
|
15
|
+
from kreuzberg.exceptions import MissingDependencyError, OCRError
|
16
|
+
|
17
|
+
|
18
|
+
def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
|
19
|
+
"""Get a PaddleOCR instance with the given configuration."""
|
20
|
+
try:
|
21
|
+
import paddleocr
|
22
|
+
except ImportError as e:
|
23
|
+
raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e
|
24
|
+
|
25
|
+
if hasattr(config, "device"):
|
26
|
+
if config.device and config.device.lower() != "cpu":
|
27
|
+
pass
|
28
|
+
elif hasattr(config, "use_gpu"):
|
29
|
+
pass
|
30
|
+
|
31
|
+
kwargs = {
|
32
|
+
"lang": config.language,
|
33
|
+
"use_textline_orientation": config.use_angle_cls,
|
34
|
+
}
|
35
|
+
|
36
|
+
if hasattr(config, "det_db_thresh"):
|
37
|
+
kwargs["text_det_thresh"] = config.det_db_thresh
|
38
|
+
if hasattr(config, "det_db_box_thresh"):
|
39
|
+
kwargs["text_det_box_thresh"] = config.det_db_box_thresh
|
40
|
+
if hasattr(config, "det_db_unclip_ratio"):
|
41
|
+
kwargs["text_det_unclip_ratio"] = config.det_db_unclip_ratio
|
42
|
+
if hasattr(config, "det_max_side_len"):
|
43
|
+
kwargs["text_det_limit_side_len"] = config.det_max_side_len
|
44
|
+
if hasattr(config, "drop_score"):
|
45
|
+
kwargs["text_rec_score_thresh"] = config.drop_score
|
46
|
+
|
47
|
+
return paddleocr.PaddleOCR(**kwargs)
|
48
|
+
|
49
|
+
|
50
|
+
def process_image_sync_pure(
|
51
|
+
image_path: str | Path,
|
52
|
+
config: PaddleOCRConfig | None = None,
|
53
|
+
) -> ExtractionResult:
|
54
|
+
"""Process an image with PaddleOCR using pure sync implementation.
|
55
|
+
|
56
|
+
This bypasses all async overhead and calls PaddleOCR directly.
|
57
|
+
|
58
|
+
Args:
|
59
|
+
image_path: Path to the image file.
|
60
|
+
config: PaddleOCR configuration.
|
61
|
+
|
62
|
+
Returns:
|
63
|
+
Extraction result.
|
64
|
+
"""
|
65
|
+
cfg = config or PaddleOCRConfig()
|
66
|
+
|
67
|
+
try:
|
68
|
+
ocr_instance = _get_paddleocr_instance(cfg)
|
69
|
+
|
70
|
+
results = ocr_instance.ocr(str(image_path))
|
71
|
+
|
72
|
+
if not results or not results[0]:
|
73
|
+
return ExtractionResult(
|
74
|
+
content="",
|
75
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
76
|
+
metadata={},
|
77
|
+
chunks=[],
|
78
|
+
)
|
79
|
+
|
80
|
+
ocr_result = results[0]
|
81
|
+
result_data = ocr_result.json["res"]
|
82
|
+
|
83
|
+
texts = result_data.get("rec_texts", [])
|
84
|
+
scores = result_data.get("rec_scores", [])
|
85
|
+
|
86
|
+
if not texts:
|
87
|
+
return ExtractionResult(
|
88
|
+
content="",
|
89
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
90
|
+
metadata={},
|
91
|
+
chunks=[],
|
92
|
+
)
|
93
|
+
|
94
|
+
content = "\n".join(texts)
|
95
|
+
content = normalize_spaces(content)
|
96
|
+
|
97
|
+
avg_confidence = sum(scores) / len(scores) if scores else 0.0
|
98
|
+
|
99
|
+
metadata = {"confidence": avg_confidence} if scores else {}
|
100
|
+
|
101
|
+
return ExtractionResult(
|
102
|
+
content=content,
|
103
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
104
|
+
metadata=metadata, # type: ignore[arg-type]
|
105
|
+
chunks=[],
|
106
|
+
)
|
107
|
+
|
108
|
+
except Exception as e:
|
109
|
+
raise OCRError(f"PaddleOCR processing failed: {e}") from e
|
110
|
+
|
111
|
+
|
112
|
+
def process_image_bytes_sync_pure(
|
113
|
+
image_bytes: bytes,
|
114
|
+
config: PaddleOCRConfig | None = None,
|
115
|
+
) -> ExtractionResult:
|
116
|
+
"""Process image bytes with PaddleOCR using pure sync implementation.
|
117
|
+
|
118
|
+
Args:
|
119
|
+
image_bytes: Image data as bytes.
|
120
|
+
config: PaddleOCR configuration.
|
121
|
+
|
122
|
+
Returns:
|
123
|
+
Extraction result.
|
124
|
+
"""
|
125
|
+
import io
|
126
|
+
|
127
|
+
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
|
128
|
+
with Image.open(io.BytesIO(image_bytes)) as image:
|
129
|
+
image.save(tmp_image.name, format="PNG")
|
130
|
+
image_path = tmp_image.name
|
131
|
+
|
132
|
+
try:
|
133
|
+
return process_image_sync_pure(image_path, config)
|
134
|
+
finally:
|
135
|
+
image_file = Path(image_path)
|
136
|
+
if image_file.exists():
|
137
|
+
image_file.unlink()
|
138
|
+
|
139
|
+
|
140
|
+
def process_batch_images_sync_pure(
|
141
|
+
image_paths: list[str | Path],
|
142
|
+
config: PaddleOCRConfig | None = None,
|
143
|
+
) -> list[ExtractionResult]:
|
144
|
+
"""Process a batch of images sequentially with pure sync implementation.
|
145
|
+
|
146
|
+
Args:
|
147
|
+
image_paths: List of image file paths.
|
148
|
+
config: PaddleOCR configuration.
|
149
|
+
|
150
|
+
Returns:
|
151
|
+
List of extraction results.
|
152
|
+
"""
|
153
|
+
results = []
|
154
|
+
for image_path in image_paths:
|
155
|
+
result = process_image_sync_pure(image_path, config)
|
156
|
+
results.append(result)
|
157
|
+
return results
|
158
|
+
|
159
|
+
|
160
|
+
def process_batch_images_threaded(
|
161
|
+
image_paths: list[str | Path],
|
162
|
+
config: PaddleOCRConfig | None = None,
|
163
|
+
max_workers: int | None = None,
|
164
|
+
) -> list[ExtractionResult]:
|
165
|
+
"""Process a batch of images using threading.
|
166
|
+
|
167
|
+
Args:
|
168
|
+
image_paths: List of image file paths.
|
169
|
+
config: PaddleOCR configuration.
|
170
|
+
max_workers: Maximum number of threads.
|
171
|
+
|
172
|
+
Returns:
|
173
|
+
List of extraction results in same order as input.
|
174
|
+
"""
|
175
|
+
import multiprocessing as mp
|
176
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
177
|
+
|
178
|
+
if max_workers is None:
|
179
|
+
max_workers = min(len(image_paths), mp.cpu_count())
|
180
|
+
|
181
|
+
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
182
|
+
future_to_index = {
|
183
|
+
executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
|
184
|
+
}
|
185
|
+
|
186
|
+
results: list[ExtractionResult] = [None] * len(image_paths) # type: ignore[list-item]
|
187
|
+
for future in as_completed(future_to_index):
|
188
|
+
index = future_to_index[future]
|
189
|
+
try:
|
190
|
+
results[index] = future.result()
|
191
|
+
except Exception as e: # noqa: BLE001
|
192
|
+
results[index] = ExtractionResult(
|
193
|
+
content=f"Error: {e}",
|
194
|
+
mime_type=PLAIN_TEXT_MIME_TYPE,
|
195
|
+
metadata={"error": str(e)}, # type: ignore[typeddict-unknown-key]
|
196
|
+
chunks=[],
|
197
|
+
)
|
198
|
+
|
199
|
+
return results
|
kreuzberg/_ocr/_easyocr.py
CHANGED
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -202,9 +202,11 @@ class TesseractConfig:
|
|
202
202
|
- 'deu' for German
|
203
203
|
- multiple languages combined with '+', e.g. 'eng+deu')
|
204
204
|
"""
|
205
|
-
language_model_ngram_on: bool =
|
206
|
-
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
207
|
-
|
205
|
+
language_model_ngram_on: bool = False
|
206
|
+
"""Enable or disable the use of n-gram-based language models for improved text recognition.
|
207
|
+
|
208
|
+
Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
|
209
|
+
psm: PSMMode = PSMMode.AUTO_ONLY
|
208
210
|
"""Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
|
209
211
|
tessedit_dont_blkrej_good_wds: bool = True
|
210
212
|
"""If True, prevents block rejection of words identified as good, improving text output quality."""
|
@@ -212,6 +214,8 @@ class TesseractConfig:
|
|
212
214
|
"""If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
|
213
215
|
tessedit_enable_dict_correction: bool = True
|
214
216
|
"""Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
|
217
|
+
tessedit_char_whitelist: str = ""
|
218
|
+
"""Whitelist of characters that Tesseract is allowed to recognize. Empty string means no restriction."""
|
215
219
|
tessedit_use_primary_params_model: bool = True
|
216
220
|
"""If True, forces the use of the primary parameters model for text recognition."""
|
217
221
|
textord_space_size_is_variable: bool = True
|
kreuzberg/_types.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import sys
|
4
|
-
from collections.abc import Awaitable
|
4
|
+
from collections.abc import Awaitable, Callable
|
5
5
|
from dataclasses import asdict, dataclass, field
|
6
|
-
from typing import TYPE_CHECKING, Any,
|
6
|
+
from typing import TYPE_CHECKING, Any, Literal, TypedDict
|
7
7
|
|
8
8
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
9
9
|
from kreuzberg.exceptions import ValidationError
|
@@ -18,6 +18,7 @@ if TYPE_CHECKING:
|
|
18
18
|
from PIL.Image import Image
|
19
19
|
|
20
20
|
from kreuzberg._gmft import GMFTConfig
|
21
|
+
from kreuzberg._language_detection import LanguageDetectionConfig
|
21
22
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
22
23
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
23
24
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
@@ -113,14 +114,16 @@ class ExtractionResult:
|
|
113
114
|
"""Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
|
114
115
|
chunks: list[str] = field(default_factory=list)
|
115
116
|
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
117
|
+
detected_languages: list[str] | None = None
|
118
|
+
"""Languages detected in the extracted content, if language detection is enabled."""
|
116
119
|
|
117
120
|
def to_dict(self) -> dict[str, Any]:
|
118
121
|
"""Converts the ExtractionResult to a dictionary."""
|
119
122
|
return asdict(self)
|
120
123
|
|
121
124
|
|
122
|
-
PostProcessingHook = Callable[[ExtractionResult],
|
123
|
-
ValidationHook = Callable[[ExtractionResult],
|
125
|
+
PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
|
126
|
+
ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
|
124
127
|
|
125
128
|
|
126
129
|
@dataclass(unsafe_hash=True)
|
@@ -157,6 +160,10 @@ class ExtractionConfig:
|
|
157
160
|
"""Post processing hooks to call after processing is done and before the final result is returned."""
|
158
161
|
validators: list[ValidationHook] | None = None
|
159
162
|
"""Validation hooks to call after processing is done and before post-processing and result return."""
|
163
|
+
auto_detect_language: bool = False
|
164
|
+
"""Whether to automatically detect language and configure OCR accordingly."""
|
165
|
+
language_detection_config: LanguageDetectionConfig | None = None
|
166
|
+
"""Configuration for language detection. If None, uses default settings."""
|
160
167
|
|
161
168
|
def __post_init__(self) -> None:
|
162
169
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
kreuzberg/_utils/_device.py
CHANGED
@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
|
|
153
153
|
try:
|
154
154
|
import torch # type: ignore[import-not-found,unused-ignore]
|
155
155
|
|
156
|
-
return torch.cuda.is_available()
|
156
|
+
return bool(torch.cuda.is_available())
|
157
157
|
except ImportError:
|
158
158
|
return False
|
159
159
|
|
@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
|
|
163
163
|
try:
|
164
164
|
import torch # type: ignore[import-not-found,unused-ignore]
|
165
165
|
|
166
|
-
return torch.backends.mps.is_available()
|
166
|
+
return bool(torch.backends.mps.is_available())
|
167
167
|
except ImportError:
|
168
168
|
return False
|
169
169
|
|
@@ -5,10 +5,10 @@ from __future__ import annotations
|
|
5
5
|
import multiprocessing as mp
|
6
6
|
from concurrent.futures import ProcessPoolExecutor
|
7
7
|
from contextlib import contextmanager
|
8
|
-
from typing import TYPE_CHECKING, Any,
|
8
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
9
9
|
|
10
10
|
if TYPE_CHECKING:
|
11
|
-
from collections.abc import Generator
|
11
|
+
from collections.abc import Callable, Generator
|
12
12
|
|
13
13
|
T = TypeVar("T")
|
14
14
|
|
kreuzberg/_utils/_sync.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import sys
|
4
3
|
from functools import partial
|
5
4
|
from inspect import isawaitable, iscoroutinefunction
|
6
5
|
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
@@ -12,10 +11,7 @@ from anyio.to_thread import run_sync as any_io_run_sync
|
|
12
11
|
if TYPE_CHECKING: # pragma: no cover
|
13
12
|
from collections.abc import Awaitable, Callable
|
14
13
|
|
15
|
-
|
16
|
-
from typing import ParamSpec
|
17
|
-
else: # pragma: no cover
|
18
|
-
from typing_extensions import ParamSpec
|
14
|
+
from typing import ParamSpec
|
19
15
|
|
20
16
|
T = TypeVar("T")
|
21
17
|
P = ParamSpec("P")
|
kreuzberg/_utils/_tmp.py
CHANGED
@@ -3,14 +3,14 @@ from __future__ import annotations
|
|
3
3
|
from contextlib import suppress
|
4
4
|
from pathlib import Path
|
5
5
|
from tempfile import NamedTemporaryFile
|
6
|
-
from typing import TYPE_CHECKING
|
6
|
+
from typing import TYPE_CHECKING
|
7
7
|
|
8
8
|
from anyio import Path as AsyncPath
|
9
9
|
|
10
10
|
from kreuzberg._utils._sync import run_sync
|
11
11
|
|
12
12
|
if TYPE_CHECKING: # pragma: no cover
|
13
|
-
from collections.abc import Coroutine
|
13
|
+
from collections.abc import Callable, Coroutine
|
14
14
|
|
15
15
|
|
16
16
|
async def create_temp_file(
|
kreuzberg/extraction.py
CHANGED
@@ -28,6 +28,11 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
|
|
28
28
|
for validator in config.validators or []:
|
29
29
|
await run_maybe_sync(validator, result)
|
30
30
|
|
31
|
+
if config.auto_detect_language and result.content:
|
32
|
+
from kreuzberg._language_detection import detect_languages
|
33
|
+
|
34
|
+
result.detected_languages = detect_languages(result.content, config.language_detection_config)
|
35
|
+
|
31
36
|
if config.chunk_content:
|
32
37
|
result.chunks = _handle_chunk_content(
|
33
38
|
mime_type=result.mime_type,
|
@@ -45,6 +50,11 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
|
|
45
50
|
for validator in config.validators or []:
|
46
51
|
run_sync_only(validator, result)
|
47
52
|
|
53
|
+
if config.auto_detect_language and result.content:
|
54
|
+
from kreuzberg._language_detection import detect_languages
|
55
|
+
|
56
|
+
result.detected_languages = detect_languages(result.content, config.language_detection_config)
|
57
|
+
|
48
58
|
if config.chunk_content:
|
49
59
|
result.chunks = _handle_chunk_content(
|
50
60
|
mime_type=result.mime_type,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.5.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
6
6
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
@@ -56,6 +56,8 @@ Provides-Extra: easyocr
|
|
56
56
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
57
57
|
Provides-Extra: gmft
|
58
58
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
59
|
+
Provides-Extra: langdetect
|
60
|
+
Requires-Dist: fast-langdetect>=0.2.0; extra == 'langdetect'
|
59
61
|
Provides-Extra: paddleocr
|
60
62
|
Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
|
61
63
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
|
@@ -1,50 +1,53 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=
|
1
|
+
kreuzberg/__init__.py,sha256=zZ_puArNdw0pQk93BV99fXCxzkHFKXB9kINn8-6-y24,1408
|
2
2
|
kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
|
3
3
|
kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
4
4
|
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
|
-
kreuzberg/_gmft.py,sha256=
|
6
|
+
kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
|
7
|
+
kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
|
7
8
|
kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
|
8
9
|
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
9
10
|
kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
|
10
|
-
kreuzberg/_types.py,sha256=
|
11
|
+
kreuzberg/_types.py,sha256=Tnl9yP56dn8ziBZk1sorNk1ZHZbJYMjSoqh7xxImFHs,8092
|
11
12
|
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
12
13
|
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
13
|
-
kreuzberg/extraction.py,sha256=
|
14
|
+
kreuzberg/extraction.py,sha256=Jz0f31Mm90mBkWwn0L3vn3z7-irdwNIzMHWByIj5d_I,17005
|
14
15
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
16
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
17
|
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
17
18
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
19
|
kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
|
19
20
|
kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
|
20
|
-
kreuzberg/_extractors/_image.py,sha256=
|
21
|
+
kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
|
21
22
|
kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
|
22
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
23
|
+
kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
|
23
24
|
kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
|
24
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
25
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
|
25
26
|
kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
|
26
|
-
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=
|
27
|
-
kreuzberg/_multiprocessing/process_manager.py,sha256=
|
27
|
+
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
|
28
|
+
kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
|
29
|
+
kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
|
30
|
+
kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
|
28
31
|
kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
|
29
32
|
kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
|
30
33
|
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
31
34
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
32
|
-
kreuzberg/_ocr/_easyocr.py,sha256=
|
35
|
+
kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
|
33
36
|
kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
|
34
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
37
|
+
kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
|
35
38
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
39
|
kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
|
37
|
-
kreuzberg/_utils/_device.py,sha256=
|
40
|
+
kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
|
38
41
|
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
39
42
|
kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
|
40
43
|
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
41
|
-
kreuzberg/_utils/_process_pool.py,sha256
|
44
|
+
kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
|
42
45
|
kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
|
43
46
|
kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
|
44
|
-
kreuzberg/_utils/_sync.py,sha256=
|
45
|
-
kreuzberg/_utils/_tmp.py,sha256=
|
46
|
-
kreuzberg-3.
|
47
|
-
kreuzberg-3.
|
48
|
-
kreuzberg-3.
|
49
|
-
kreuzberg-3.
|
50
|
-
kreuzberg-3.
|
47
|
+
kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
|
48
|
+
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
49
|
+
kreuzberg-3.5.0.dist-info/METADATA,sha256=jJXbwUuTXevmry2VVg1H8d6rEzebILJyN7q7kJ0M9mQ,8790
|
50
|
+
kreuzberg-3.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
51
|
+
kreuzberg-3.5.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
|
52
|
+
kreuzberg-3.5.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
53
|
+
kreuzberg-3.5.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|