kreuzberg 3.4.2__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff compares publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +6 -1
- kreuzberg/_entity_extraction.py +239 -0
- kreuzberg/_extractors/_image.py +21 -1
- kreuzberg/_extractors/_pdf.py +44 -14
- kreuzberg/_extractors/_spread_sheet.py +2 -2
- kreuzberg/_gmft.py +4 -4
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +2 -4
- kreuzberg/_multiprocessing/process_manager.py +2 -1
- kreuzberg/_multiprocessing/sync_easyocr.py +235 -0
- kreuzberg/_multiprocessing/sync_paddleocr.py +199 -0
- kreuzberg/_ocr/_easyocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +7 -3
- kreuzberg/_types.py +46 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_process_pool.py +2 -2
- kreuzberg/_utils/_sync.py +1 -5
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/extraction.py +39 -12
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/METADATA +12 -4
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/RECORD +24 -20
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.4.2.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,13 +1,15 @@
 from importlib.metadata import version
 
+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -24,17 +26,20 @@ __version__ = version("kreuzberg")
 
 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",
kreuzberg/_entity_extraction.py
ADDED
@@ -0,0 +1,239 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg._types import Entity
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class SpacyEntityExtractionConfig:
+    """Configuration for spaCy-based entity extraction."""
+
+    model_cache_dir: str | Path | None = None
+    """Directory to cache spaCy models. If None, uses spaCy's default."""
+
+    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
+    """Mapping of language codes to spaCy model names.
+
+    If None, uses default mappings:
+    - en: en_core_web_sm
+    - de: de_core_news_sm
+    - fr: fr_core_news_sm
+    - es: es_core_news_sm
+    - pt: pt_core_news_sm
+    - it: it_core_news_sm
+    - nl: nl_core_news_sm
+    - zh: zh_core_web_sm
+    - ja: ja_core_news_sm
+    """
+
+    fallback_to_multilingual: bool = True
+    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
+
+    max_doc_length: int = 1000000
+    """Maximum document length for spaCy processing."""
+
+    batch_size: int = 1000
+    """Batch size for processing multiple texts."""
+
+    def __post_init__(self) -> None:
+        if self.language_models is None:
+            object.__setattr__(self, "language_models", self._get_default_language_models())
+
+        if isinstance(self.language_models, dict):
+            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
+
+    @staticmethod
+    def _get_default_language_models() -> dict[str, str]:
+        """Get default language model mappings based on available spaCy models."""
+        return {
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+            "es": "es_core_news_sm",
+            "pt": "pt_core_news_sm",
+            "it": "it_core_news_sm",
+            "nl": "nl_core_news_sm",
+            "zh": "zh_core_web_sm",
+            "ja": "ja_core_news_sm",
+            "ko": "ko_core_news_sm",
+            "ru": "ru_core_news_sm",
+            "pl": "pl_core_news_sm",
+            "ro": "ro_core_news_sm",
+            "el": "el_core_news_sm",
+            "da": "da_core_news_sm",
+            "fi": "fi_core_news_sm",
+            "nb": "nb_core_news_sm",
+            "sv": "sv_core_news_sm",
+            "ca": "ca_core_news_sm",
+            "hr": "hr_core_news_sm",
+            "lt": "lt_core_news_sm",
+            "mk": "mk_core_news_sm",
+            "sl": "sl_core_news_sm",
+            "uk": "uk_core_news_sm",
+        }
+
+    def get_model_for_language(self, language_code: str) -> str | None:
+        """Get the appropriate spaCy model for a language code."""
+        if not self.language_models:
+            return None
+
+        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
+
+        if language_code in models_dict:
+            return models_dict[language_code]
+
+        base_lang = language_code.split("-")[0].lower()
+        if base_lang in models_dict:
+            return models_dict[base_lang]
+
+        return None
+
+    def get_fallback_model(self) -> str | None:
+        """Get fallback multilingual model if enabled."""
+        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
+
+
+def extract_entities(
+    text: str,
+    entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
+    custom_patterns: frozenset[tuple[str, str]] | None = None,
+    languages: list[str] | None = None,
+    spacy_config: SpacyEntityExtractionConfig | None = None,
+) -> list[Entity]:
+    """Extract entities from text using custom regex patterns and/or a NER model.
+
+    Args:
+        text: The input text to extract entities from.
+        entity_types: List of entity types to extract using the NER model.
+        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
+        languages: List of detected languages to choose appropriate spaCy models.
+        spacy_config: Configuration for spaCy entity extraction.
+
+    Returns:
+        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
+
+    Raises:
+        MissingDependencyError: If `spacy` is not installed.
+    """
+    entities: list[Entity] = []
+    if custom_patterns:
+        custom_patterns_dict = dict(custom_patterns)
+        for ent_type, pattern in custom_patterns_dict.items():
+            entities.extend(
+                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                for match in re.finditer(pattern, text)
+            )
+
+    if spacy_config is None:
+        spacy_config = SpacyEntityExtractionConfig()
+
+    try:
+        import spacy  # noqa: F401
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="spacy",
+            dependency_group="entity-extraction",
+            functionality="Entity Extraction",
+        ) from e
+
+    model_name = _select_spacy_model(languages, spacy_config)
+    if not model_name:
+        return entities
+
+    nlp = _load_spacy_model(model_name, spacy_config)
+    if not nlp:
+        return entities
+
+    if len(text) > spacy_config.max_doc_length:
+        text = text[: spacy_config.max_doc_length]
+
+    doc = nlp(text)
+
+    entity_type_mapping = {etype.upper() for etype in entity_types}
+
+    entities.extend(
+        Entity(
+            type=ent.label_,
+            text=ent.text,
+            start=ent.start_char,
+            end=ent.end_char,
+        )
+        for ent in doc.ents
+        if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
+    )
+
+    return entities
+
+
+@lru_cache(maxsize=32)
+def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
+    """Load a spaCy model with caching."""
+    try:
+        import spacy
+
+        if spacy_config.model_cache_dir:
+            import os
+
+            os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
+
+        nlp = spacy.load(model_name)
+
+        nlp.max_length = spacy_config.max_doc_length
+
+        return nlp
+    except (OSError, ImportError):
+        return None
+
+
+def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
+    """Select the best spaCy model based on detected languages."""
+    if not languages:
+        return spacy_config.get_model_for_language("en")
+
+    for lang in languages:
+        model_name = spacy_config.get_model_for_language(lang)
+        if model_name:
+            return model_name
+
+    return spacy_config.get_fallback_model()
+
+
+def extract_keywords(
+    text: str,
+    keyword_count: int = 10,
+) -> list[tuple[str, float]]:
+    """Extract keywords from text using the KeyBERT model.
+
+    Args:
+        text: The input text to extract keywords from.
+        keyword_count: Number of top keywords to return. Defaults to 10.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
+
+    Raises:
+        MissingDependencyError: If `keybert` is not installed.
+    """
+    try:
+        from keybert import KeyBERT
+
+        kw_model = KeyBERT()
+        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
+        return [(kw, float(score)) for kw, score in keywords]
+    except (RuntimeError, OSError, ValueError):
+        return []
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="keybert",
+            dependency_group="entity-extraction",
+            functionality="Keyword Extraction",
+        ) from e
kreuzberg/_extractors/_image.py
CHANGED
@@ -80,11 +80,11 @@ class ImageExtractor(Extractor):
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-        from kreuzberg._ocr._tesseract import TesseractConfig
         from kreuzberg._types import ExtractionResult
 
         if self.config.ocr_backend == "tesseract":
             from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
 
             if isinstance(self.config.ocr_config, TesseractConfig):
                 config = self.config.ocr_config
@@ -96,6 +96,26 @@ class ImageExtractor(Extractor):
                 return results[0]
             return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])
 
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+
+            return paddle_process(path, paddle_config)
+
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+
+            return easy_process(path, easy_config)
+
         raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
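The new branches are easiest to exercise through kreuzberg's public sync entry point; a sketch, assuming the paddleocr extra is installed and a hypothetical scan.png exists:

from kreuzberg import ExtractionConfig, PaddleOCRConfig, extract_file_sync

# With ocr_backend="paddleocr", sync image extraction now dispatches to the
# new sync_paddleocr path instead of raising NotImplementedError.
config = ExtractionConfig(ocr_backend="paddleocr", ocr_config=PaddleOCRConfig())
result = extract_file_sync("scan.png", config=config)
print(result.content)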
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -299,8 +299,6 @@ class PDFExtractor(Extractor):
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
             images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -325,18 +323,7 @@
                 os.close(fd)
                 image_paths.append(temp_path)
 
-
-            from kreuzberg._ocr._tesseract import TesseractConfig
-
-            if isinstance(self.config.ocr_config, TesseractConfig):
-                config = self.config.ocr_config
-            else:
-                config = TesseractConfig()
-            results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-            text_parts = [r.content for r in results]
-            return "\n\n".join(text_parts)
-
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            return self._process_pdf_images_with_ocr(image_paths)
 
         finally:
             for _, temp_path in temp_files:
@@ -349,3 +336,46 @@ class PDFExtractor(Extractor):
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        if self.config.ocr_backend == "tesseract":
+            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
+            from kreuzberg._ocr._tesseract import TesseractConfig
+
+            tesseract_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = process_batch_images_sync_pure([str(p) for p in image_paths], tesseract_config)
+            text_parts = [r.content for r in results]
+            return "\n\n".join(text_parts)
+
+        if self.config.ocr_backend == "paddleocr":
+            from kreuzberg._multiprocessing.sync_paddleocr import process_image_sync_pure as paddle_process
+            from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+
+            text_parts = []
+            for image_path in image_paths:
+                result = paddle_process(Path(image_path), paddle_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+
+        if self.config.ocr_backend == "easyocr":
+            from kreuzberg._multiprocessing.sync_easyocr import process_image_sync_pure as easy_process
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+
+            text_parts = []
+            for image_path in image_paths:
+                result = easy_process(Path(image_path), easy_config)
+                text_parts.append(result.content)
+            return "\n\n".join(text_parts)
+
+        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -6,7 +6,7 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any
 
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
 
-CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 
 
 class SpreadSheetExtractor(Extractor):
kreuzberg/_gmft.py
CHANGED
@@ -210,7 +210,7 @@ async def extract_tables(  # noqa: PLR0915
     from gmft.formatters.tatr import TATRFormatConfig
     from gmft.pdf_bindings.pdfium import PyPDFium2Document
 
-    formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+    formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
         config=TATRFormatConfig(
             verbosity=config.verbosity,
             formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +226,7 @@ async def extract_tables(  # noqa: PLR0915
             force_large_table_assumption=config.force_large_table_assumption,
         )
     )
-    detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+    detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
         config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
     )
     doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +247,7 @@ async def extract_tables(  # noqa: PLR0915
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         await table_cache.aset(result, **cache_kwargs)
@@ -365,7 +365,7 @@ def extract_tables_sync(
                 text=data_frame.to_markdown(),
                 df=data_frame,
             )
-            for data_frame, cropped_table in zip(dataframes, cropped_tables)
+            for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
         ]
 
         table_cache.set(result, **cache_kwargs)
kreuzberg/_language_detection.py
ADDED
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+
+try:
+    from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+    from fast_langdetect import detect, detect_multilingual
+
+    HAS_FAST_LANGDETECT = True
+except ImportError:
+    HAS_FAST_LANGDETECT = False
+    detect = None
+    detect_multilingual = None
+    FastLangDetectConfig = None
+
+_CACHE_SIZE = 128
+
+
+@dataclass(frozen=True)
+class LanguageDetectionConfig:
+    """Configuration for language detection.
+
+    Attributes:
+        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+            Defaults to True for better memory efficiency.
+        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+        multilingual: If True, uses multilingual detection to handle mixed-language text.
+            If False, uses single language detection. Defaults to False.
+        cache_dir: Custom directory for model cache. If None, uses system default.
+        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+    """
+
+    low_memory: bool = True
+    top_k: int = 3
+    multilingual: bool = False
+    cache_dir: str | None = None
+    allow_fallback: bool = True
+
+
+def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+    """Create FastLangDetectConfig from our config."""
+    if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+        return None
+
+    kwargs: dict[str, Any] = {
+        "allow_fallback": config.allow_fallback,
+    }
+    if config.cache_dir is not None:
+        kwargs["cache_dir"] = config.cache_dir
+
+    return FastLangDetectConfig(**kwargs)
+
+
+@lru_cache(maxsize=_CACHE_SIZE)
+def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+    """Detect the most probable languages in the given text using fast-langdetect.
+
+    Args:
+        text: The text to analyze.
+        config: Configuration for language detection. If None, uses defaults.
+
+    Returns:
+        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+        or None if detection fails.
+
+    Raises:
+        MissingDependencyError: If fast-langdetect is not installed.
+    """
+    if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+        raise MissingDependencyError.create_for_package(
+            dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+        )
+
+    if config is None:
+        config = LanguageDetectionConfig()
+
+    try:
+        if config.multilingual:
+            results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+
+            return [result["lang"].lower() for result in results if result.get("lang")]
+
+        result = detect(text, low_memory=config.low_memory)
+        if result and result.get("lang"):
+            return [result["lang"].lower()]
+        return None
+    except Exception:  # noqa: BLE001
+        return None
kreuzberg/_multiprocessing/gmft_isolated.py
CHANGED
@@ -56,9 +56,7 @@ def _extract_tables_in_process(
             force_large_table_assumption=config.force_large_table_assumption,
         )
     )
-    detector = AutoTableDetector(  # type: ignore[no-untyped-call]
-        config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
-    )
+    detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
 
     doc = PyPDFium2Document(str(file_path))
     cropped_tables = []
@@ -73,7 +71,7 @@ def _extract_tables_in_process(
         dataframes.append(formatted_table.df())
 
     results = []
-    for data_frame, cropped_table in zip(dataframes, cropped_tables):
+    for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
         import io
 
         img_bytes = io.BytesIO()
kreuzberg/_multiprocessing/process_manager.py
CHANGED
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
-from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
 import psutil
@@ -12,6 +12,7 @@ from typing_extensions import Self
 
 if TYPE_CHECKING:
     import types
+    from collections.abc import Callable
 
 T = TypeVar("T")
 
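Callable moves from a runtime typing import to a type-check-only collections.abc import, the convention enforced by lint rules in ruff's TC/UP families. A generic sketch of the pattern, not kreuzberg's code:

from __future__ import annotations

from typing import TYPE_CHECKING, TypeVar

if TYPE_CHECKING:
    # Imported only while type checking; no runtime cost or import-cycle risk.
    from collections.abc import Callable

T = TypeVar("T")


def submit(fn: Callable[[], T]) -> T:
    # With `from __future__ import annotations`, the annotation is never
    # evaluated at runtime, so the guarded import suffices.
    return fn()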