kreuzberg 3.17.0__py3-none-any.whl → 3.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_language_detection.py +16 -39
- kreuzberg/_types.py +15 -6
- kreuzberg/extraction.py +8 -1
- {kreuzberg-3.17.0.dist-info → kreuzberg-3.17.1.dist-info}/METADATA +3 -3
- {kreuzberg-3.17.0.dist-info → kreuzberg-3.17.1.dist-info}/RECORD +8 -8
- {kreuzberg-3.17.0.dist-info → kreuzberg-3.17.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.0.dist-info → kreuzberg-3.17.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.0.dist-info → kreuzberg-3.17.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_language_detection.py
CHANGED
@@ -1,60 +1,37 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from functools import lru_cache
|
4
|
-
from typing import TYPE_CHECKING, Any
|
5
4
|
|
6
5
|
from kreuzberg._types import LanguageDetectionConfig
|
7
6
|
from kreuzberg.exceptions import MissingDependencyError
|
8
7
|
|
9
|
-
if TYPE_CHECKING:
|
10
|
-
from fast_langdetect import LangDetectConfig as FastLangDetectConfig
|
11
|
-
|
12
|
-
try:
|
13
|
-
from fast_langdetect import LangDetectConfig as FastLangDetectConfig
|
14
|
-
from fast_langdetect import detect, detect_multilingual
|
15
|
-
|
16
|
-
HAS_FAST_LANGDETECT = True
|
17
|
-
except ImportError: # pragma: no cover
|
18
|
-
HAS_FAST_LANGDETECT = False
|
19
|
-
detect = None
|
20
|
-
detect_multilingual = None
|
21
|
-
FastLangDetectConfig = None
|
22
|
-
|
23
8
|
_CACHE_SIZE = 128
|
24
9
|
|
25
10
|
|
26
|
-
def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
|
27
|
-
if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
|
28
|
-
return None
|
29
|
-
|
30
|
-
kwargs: dict[str, Any] = {
|
31
|
-
"allow_fallback": config.allow_fallback,
|
32
|
-
}
|
33
|
-
if config.cache_dir is not None:
|
34
|
-
kwargs["cache_dir"] = config.cache_dir
|
35
|
-
|
36
|
-
return FastLangDetectConfig(**kwargs)
|
37
|
-
|
38
|
-
|
39
11
|
@lru_cache(maxsize=_CACHE_SIZE)
|
40
12
|
def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
|
41
|
-
|
13
|
+
try:
|
14
|
+
from fast_langdetect import detect # noqa: PLC0415
|
15
|
+
except ImportError as e:
|
42
16
|
raise MissingDependencyError.create_for_package(
|
43
|
-
dependency_group="langdetect",
|
44
|
-
|
17
|
+
dependency_group="langdetect",
|
18
|
+
functionality="language detection",
|
19
|
+
package_name="fast-langdetect",
|
20
|
+
) from e
|
45
21
|
|
46
22
|
if config is None:
|
47
23
|
config = LanguageDetectionConfig()
|
48
24
|
|
49
25
|
try:
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
if
|
57
|
-
|
26
|
+
# detect always returns a list, use k parameter for multiple languages
|
27
|
+
k = config.top_k if config.multilingual else 1
|
28
|
+
# Use the model from config directly
|
29
|
+
model = config.model
|
30
|
+
results = detect(text, model=model, k=k)
|
31
|
+
|
32
|
+
if results:
|
33
|
+
langs = [result["lang"].lower() for result in results if result.get("lang")]
|
34
|
+
return langs if langs else None
|
58
35
|
return None
|
59
36
|
except Exception: # noqa: BLE001
|
60
37
|
return None
|
kreuzberg/_types.py
CHANGED
@@ -402,9 +402,12 @@ class ImageOCRConfig(ConfigDict):
|
|
402
402
|
|
403
403
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
404
404
|
class LanguageDetectionConfig(ConfigDict):
|
405
|
-
|
406
|
-
"""
|
407
|
-
|
405
|
+
model: Literal["lite", "full", "auto"] = "auto"
|
406
|
+
"""Language detection model to use:
|
407
|
+
- 'lite': Smaller, faster model with good accuracy
|
408
|
+
- 'full': Larger model with highest accuracy
|
409
|
+
- 'auto': Automatically choose based on memory availability (default)
|
410
|
+
"""
|
408
411
|
top_k: int = 3
|
409
412
|
"""Maximum number of languages to return for multilingual detection."""
|
410
413
|
multilingual: bool = False
|
@@ -412,8 +415,8 @@ class LanguageDetectionConfig(ConfigDict):
|
|
412
415
|
If False, uses single language detection."""
|
413
416
|
cache_dir: str | None = None
|
414
417
|
"""Custom directory for model cache. If None, uses system default."""
|
415
|
-
|
416
|
-
"""
|
418
|
+
low_memory: bool = True
|
419
|
+
"""Deprecated. Use 'model' parameter instead. If True, uses 'lite' model."""
|
417
420
|
|
418
421
|
|
419
422
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
@@ -983,8 +986,14 @@ class ExtractionConfig(ConfigDict):
|
|
983
986
|
"""Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
|
984
987
|
auto_detect_language: bool = False
|
985
988
|
"""Whether to automatically detect language and configure OCR accordingly."""
|
989
|
+
language_detection_model: Literal["lite", "full", "auto"] = "auto"
|
990
|
+
"""Language detection model to use when auto_detect_language is True.
|
991
|
+
- 'lite': Smaller, faster model with good accuracy
|
992
|
+
- 'full': Larger model with highest accuracy
|
993
|
+
- 'auto': Automatically choose based on memory availability (default)
|
994
|
+
"""
|
986
995
|
language_detection_config: LanguageDetectionConfig | None = None
|
987
|
-
"""Configuration for language detection. If None, uses default settings."""
|
996
|
+
"""Configuration for language detection. If None, uses default settings with language_detection_model."""
|
988
997
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
989
998
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
990
999
|
auto_detect_document_type: bool = False
|
kreuzberg/extraction.py
CHANGED
@@ -76,9 +76,16 @@ def _validate_and_post_process_helper(
|
|
76
76
|
result.keywords = None
|
77
77
|
|
78
78
|
if config.auto_detect_language:
|
79
|
+
# Use provided config or create one with the model from ExtractionConfig
|
80
|
+
lang_config = config.language_detection_config
|
81
|
+
if lang_config is None:
|
82
|
+
from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
|
83
|
+
|
84
|
+
lang_config = LanguageDetectionConfig(model=config.language_detection_model)
|
85
|
+
|
79
86
|
result.detected_languages = detect_languages(
|
80
87
|
result.content,
|
81
|
-
config=
|
88
|
+
config=lang_config,
|
82
89
|
)
|
83
90
|
|
84
91
|
if config.auto_detect_document_type:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.0
|
3
|
+
Version: 3.17.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -33,12 +33,12 @@ Requires-Dist: chardetng-py>=0.3.5
|
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.14.0
|
36
|
+
Requires-Dist: mcp>=1.14.1
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
38
38
|
Requires-Dist: numpy>=2.0.0
|
39
39
|
Requires-Dist: playa-pdf>=0.7.0
|
40
40
|
Requires-Dist: polars>=1.33.1
|
41
|
-
Requires-Dist: psutil>=7.0.0
|
41
|
+
Requires-Dist: psutil>=7.1.0
|
42
42
|
Requires-Dist: pypdfium2==4.30.0
|
43
43
|
Requires-Dist: python-calamine>=0.5.3
|
44
44
|
Requires-Dist: python-pptx>=1.0.2
|
@@ -6,14 +6,14 @@ kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
|
6
6
|
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
7
|
kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
|
8
8
|
kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
|
9
|
-
kreuzberg/_language_detection.py,sha256=
|
9
|
+
kreuzberg/_language_detection.py,sha256=OwIWIddERPEz8krU_Aq0_KjRF6MHP-LpugH6Y6miwOc,1204
|
10
10
|
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
11
11
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
12
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
-
kreuzberg/_types.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
|
14
14
|
kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
|
15
15
|
kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
|
16
|
-
kreuzberg/extraction.py,sha256=
|
16
|
+
kreuzberg/extraction.py,sha256=jwzWdomwrl-2z1UznLoURLyqD5r0U-rFABXSBV2B2wA,19063
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
121
121
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
122
122
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
123
123
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
124
|
-
kreuzberg-3.17.
|
125
|
-
kreuzberg-3.17.
|
126
|
-
kreuzberg-3.17.
|
127
|
-
kreuzberg-3.17.
|
128
|
-
kreuzberg-3.17.
|
124
|
+
kreuzberg-3.17.1.dist-info/METADATA,sha256=ttfOl3XA6b-M2BMY7v1cfASGm_Qe91HPzfRcAf_-zU8,12351
|
125
|
+
kreuzberg-3.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
+
kreuzberg-3.17.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
+
kreuzberg-3.17.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
+
kreuzberg-3.17.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|