kreuzberg 3.17.0__py3-none-any.whl → 3.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,60 +1,37 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from functools import lru_cache
4
- from typing import TYPE_CHECKING, Any
5
4
 
6
5
  from kreuzberg._types import LanguageDetectionConfig
7
6
  from kreuzberg.exceptions import MissingDependencyError
8
7
 
9
- if TYPE_CHECKING:
10
- from fast_langdetect import LangDetectConfig as FastLangDetectConfig
11
-
12
- try:
13
- from fast_langdetect import LangDetectConfig as FastLangDetectConfig
14
- from fast_langdetect import detect, detect_multilingual
15
-
16
- HAS_FAST_LANGDETECT = True
17
- except ImportError: # pragma: no cover
18
- HAS_FAST_LANGDETECT = False
19
- detect = None
20
- detect_multilingual = None
21
- FastLangDetectConfig = None
22
-
23
8
  _CACHE_SIZE = 128
24
9
 
25
10
 
26
- def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
27
- if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
28
- return None
29
-
30
- kwargs: dict[str, Any] = {
31
- "allow_fallback": config.allow_fallback,
32
- }
33
- if config.cache_dir is not None:
34
- kwargs["cache_dir"] = config.cache_dir
35
-
36
- return FastLangDetectConfig(**kwargs)
37
-
38
-
39
11
  @lru_cache(maxsize=_CACHE_SIZE)
40
12
  def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
41
- if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
13
+ try:
14
+ from fast_langdetect import detect # noqa: PLC0415
15
+ except ImportError as e:
42
16
  raise MissingDependencyError.create_for_package(
43
- dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
44
- )
17
+ dependency_group="langdetect",
18
+ functionality="language detection",
19
+ package_name="fast-langdetect",
20
+ ) from e
45
21
 
46
22
  if config is None:
47
23
  config = LanguageDetectionConfig()
48
24
 
49
25
  try:
50
- if config.multilingual:
51
- results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
52
-
53
- return [result["lang"].lower() for result in results if result.get("lang")]
54
-
55
- result = detect(text, low_memory=config.low_memory)
56
- if result and result.get("lang"):
57
- return [result["lang"].lower()]
26
+ # detect always returns a list, use k parameter for multiple languages
27
+ k = config.top_k if config.multilingual else 1
28
+ # Use the model from config directly
29
+ model = config.model
30
+ results = detect(text, model=model, k=k)
31
+
32
+ if results:
33
+ langs = [result["lang"].lower() for result in results if result.get("lang")]
34
+ return langs if langs else None
58
35
  return None
59
36
  except Exception: # noqa: BLE001
60
37
  return None
kreuzberg/_types.py CHANGED
@@ -402,9 +402,12 @@ class ImageOCRConfig(ConfigDict):
402
402
 
403
403
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
404
404
  class LanguageDetectionConfig(ConfigDict):
405
- low_memory: bool = True
406
- """If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
407
- Defaults to True for better memory efficiency."""
405
+ model: Literal["lite", "full", "auto"] = "auto"
406
+ """Language detection model to use:
407
+ - 'lite': Smaller, faster model with good accuracy
408
+ - 'full': Larger model with highest accuracy
409
+ - 'auto': Automatically choose based on memory availability (default)
410
+ """
408
411
  top_k: int = 3
409
412
  """Maximum number of languages to return for multilingual detection."""
410
413
  multilingual: bool = False
@@ -412,8 +415,8 @@ class LanguageDetectionConfig(ConfigDict):
412
415
  If False, uses single language detection."""
413
416
  cache_dir: str | None = None
414
417
  """Custom directory for model cache. If None, uses system default."""
415
- allow_fallback: bool = True
416
- """If True, falls back to small model if large model fails."""
418
+ low_memory: bool = True
419
+ """Deprecated. Use 'model' parameter instead. If True, uses 'lite' model."""
417
420
 
418
421
 
419
422
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
@@ -983,8 +986,14 @@ class ExtractionConfig(ConfigDict):
983
986
  """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
984
987
  auto_detect_language: bool = False
985
988
  """Whether to automatically detect language and configure OCR accordingly."""
989
+ language_detection_model: Literal["lite", "full", "auto"] = "auto"
990
+ """Language detection model to use when auto_detect_language is True.
991
+ - 'lite': Smaller, faster model with good accuracy
992
+ - 'full': Larger model with highest accuracy
993
+ - 'auto': Automatically choose based on memory availability (default)
994
+ """
986
995
  language_detection_config: LanguageDetectionConfig | None = None
987
- """Configuration for language detection. If None, uses default settings."""
996
+ """Configuration for language detection. If None, uses default settings with language_detection_model."""
988
997
  spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
989
998
  """Configuration for spaCy entity extraction. If None, uses default settings."""
990
999
  auto_detect_document_type: bool = False
kreuzberg/extraction.py CHANGED
@@ -76,9 +76,16 @@ def _validate_and_post_process_helper(
76
76
  result.keywords = None
77
77
 
78
78
  if config.auto_detect_language:
79
+ # Use provided config or create one with the model from ExtractionConfig
80
+ lang_config = config.language_detection_config
81
+ if lang_config is None:
82
+ from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
83
+
84
+ lang_config = LanguageDetectionConfig(model=config.language_detection_model)
85
+
79
86
  result.detected_languages = detect_languages(
80
87
  result.content,
81
- config=config.language_detection_config,
88
+ config=lang_config,
82
89
  )
83
90
 
84
91
  if config.auto_detect_document_type:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.0
3
+ Version: 3.17.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -33,12 +33,12 @@ Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.13.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.0
36
+ Requires-Dist: mcp>=1.14.1
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
40
40
  Requires-Dist: polars>=1.33.1
41
- Requires-Dist: psutil>=7.0.0
41
+ Requires-Dist: psutil>=7.1.0
42
42
  Requires-Dist: pypdfium2==4.30.0
43
43
  Requires-Dist: python-calamine>=0.5.3
44
44
  Requires-Dist: python-pptx>=1.0.2
@@ -6,14 +6,14 @@ kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
6
  kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
7
  kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
8
8
  kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
9
- kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
9
+ kreuzberg/_language_detection.py,sha256=OwIWIddERPEz8krU_Aq0_KjRF6MHP-LpugH6Y6miwOc,1204
10
10
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
12
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
- kreuzberg/_types.py,sha256=uULpUfQzpt_AAr8epOvIl3cdB9TkNTFrxWQssnZg_IM,48655
13
+ kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
14
14
  kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
15
15
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
16
- kreuzberg/extraction.py,sha256=Z2rBVGs8oteXU1mynHCd9q1yKz9NNA5tQdWq35jP2EE,18743
16
+ kreuzberg/extraction.py,sha256=jwzWdomwrl-2z1UznLoURLyqD5r0U-rFABXSBV2B2wA,19063
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
121
121
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
122
122
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
123
123
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
124
- kreuzberg-3.17.0.dist-info/METADATA,sha256=4iVwQUo4FVNSwj8h6oEqNAT5B6zm-J-u5k3Jy3Pv3L0,12351
125
- kreuzberg-3.17.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
- kreuzberg-3.17.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
- kreuzberg-3.17.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
- kreuzberg-3.17.0.dist-info/RECORD,,
124
+ kreuzberg-3.17.1.dist-info/METADATA,sha256=ttfOl3XA6b-M2BMY7v1cfASGm_Qe91HPzfRcAf_-zU8,12351
125
+ kreuzberg-3.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
+ kreuzberg-3.17.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
+ kreuzberg-3.17.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
+ kreuzberg-3.17.1.dist-info/RECORD,,