kreuzberg 3.17.1__py3-none-any.whl → 3.17.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,12 +2,14 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import re
5
+ import subprocess
6
+ import sys
5
7
  from functools import lru_cache
6
8
  from itertools import chain
7
9
  from typing import TYPE_CHECKING, Any
8
10
 
9
11
  from kreuzberg._types import Entity, SpacyEntityExtractionConfig
10
- from kreuzberg.exceptions import MissingDependencyError
12
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
11
13
 
12
14
  if TYPE_CHECKING:
13
15
  from collections.abc import Sequence
@@ -49,8 +51,6 @@ def extract_entities(
49
51
  return entities
50
52
 
51
53
  nlp = _load_spacy_model(model_name, spacy_config)
52
- if not nlp:
53
- return entities
54
54
 
55
55
  if len(text) > spacy_config.max_doc_length:
56
56
  text = text[: spacy_config.max_doc_length]
@@ -77,17 +77,45 @@ def extract_entities(
77
77
  def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
78
78
  try:
79
79
  import spacy # noqa: PLC0415
80
+ except ImportError:
81
+ return None
80
82
 
81
- if spacy_config.model_cache_dir:
82
- os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
83
+ if spacy_config.model_cache_dir:
84
+ os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
83
85
 
86
+ try:
84
87
  nlp = spacy.load(model_name)
88
+ except OSError:
89
+ result = subprocess.run(
90
+ [sys.executable, "-m", "spacy", "download", model_name],
91
+ capture_output=True,
92
+ text=True,
93
+ check=False,
94
+ )
85
95
 
86
- nlp.max_length = spacy_config.max_doc_length
87
-
88
- return nlp
89
- except (OSError, ImportError):
90
- return None
96
+ if result.returncode != 0:
97
+ error_msg = (
98
+ f"Failed to download spaCy model '{model_name}'. "
99
+ f"Please install it manually with: python -m spacy download {model_name}"
100
+ )
101
+ if result.stderr:
102
+ error_msg += f"\nError details: {result.stderr}"
103
+ raise KreuzbergError(
104
+ error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
105
+ ) from None
106
+
107
+ try:
108
+ nlp = spacy.load(model_name)
109
+ except OSError as e:
110
+ raise KreuzbergError(
111
+ f"Failed to load spaCy model '{model_name}' even after successful download. "
112
+ f"Please verify your spaCy installation and try reinstalling the model.",
113
+ context={"model": model_name, "error": str(e)},
114
+ ) from e
115
+
116
+ nlp.max_length = spacy_config.max_doc_length
117
+
118
+ return nlp
91
119
 
92
120
 
93
121
  def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
@@ -23,9 +23,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
23
23
  config = LanguageDetectionConfig()
24
24
 
25
25
  try:
26
- # detect always returns a list, use k parameter for multiple languages
27
26
  k = config.top_k if config.multilingual else 1
28
- # Use the model from config directly
29
27
  model = config.model
30
28
  results = detect(text, model=model, k=k)
31
29
 
kreuzberg/extraction.py CHANGED
@@ -76,7 +76,6 @@ def _validate_and_post_process_helper(
76
76
  result.keywords = None
77
77
 
78
78
  if config.auto_detect_language:
79
- # Use provided config or create one with the model from ExtractionConfig
80
79
  lang_config = config.language_detection_config
81
80
  if lang_config is None:
82
81
  from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.1
3
+ Version: 3.17.3
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.13.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.14.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
36
  Requires-Dist: mcp>=1.14.1
37
37
  Requires-Dist: msgspec>=0.18.0
@@ -4,16 +4,16 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
4
  kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
5
5
  kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
6
  kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
- kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
7
+ kreuzberg/_entity_extraction.py,sha256=NuGcmIU-gBfzKmrhk6AcO6angCeUbML8REKPp7CE8sc,4710
8
8
  kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
9
- kreuzberg/_language_detection.py,sha256=OwIWIddERPEz8krU_Aq0_KjRF6MHP-LpugH6Y6miwOc,1204
9
+ kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
10
10
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
12
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
13
  kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
14
14
  kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
15
15
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
16
- kreuzberg/extraction.py,sha256=jwzWdomwrl-2z1UznLoURLyqD5r0U-rFABXSBV2B2wA,19063
16
+ kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
121
121
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
122
122
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
123
123
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
124
- kreuzberg-3.17.1.dist-info/METADATA,sha256=ttfOl3XA6b-M2BMY7v1cfASGm_Qe91HPzfRcAf_-zU8,12351
125
- kreuzberg-3.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
- kreuzberg-3.17.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
- kreuzberg-3.17.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
- kreuzberg-3.17.1.dist-info/RECORD,,
124
+ kreuzberg-3.17.3.dist-info/METADATA,sha256=XXgXKaiujoGAGsCn-skmPDij6vcQ9XqwbA1LBpX_Pvw,12351
125
+ kreuzberg-3.17.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
+ kreuzberg-3.17.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
+ kreuzberg-3.17.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
+ kreuzberg-3.17.3.dist-info/RECORD,,