kreuzberg 3.17.2__py3-none-any.whl → 3.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024 # 1GB
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ # Return default if negative
117
+ return size if size >= 0 else default_size
118
+ except ValueError:
119
+ return default_size
120
+
121
+
122
+ def _is_opentelemetry_enabled() -> bool:
123
+ """Check if OpenTelemetry should be enabled.
124
+
125
+ Returns:
126
+ True if OpenTelemetry should be enabled, False otherwise.
127
+
128
+ Environment Variables:
129
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
130
+ """
131
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
132
+
133
+
103
134
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
135
  error_type = type(exception).__name__
105
136
  error_message = str(exception)
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
242
273
  - Language detection (if enabled)
243
274
 
244
275
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
276
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
277
 
247
278
  Args:
248
279
  request: The HTTP request object
@@ -379,9 +410,18 @@ type_encoders = {
379
410
  Image.Image: _pil_image_encoder,
380
411
  }
381
412
 
413
+
414
+ def _get_plugins() -> list[Any]:
415
+ """Get configured plugins based on environment variables."""
416
+ plugins = []
417
+ if _is_opentelemetry_enabled():
418
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
419
+ return plugins
420
+
421
+
382
422
  app = Litestar(
383
423
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
424
+ plugins=_get_plugins(),
385
425
  logging_config=StructLoggingConfig(),
386
426
  openapi_config=openapi_config,
387
427
  exception_handlers={
@@ -389,5 +429,5 @@ app = Litestar(
389
429
  Exception: general_exception_handler,
390
430
  },
391
431
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
432
+ request_max_body_size=_get_max_upload_size(),
393
433
  )
@@ -2,17 +2,77 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import re
5
+ import shutil
6
+ import subprocess
5
7
  from functools import lru_cache
6
8
  from itertools import chain
7
9
  from typing import TYPE_CHECKING, Any
8
10
 
11
+ import anyio
12
+
9
13
  from kreuzberg._types import Entity, SpacyEntityExtractionConfig
10
- from kreuzberg.exceptions import MissingDependencyError
14
+ from kreuzberg._utils._sync import run_sync
15
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
11
16
 
12
17
  if TYPE_CHECKING:
13
18
  from collections.abc import Sequence
14
19
 
15
20
 
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
16
76
  def extract_entities(
17
77
  text: str,
18
78
  entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
@@ -44,13 +104,11 @@ def extract_entities(
44
104
  functionality="Entity Extraction",
45
105
  ) from e
46
106
 
47
- model_name = _select_spacy_model(languages, spacy_config)
107
+ model_name = select_spacy_model(languages, spacy_config)
48
108
  if not model_name:
49
109
  return entities
50
110
 
51
- nlp = _load_spacy_model(model_name, spacy_config)
52
- if not nlp:
53
- return entities
111
+ nlp = load_spacy_model(model_name, spacy_config)
54
112
 
55
113
  if len(text) > spacy_config.max_doc_length:
56
114
  text = text[: spacy_config.max_doc_length]
@@ -74,23 +132,87 @@ def extract_entities(
74
132
 
75
133
 
76
134
  @lru_cache(maxsize=32)
77
- def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
78
136
  try:
79
137
  import spacy # noqa: PLC0415
138
+ except ImportError:
139
+ return None
80
140
 
81
- if spacy_config.model_cache_dir:
82
- os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
141
+ if spacy_config.model_cache_dir:
142
+ os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
83
143
 
144
+ try:
84
145
  nlp = spacy.load(model_name)
146
+ except OSError:
147
+ # Try to download the model automatically
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ # First try spaCy's built-in download
151
+ try:
152
+ success = await install_spacy_model_with_spacy(model_name)
153
+ if success:
154
+ return True, None
155
+ except (ImportError, OSError, RuntimeError) as e:
156
+ spacy_error = str(e)
157
+ else:
158
+ spacy_error = "spaCy download failed"
159
+
160
+ # If spaCy download failed and uv is available, try uv as fallback
161
+ if is_uv_available():
162
+ try:
163
+ result = await install_spacy_model_with_uv(model_name)
164
+ return result.returncode == 0, result.stderr
165
+ except (OSError, subprocess.SubprocessError) as e:
166
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
167
+
168
+ return False, spacy_error
169
+
170
+ # Run the async installation in a sync context
171
+ try:
172
+ success, error_details = anyio.run(install_model)
173
+ except (OSError, RuntimeError) as e:
174
+ success, error_details = False, str(e)
175
+
176
+ if not success:
177
+ # Generate appropriate error message based on available tools
178
+ if is_uv_available():
179
+ model_url = get_spacy_model_url(model_name)
180
+ manual_install_cmd = f"uv pip install {model_url}"
181
+ else:
182
+ manual_install_cmd = f"python -m spacy download {model_name}"
183
+
184
+ error_msg = (
185
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
186
+ )
85
187
 
86
- nlp.max_length = spacy_config.max_doc_length
188
+ if error_details:
189
+ error_msg += f"\nError details: {error_details}"
87
190
 
88
- return nlp
89
- except (OSError, ImportError):
90
- return None
191
+ raise KreuzbergError(
192
+ error_msg,
193
+ context={
194
+ "model": model_name,
195
+ "manual_install_cmd": manual_install_cmd,
196
+ "error_details": error_details,
197
+ "uv_available": is_uv_available(),
198
+ },
199
+ ) from None
200
+
201
+ try:
202
+ nlp = spacy.load(model_name)
203
+ except OSError as e:
204
+ raise KreuzbergError(
205
+ f"Failed to load spaCy model '{model_name}' even after successful download. "
206
+ f"Please verify your spaCy installation and try reinstalling the model.",
207
+ context={"model": model_name, "error": str(e)},
208
+ ) from e
209
+
210
+ nlp.max_length = spacy_config.max_doc_length
211
+
212
+ return nlp
91
213
 
92
214
 
93
- def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
215
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
94
216
  if not languages:
95
217
  return spacy_config.get_model_for_language("en")
96
218
 
@@ -23,9 +23,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
23
23
  config = LanguageDetectionConfig()
24
24
 
25
25
  try:
26
- # detect always returns a list, use k parameter for multiple languages
27
26
  k = config.top_k if config.multilingual else 1
28
- # Use the model from config directly
29
27
  model = config.model
30
28
  results = detect(text, model=model, k=k)
31
29
 
kreuzberg/extraction.py CHANGED
@@ -76,7 +76,6 @@ def _validate_and_post_process_helper(
76
76
  result.keywords = None
77
77
 
78
78
  if config.auto_detect_language:
79
- # Use provided config or create one with the model from ExtractionConfig
80
79
  lang_config = config.language_detection_config
81
80
  if lang_config is None:
82
81
  from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.2
3
+ Version: 3.18.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.10.0
31
+ Requires-Dist: anyio>=4.11.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.14.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.16.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.1
36
+ Requires-Dist: mcp>=1.15.0
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
@@ -4,20 +4,20 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
4
  kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
5
5
  kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
6
  kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
- kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
7
+ kreuzberg/_entity_extraction.py,sha256=zbwgvS_2M4JibmVVnclkmie0nmZQtyHtT_ucdbQc6nU,7837
8
8
  kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
9
- kreuzberg/_language_detection.py,sha256=OwIWIddERPEz8krU_Aq0_KjRF6MHP-LpugH6Y6miwOc,1204
9
+ kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
10
10
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
12
  kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
13
  kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
14
14
  kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
15
15
  kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
16
- kreuzberg/extraction.py,sha256=jwzWdomwrl-2z1UznLoURLyqD5r0U-rFABXSBV2B2wA,19063
16
+ kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
20
- kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
20
+ kreuzberg/_api/main.py,sha256=5LiqgyeHJy0GLLa-ehB0bq8ftEUYfM1Pt6f0j_a0dso,15190
21
21
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  kreuzberg/_extractors/_base.py,sha256=4MRBXdLsgdtdrTuupWb2IT9YpRSnNPpWWviS2mfeOXg,9961
23
23
  kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
121
121
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
122
122
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
123
123
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
124
- kreuzberg-3.17.2.dist-info/METADATA,sha256=JlkYHBdCVl74XDARQuuvrhEncD2CaWBkodwO_S8ddH8,12351
125
- kreuzberg-3.17.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
- kreuzberg-3.17.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
- kreuzberg-3.17.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
- kreuzberg-3.17.2.dist-info/RECORD,,
124
+ kreuzberg-3.18.0.dist-info/METADATA,sha256=Z54em4GwMd18BmlIWmq1AHtCdFStstMV5RAXaB4x3_0,12351
125
+ kreuzberg-3.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
+ kreuzberg-3.18.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
+ kreuzberg-3.18.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
+ kreuzberg-3.18.0.dist-info/RECORD,,