kreuzberg 3.17.3__py3-none-any.whl → 3.18.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import base64
4
4
  import io
5
+ import os
5
6
  import traceback
6
7
  from json import dumps
7
8
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
100
101
  )
101
102
 
102
103
 
104
+ def _get_max_upload_size() -> int:
105
+ """Get the maximum upload size from environment variable.
106
+
107
+ Returns:
108
+ Maximum upload size in bytes. Defaults to 1GB if not set.
109
+
110
+ Environment Variables:
111
+ KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
112
+ """
113
+ default_size = 1024 * 1024 * 1024 # 1GB
114
+ try:
115
+ size = int(os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE", default_size))
116
+ # Return default if negative
117
+ return size if size >= 0 else default_size
118
+ except ValueError:
119
+ return default_size
120
+
121
+
122
+ def _is_opentelemetry_enabled() -> bool:
123
+ """Check if OpenTelemetry should be enabled.
124
+
125
+ Returns:
126
+ True if OpenTelemetry should be enabled, False otherwise.
127
+
128
+ Environment Variables:
129
+ KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
130
+ """
131
+ return os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true").lower() in ("true", "1", "yes", "on")
132
+
133
+
103
134
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
104
135
  error_type = type(exception).__name__
105
136
  error_message = str(exception)
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
242
273
  - Language detection (if enabled)
243
274
 
244
275
  Supports various file formats including PDF, Office documents, images, and more.
245
- Maximum file size: 1GB per file.
276
+ Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
246
277
 
247
278
  Args:
248
279
  request: The HTTP request object
@@ -379,9 +410,18 @@ type_encoders = {
379
410
  Image.Image: _pil_image_encoder,
380
411
  }
381
412
 
413
+
414
+ def _get_plugins() -> list[Any]:
415
+ """Get configured plugins based on environment variables."""
416
+ plugins = []
417
+ if _is_opentelemetry_enabled():
418
+ plugins.append(OpenTelemetryPlugin(OpenTelemetryConfig()))
419
+ return plugins
420
+
421
+
382
422
  app = Litestar(
383
423
  route_handlers=[handle_files_upload, health_check, get_configuration],
384
- plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
424
+ plugins=_get_plugins(),
385
425
  logging_config=StructLoggingConfig(),
386
426
  openapi_config=openapi_config,
387
427
  exception_handlers={
@@ -389,5 +429,5 @@ app = Litestar(
389
429
  Exception: general_exception_handler,
390
430
  },
391
431
  type_encoders=type_encoders,
392
- request_max_body_size=1024 * 1024 * 1024,
432
+ request_max_body_size=_get_max_upload_size(),
393
433
  )
@@ -2,19 +2,77 @@ from __future__ import annotations
2
2
 
3
3
  import os
4
4
  import re
5
+ import shutil
5
6
  import subprocess
6
- import sys
7
7
  from functools import lru_cache
8
8
  from itertools import chain
9
9
  from typing import TYPE_CHECKING, Any
10
10
 
11
+ import anyio
12
+
11
13
  from kreuzberg._types import Entity, SpacyEntityExtractionConfig
14
+ from kreuzberg._utils._sync import run_sync
12
15
  from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
13
16
 
14
17
  if TYPE_CHECKING:
15
18
  from collections.abc import Sequence
16
19
 
17
20
 
21
+ def is_uv_available() -> bool:
22
+ """Check if uv is available in the environment."""
23
+ return shutil.which("uv") is not None
24
+
25
+
26
+ def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
27
+ """Get the direct download URL for a spaCy model.
28
+
29
+ Args:
30
+ model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
31
+ version: Model version to download (default: 3.8.0)
32
+
33
+ Returns:
34
+ Direct download URL for the model
35
+ """
36
+ return f"https://github.com/explosion/spacy-models/releases/download/{model_name}-{version}/{model_name}-{version}-py3-none-any.whl"
37
+
38
+
39
+ async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
40
+ """Install spaCy model using uv.
41
+
42
+ Args:
43
+ model_name: Name of the spaCy model to install
44
+
45
+ Returns:
46
+ Completed process result
47
+ """
48
+ model_url = get_spacy_model_url(model_name)
49
+ return await run_sync(
50
+ subprocess.run,
51
+ ["uv", "pip", "install", model_url],
52
+ capture_output=True,
53
+ text=True,
54
+ check=False,
55
+ )
56
+
57
+
58
+ async def install_spacy_model_with_spacy(model_name: str) -> bool:
59
+ """Install spaCy model using spacy download function.
60
+
61
+ Args:
62
+ model_name: Name of the spaCy model to install
63
+
64
+ Returns:
65
+ True if successful, False otherwise
66
+ """
67
+ try:
68
+ import spacy.cli.download # noqa: PLC0415
69
+
70
+ await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
71
+ return True
72
+ except (ImportError, OSError, RuntimeError):
73
+ return False
74
+
75
+
18
76
  def extract_entities(
19
77
  text: str,
20
78
  entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
@@ -46,11 +104,11 @@ def extract_entities(
46
104
  functionality="Entity Extraction",
47
105
  ) from e
48
106
 
49
- model_name = _select_spacy_model(languages, spacy_config)
107
+ model_name = select_spacy_model(languages, spacy_config)
50
108
  if not model_name:
51
109
  return entities
52
110
 
53
- nlp = _load_spacy_model(model_name, spacy_config)
111
+ nlp = load_spacy_model(model_name, spacy_config)
54
112
 
55
113
  if len(text) > spacy_config.max_doc_length:
56
114
  text = text[: spacy_config.max_doc_length]
@@ -74,7 +132,7 @@ def extract_entities(
74
132
 
75
133
 
76
134
  @lru_cache(maxsize=32)
77
- def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
135
+ def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
78
136
  try:
79
137
  import spacy # noqa: PLC0415
80
138
  except ImportError:
@@ -86,22 +144,58 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
86
144
  try:
87
145
  nlp = spacy.load(model_name)
88
146
  except OSError:
89
- result = subprocess.run(
90
- [sys.executable, "-m", "spacy", "download", model_name],
91
- capture_output=True,
92
- text=True,
93
- check=False,
94
- )
147
+ # Try to download the model automatically
148
+ async def install_model() -> tuple[bool, str | None]:
149
+ """Install model and return success status and error message."""
150
+ # First try spaCy's built-in download
151
+ try:
152
+ success = await install_spacy_model_with_spacy(model_name)
153
+ if success:
154
+ return True, None
155
+ except (ImportError, OSError, RuntimeError) as e:
156
+ spacy_error = str(e)
157
+ else:
158
+ spacy_error = "spaCy download failed"
159
+
160
+ # If spaCy download failed and uv is available, try uv as fallback
161
+ if is_uv_available():
162
+ try:
163
+ result = await install_spacy_model_with_uv(model_name)
164
+ return result.returncode == 0, result.stderr
165
+ except (OSError, subprocess.SubprocessError) as e:
166
+ return False, f"spaCy: {spacy_error}, uv: {e!s}"
167
+
168
+ return False, spacy_error
169
+
170
+ # Run the async installation in a sync context
171
+ try:
172
+ success, error_details = anyio.run(install_model)
173
+ except (OSError, RuntimeError) as e:
174
+ success, error_details = False, str(e)
175
+
176
+ if not success:
177
+ # Generate appropriate error message based on available tools
178
+ if is_uv_available():
179
+ model_url = get_spacy_model_url(model_name)
180
+ manual_install_cmd = f"uv pip install {model_url}"
181
+ else:
182
+ manual_install_cmd = f"python -m spacy download {model_name}"
95
183
 
96
- if result.returncode != 0:
97
184
  error_msg = (
98
- f"Failed to download spaCy model '{model_name}'. "
99
- f"Please install it manually with: python -m spacy download {model_name}"
185
+ f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
100
186
  )
101
- if result.stderr:
102
- error_msg += f"\nError details: {result.stderr}"
187
+
188
+ if error_details:
189
+ error_msg += f"\nError details: {error_details}"
190
+
103
191
  raise KreuzbergError(
104
- error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
192
+ error_msg,
193
+ context={
194
+ "model": model_name,
195
+ "manual_install_cmd": manual_install_cmd,
196
+ "error_details": error_details,
197
+ "uv_available": is_uv_available(),
198
+ },
105
199
  ) from None
106
200
 
107
201
  try:
@@ -118,7 +212,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
118
212
  return nlp
119
213
 
120
214
 
121
- def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
215
+ def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
122
216
  if not languages:
123
217
  return spacy_config.get_model_for_language("en")
124
218
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.17.3
3
+ Version: 3.18.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
- Requires-Dist: anyio>=4.10.0
31
+ Requires-Dist: anyio>=4.11.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.14.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.16.0
35
35
  Requires-Dist: langcodes>=3.5.0
36
- Requires-Dist: mcp>=1.14.1
36
+ Requires-Dist: mcp>=1.15.0
37
37
  Requires-Dist: msgspec>=0.18.0
38
38
  Requires-Dist: numpy>=2.0.0
39
39
  Requires-Dist: playa-pdf>=0.7.0
@@ -4,7 +4,7 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
4
4
  kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
5
5
  kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
6
6
  kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
7
- kreuzberg/_entity_extraction.py,sha256=NuGcmIU-gBfzKmrhk6AcO6angCeUbML8REKPp7CE8sc,4710
7
+ kreuzberg/_entity_extraction.py,sha256=zbwgvS_2M4JibmVVnclkmie0nmZQtyHtT_ucdbQc6nU,7837
8
8
  kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
9
9
  kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
10
10
  kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
@@ -17,7 +17,7 @@ kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
19
  kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
20
- kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
20
+ kreuzberg/_api/main.py,sha256=5LiqgyeHJy0GLLa-ehB0bq8ftEUYfM1Pt6f0j_a0dso,15190
21
21
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  kreuzberg/_extractors/_base.py,sha256=4MRBXdLsgdtdrTuupWb2IT9YpRSnNPpWWviS2mfeOXg,9961
23
23
  kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
121
121
  kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
122
122
  kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
123
123
  kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
124
- kreuzberg-3.17.3.dist-info/METADATA,sha256=XXgXKaiujoGAGsCn-skmPDij6vcQ9XqwbA1LBpX_Pvw,12351
125
- kreuzberg-3.17.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
- kreuzberg-3.17.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
- kreuzberg-3.17.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
- kreuzberg-3.17.3.dist-info/RECORD,,
124
+ kreuzberg-3.18.0.dist-info/METADATA,sha256=Z54em4GwMd18BmlIWmq1AHtCdFStstMV5RAXaB4x3_0,12351
125
+ kreuzberg-3.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
126
+ kreuzberg-3.18.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
127
+ kreuzberg-3.18.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
128
+ kreuzberg-3.18.0.dist-info/RECORD,,