kreuzberg 3.17.3__py3-none-any.whl → 3.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +43 -3
- kreuzberg/_entity_extraction.py +111 -17
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.18.0.dist-info}/METADATA +4 -4
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.18.0.dist-info}/RECORD +7 -7
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.18.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.18.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.3.dist-info → kreuzberg-3.18.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import base64
|
4
4
|
import io
|
5
|
+
import os
|
5
6
|
import traceback
|
6
7
|
from json import dumps
|
7
8
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
@@ -100,6 +101,36 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
|
|
100
101
|
)
|
101
102
|
|
102
103
|
|
104
|
+
def _get_max_upload_size() -> int:
    """Resolve the maximum upload size from the environment.

    Returns:
        Maximum upload size in bytes. Falls back to 1GB when the variable is
        unset, cannot be parsed as an integer, or is negative.

    Environment Variables:
        KREUZBERG_MAX_UPLOAD_SIZE: Maximum upload size in bytes (default: 1073741824 = 1GB)
    """
    fallback = 1024 * 1024 * 1024  # 1GB
    raw_limit = os.environ.get("KREUZBERG_MAX_UPLOAD_SIZE")
    if raw_limit is None:
        return fallback

    try:
        limit = int(raw_limit)
    except ValueError:
        return fallback

    # A negative limit is meaningless, so it is treated like an unset variable.
    return fallback if limit < 0 else limit
|
120
|
+
|
121
|
+
|
122
|
+
def _is_opentelemetry_enabled() -> bool:
    """Check if OpenTelemetry should be enabled.

    The value is matched case-insensitively and surrounding whitespace is
    ignored, so values such as ``" True "`` or ``"on\\n"`` are accepted.

    Returns:
        True if the variable is unset or set to one of ``true``, ``1``,
        ``yes`` or ``on``; False for any other value.

    Environment Variables:
        KREUZBERG_ENABLE_OPENTELEMETRY: Enable OpenTelemetry tracing (true/false) (default: true)
    """
    raw_flag = os.environ.get("KREUZBERG_ENABLE_OPENTELEMETRY", "true")
    # Strip first: stray whitespace or a trailing newline would otherwise
    # make an intended "true" compare unequal and silently disable tracing.
    return raw_flag.strip().lower() in ("true", "1", "yes", "on")
|
132
|
+
|
133
|
+
|
103
134
|
def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
|
104
135
|
error_type = type(exception).__name__
|
105
136
|
error_message = str(exception)
|
@@ -242,7 +273,7 @@ async def handle_files_upload( # noqa: PLR0913
|
|
242
273
|
- Language detection (if enabled)
|
243
274
|
|
244
275
|
Supports various file formats including PDF, Office documents, images, and more.
|
245
|
-
Maximum file size: 1GB per file.
|
276
|
+
Maximum file size: Configurable via KREUZBERG_MAX_UPLOAD_SIZE environment variable (default: 1GB per file).
|
246
277
|
|
247
278
|
Args:
|
248
279
|
request: The HTTP request object
|
@@ -379,9 +410,18 @@ type_encoders = {
|
|
379
410
|
Image.Image: _pil_image_encoder,
|
380
411
|
}
|
381
412
|
|
413
|
+
|
414
|
+
def _get_plugins() -> list[Any]:
    """Build the plugin list for the application from environment settings.

    Returns:
        A list holding a single OpenTelemetry plugin when
        ``_is_opentelemetry_enabled()`` is true, otherwise an empty list.
    """
    if not _is_opentelemetry_enabled():
        return []
    return [OpenTelemetryPlugin(OpenTelemetryConfig())]
|
420
|
+
|
421
|
+
|
382
422
|
app = Litestar(
|
383
423
|
route_handlers=[handle_files_upload, health_check, get_configuration],
|
384
|
-
plugins=
|
424
|
+
plugins=_get_plugins(),
|
385
425
|
logging_config=StructLoggingConfig(),
|
386
426
|
openapi_config=openapi_config,
|
387
427
|
exception_handlers={
|
@@ -389,5 +429,5 @@ app = Litestar(
|
|
389
429
|
Exception: general_exception_handler,
|
390
430
|
},
|
391
431
|
type_encoders=type_encoders,
|
392
|
-
request_max_body_size=
|
432
|
+
request_max_body_size=_get_max_upload_size(),
|
393
433
|
)
|
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,19 +2,77 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import re
|
5
|
+
import shutil
|
5
6
|
import subprocess
|
6
|
-
import sys
|
7
7
|
from functools import lru_cache
|
8
8
|
from itertools import chain
|
9
9
|
from typing import TYPE_CHECKING, Any
|
10
10
|
|
11
|
+
import anyio
|
12
|
+
|
11
13
|
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
14
|
+
from kreuzberg._utils._sync import run_sync
|
12
15
|
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
13
16
|
|
14
17
|
if TYPE_CHECKING:
|
15
18
|
from collections.abc import Sequence
|
16
19
|
|
17
20
|
|
21
|
+
def is_uv_available() -> bool:
    """Report whether a ``uv`` executable can be found on the current PATH."""
    uv_path = shutil.which("uv")
    return uv_path is not None
|
24
|
+
|
25
|
+
|
26
|
+
def get_spacy_model_url(model_name: str, version: str = "3.8.0") -> str:
    """Build the direct wheel download URL for a spaCy model release.

    Args:
        model_name: Name of the spaCy model (e.g., 'en_core_web_sm')
        version: Model version to download (default: 3.8.0)

    Returns:
        URL of the model wheel in the explosion/spacy-models GitHub releases
    """
    # The release tag and the wheel file name share the "<model>-<version>" stem.
    release = f"{model_name}-{version}"
    return f"https://github.com/explosion/spacy-models/releases/download/{release}/{release}-py3-none-any.whl"
|
37
|
+
|
38
|
+
|
39
|
+
async def install_spacy_model_with_uv(model_name: str) -> subprocess.CompletedProcess[str]:
    """Install a spaCy model by handing its wheel URL to ``uv pip install``.

    Args:
        model_name: Name of the spaCy model to install

    Returns:
        Completed process result with captured text output. A failed install
        is reported through ``returncode`` rather than raised, because the
        command runs with ``check=False``.
    """
    command = ["uv", "pip", "install", get_spacy_model_url(model_name)]
    completed = await run_sync(
        subprocess.run,
        command,
        capture_output=True,
        text=True,
        check=False,
    )
    return completed
|
56
|
+
|
57
|
+
|
58
|
+
async def install_spacy_model_with_spacy(model_name: str) -> bool:
|
59
|
+
"""Install spaCy model using spacy download function.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
model_name: Name of the spaCy model to install
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
True if successful, False otherwise
|
66
|
+
"""
|
67
|
+
try:
|
68
|
+
import spacy.cli.download # noqa: PLC0415
|
69
|
+
|
70
|
+
await run_sync(spacy.cli.download, model_name) # type: ignore[attr-defined]
|
71
|
+
return True
|
72
|
+
except (ImportError, OSError, RuntimeError):
|
73
|
+
return False
|
74
|
+
|
75
|
+
|
18
76
|
def extract_entities(
|
19
77
|
text: str,
|
20
78
|
entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
|
@@ -46,11 +104,11 @@ def extract_entities(
|
|
46
104
|
functionality="Entity Extraction",
|
47
105
|
) from e
|
48
106
|
|
49
|
-
model_name =
|
107
|
+
model_name = select_spacy_model(languages, spacy_config)
|
50
108
|
if not model_name:
|
51
109
|
return entities
|
52
110
|
|
53
|
-
nlp =
|
111
|
+
nlp = load_spacy_model(model_name, spacy_config)
|
54
112
|
|
55
113
|
if len(text) > spacy_config.max_doc_length:
|
56
114
|
text = text[: spacy_config.max_doc_length]
|
@@ -74,7 +132,7 @@ def extract_entities(
|
|
74
132
|
|
75
133
|
|
76
134
|
@lru_cache(maxsize=32)
|
77
|
-
def
|
135
|
+
def load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
78
136
|
try:
|
79
137
|
import spacy # noqa: PLC0415
|
80
138
|
except ImportError:
|
@@ -86,22 +144,58 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
86
144
|
try:
|
87
145
|
nlp = spacy.load(model_name)
|
88
146
|
except OSError:
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
147
|
+
# Try to download the model automatically
|
148
|
+
async def install_model() -> tuple[bool, str | None]:
|
149
|
+
"""Install model and return success status and error message."""
|
150
|
+
# First try spaCy's built-in download
|
151
|
+
try:
|
152
|
+
success = await install_spacy_model_with_spacy(model_name)
|
153
|
+
if success:
|
154
|
+
return True, None
|
155
|
+
except (ImportError, OSError, RuntimeError) as e:
|
156
|
+
spacy_error = str(e)
|
157
|
+
else:
|
158
|
+
spacy_error = "spaCy download failed"
|
159
|
+
|
160
|
+
# If spaCy download failed and uv is available, try uv as fallback
|
161
|
+
if is_uv_available():
|
162
|
+
try:
|
163
|
+
result = await install_spacy_model_with_uv(model_name)
|
164
|
+
return result.returncode == 0, result.stderr
|
165
|
+
except (OSError, subprocess.SubprocessError) as e:
|
166
|
+
return False, f"spaCy: {spacy_error}, uv: {e!s}"
|
167
|
+
|
168
|
+
return False, spacy_error
|
169
|
+
|
170
|
+
# Run the async installation in a sync context
|
171
|
+
try:
|
172
|
+
success, error_details = anyio.run(install_model)
|
173
|
+
except (OSError, RuntimeError) as e:
|
174
|
+
success, error_details = False, str(e)
|
175
|
+
|
176
|
+
if not success:
|
177
|
+
# Generate appropriate error message based on available tools
|
178
|
+
if is_uv_available():
|
179
|
+
model_url = get_spacy_model_url(model_name)
|
180
|
+
manual_install_cmd = f"uv pip install {model_url}"
|
181
|
+
else:
|
182
|
+
manual_install_cmd = f"python -m spacy download {model_name}"
|
95
183
|
|
96
|
-
if result.returncode != 0:
|
97
184
|
error_msg = (
|
98
|
-
f"Failed to download spaCy model '{model_name}'. "
|
99
|
-
f"Please install it manually with: python -m spacy download {model_name}"
|
185
|
+
f"Failed to download spaCy model '{model_name}'. Please install it manually with: {manual_install_cmd}"
|
100
186
|
)
|
101
|
-
|
102
|
-
|
187
|
+
|
188
|
+
if error_details:
|
189
|
+
error_msg += f"\nError details: {error_details}"
|
190
|
+
|
103
191
|
raise KreuzbergError(
|
104
|
-
error_msg,
|
192
|
+
error_msg,
|
193
|
+
context={
|
194
|
+
"model": model_name,
|
195
|
+
"manual_install_cmd": manual_install_cmd,
|
196
|
+
"error_details": error_details,
|
197
|
+
"uv_available": is_uv_available(),
|
198
|
+
},
|
105
199
|
) from None
|
106
200
|
|
107
201
|
try:
|
@@ -118,7 +212,7 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
|
|
118
212
|
return nlp
|
119
213
|
|
120
214
|
|
121
|
-
def
|
215
|
+
def select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
122
216
|
if not languages:
|
123
217
|
return spacy_config.get_model_for_language("en")
|
124
218
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.18.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -28,12 +28,12 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
28
|
Classifier: Topic :: Text Processing :: General
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
|
-
Requires-Dist: anyio>=4.
|
31
|
+
Requires-Dist: anyio>=4.11.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.16.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
|
-
Requires-Dist: mcp>=1.
|
36
|
+
Requires-Dist: mcp>=1.15.0
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
38
38
|
Requires-Dist: numpy>=2.0.0
|
39
39
|
Requires-Dist: playa-pdf>=0.7.0
|
@@ -4,7 +4,7 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
|
4
4
|
kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
|
5
5
|
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
6
|
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
|
-
kreuzberg/_entity_extraction.py,sha256=
|
7
|
+
kreuzberg/_entity_extraction.py,sha256=zbwgvS_2M4JibmVVnclkmie0nmZQtyHtT_ucdbQc6nU,7837
|
8
8
|
kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
|
9
9
|
kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
|
10
10
|
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
@@ -17,7 +17,7 @@ kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
|
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
20
|
-
kreuzberg/_api/main.py,sha256=
|
20
|
+
kreuzberg/_api/main.py,sha256=5LiqgyeHJy0GLLa-ehB0bq8ftEUYfM1Pt6f0j_a0dso,15190
|
21
21
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
kreuzberg/_extractors/_base.py,sha256=4MRBXdLsgdtdrTuupWb2IT9YpRSnNPpWWviS2mfeOXg,9961
|
23
23
|
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
121
121
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
122
122
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
123
123
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
124
|
-
kreuzberg-3.
|
125
|
-
kreuzberg-3.
|
126
|
-
kreuzberg-3.
|
127
|
-
kreuzberg-3.
|
128
|
-
kreuzberg-3.
|
124
|
+
kreuzberg-3.18.0.dist-info/METADATA,sha256=Z54em4GwMd18BmlIWmq1AHtCdFStstMV5RAXaB4x3_0,12351
|
125
|
+
kreuzberg-3.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
+
kreuzberg-3.18.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
+
kreuzberg-3.18.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
+
kreuzberg-3.18.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|