kreuzberg 3.17.1__py3-none-any.whl → 3.17.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_entity_extraction.py +38 -10
- kreuzberg/_language_detection.py +0 -2
- kreuzberg/extraction.py +0 -1
- {kreuzberg-3.17.1.dist-info → kreuzberg-3.17.3.dist-info}/METADATA +2 -2
- {kreuzberg-3.17.1.dist-info → kreuzberg-3.17.3.dist-info}/RECORD +8 -8
- {kreuzberg-3.17.1.dist-info → kreuzberg-3.17.3.dist-info}/WHEEL +0 -0
- {kreuzberg-3.17.1.dist-info → kreuzberg-3.17.3.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.17.1.dist-info → kreuzberg-3.17.3.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,12 +2,14 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import re
|
5
|
+
import subprocess
|
6
|
+
import sys
|
5
7
|
from functools import lru_cache
|
6
8
|
from itertools import chain
|
7
9
|
from typing import TYPE_CHECKING, Any
|
8
10
|
|
9
11
|
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
10
|
-
from kreuzberg.exceptions import MissingDependencyError
|
12
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
11
13
|
|
12
14
|
if TYPE_CHECKING:
|
13
15
|
from collections.abc import Sequence
|
@@ -49,8 +51,6 @@ def extract_entities(
|
|
49
51
|
return entities
|
50
52
|
|
51
53
|
nlp = _load_spacy_model(model_name, spacy_config)
|
52
|
-
if not nlp:
|
53
|
-
return entities
|
54
54
|
|
55
55
|
if len(text) > spacy_config.max_doc_length:
|
56
56
|
text = text[: spacy_config.max_doc_length]
|
@@ -77,17 +77,45 @@ def extract_entities(
|
|
77
77
|
def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
|
78
78
|
try:
|
79
79
|
import spacy # noqa: PLC0415
|
80
|
+
except ImportError:
|
81
|
+
return None
|
80
82
|
|
81
|
-
|
82
|
-
|
83
|
+
if spacy_config.model_cache_dir:
|
84
|
+
os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
|
83
85
|
|
86
|
+
try:
|
84
87
|
nlp = spacy.load(model_name)
|
88
|
+
except OSError:
|
89
|
+
result = subprocess.run(
|
90
|
+
[sys.executable, "-m", "spacy", "download", model_name],
|
91
|
+
capture_output=True,
|
92
|
+
text=True,
|
93
|
+
check=False,
|
94
|
+
)
|
85
95
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
96
|
+
if result.returncode != 0:
|
97
|
+
error_msg = (
|
98
|
+
f"Failed to download spaCy model '{model_name}'. "
|
99
|
+
f"Please install it manually with: python -m spacy download {model_name}"
|
100
|
+
)
|
101
|
+
if result.stderr:
|
102
|
+
error_msg += f"\nError details: {result.stderr}"
|
103
|
+
raise KreuzbergError(
|
104
|
+
error_msg, context={"model": model_name, "stderr": result.stderr, "return_code": result.returncode}
|
105
|
+
) from None
|
106
|
+
|
107
|
+
try:
|
108
|
+
nlp = spacy.load(model_name)
|
109
|
+
except OSError as e:
|
110
|
+
raise KreuzbergError(
|
111
|
+
f"Failed to load spaCy model '{model_name}' even after successful download. "
|
112
|
+
f"Please verify your spaCy installation and try reinstalling the model.",
|
113
|
+
context={"model": model_name, "error": str(e)},
|
114
|
+
) from e
|
115
|
+
|
116
|
+
nlp.max_length = spacy_config.max_doc_length
|
117
|
+
|
118
|
+
return nlp
|
91
119
|
|
92
120
|
|
93
121
|
def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
|
kreuzberg/_language_detection.py
CHANGED
@@ -23,9 +23,7 @@ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -
|
|
23
23
|
config = LanguageDetectionConfig()
|
24
24
|
|
25
25
|
try:
|
26
|
-
# detect always returns a list, use k parameter for multiple languages
|
27
26
|
k = config.top_k if config.multilingual else 1
|
28
|
-
# Use the model from config directly
|
29
27
|
model = config.model
|
30
28
|
results = detect(text, model=model, k=k)
|
31
29
|
|
kreuzberg/extraction.py
CHANGED
@@ -76,7 +76,6 @@ def _validate_and_post_process_helper(
|
|
76
76
|
result.keywords = None
|
77
77
|
|
78
78
|
if config.auto_detect_language:
|
79
|
-
# Use provided config or create one with the model from ExtractionConfig
|
80
79
|
lang_config = config.language_detection_config
|
81
80
|
if lang_config is None:
|
82
81
|
from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.17.1
|
3
|
+
Version: 3.17.3
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,7 +31,7 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.14.0
|
35
35
|
Requires-Dist: langcodes>=3.5.0
|
36
36
|
Requires-Dist: mcp>=1.14.1
|
37
37
|
Requires-Dist: msgspec>=0.18.0
|
@@ -4,16 +4,16 @@ kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
|
4
4
|
kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
|
5
5
|
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
6
|
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
|
-
kreuzberg/_entity_extraction.py,sha256=
|
7
|
+
kreuzberg/_entity_extraction.py,sha256=NuGcmIU-gBfzKmrhk6AcO6angCeUbML8REKPp7CE8sc,4710
|
8
8
|
kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
|
9
|
-
kreuzberg/_language_detection.py,sha256=
|
9
|
+
kreuzberg/_language_detection.py,sha256=y48gNaexnC6OIVTh3yBjXDumMeIKMggCDuacoXa7AvU,1080
|
10
10
|
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
11
11
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
12
|
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
13
|
kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
|
14
14
|
kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
|
15
15
|
kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
|
16
|
-
kreuzberg/extraction.py,sha256=
|
16
|
+
kreuzberg/extraction.py,sha256=ArsmHcJDvjx9Cog3IQ0D52oS9GbaH_Yhs5mfJfGgiaM,18982
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
19
|
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
@@ -121,8 +121,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
121
121
|
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
122
122
|
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
123
123
|
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
124
|
-
kreuzberg-3.17.
|
125
|
-
kreuzberg-3.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
-
kreuzberg-3.17.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
-
kreuzberg-3.17.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
-
kreuzberg-3.17.1.dist-info/RECORD,,
|
124
|
+
kreuzberg-3.17.3.dist-info/METADATA,sha256=XXgXKaiujoGAGsCn-skmPDij6vcQ9XqwbA1LBpX_Pvw,12351
|
125
|
+
kreuzberg-3.17.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
+
kreuzberg-3.17.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
+
kreuzberg-3.17.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
+
kreuzberg-3.17.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|