kreuzberg 3.13.1__py3-none-any.whl → 3.13.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +40 -2
- kreuzberg/_config.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +17 -2
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/cli.py +2 -2
- {kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/METADATA +8 -8
- {kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/RECORD +10 -10
- {kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_api/main.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import traceback
|
3
4
|
from functools import lru_cache
|
4
5
|
from json import dumps, loads
|
5
6
|
from typing import TYPE_CHECKING, Annotated, Any, Literal
|
@@ -69,6 +70,33 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
|
|
69
70
|
)
|
70
71
|
|
71
72
|
|
73
|
+
def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
|
74
|
+
"""Temporary handler to catch ALL exceptions for debugging."""
|
75
|
+
error_type = type(exception).__name__
|
76
|
+
error_message = str(exception)
|
77
|
+
traceback_str = traceback.format_exc()
|
78
|
+
|
79
|
+
if request.app.logger:
|
80
|
+
request.app.logger.error(
|
81
|
+
"Unhandled exception",
|
82
|
+
method=request.method,
|
83
|
+
url=str(request.url),
|
84
|
+
error_type=error_type,
|
85
|
+
message=error_message,
|
86
|
+
traceback=traceback_str,
|
87
|
+
)
|
88
|
+
|
89
|
+
return Response(
|
90
|
+
content={
|
91
|
+
"error_type": error_type,
|
92
|
+
"message": error_message,
|
93
|
+
"traceback": traceback_str,
|
94
|
+
"debug": "This is a temporary debug handler",
|
95
|
+
},
|
96
|
+
status_code=HTTP_500_INTERNAL_SERVER_ERROR,
|
97
|
+
)
|
98
|
+
|
99
|
+
|
72
100
|
def _convert_value_type(current_value: Any, new_value: Any) -> Any:
|
73
101
|
if isinstance(current_value, bool):
|
74
102
|
if isinstance(new_value, str):
|
@@ -121,6 +149,15 @@ def _merge_configs_cached(
|
|
121
149
|
return ExtractionConfig(**config_dict)
|
122
150
|
|
123
151
|
|
152
|
+
def _make_hashable(obj: Any) -> Any:
|
153
|
+
"""Convert nested dicts/lists to hashable tuples."""
|
154
|
+
if isinstance(obj, dict):
|
155
|
+
return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
|
156
|
+
if isinstance(obj, list):
|
157
|
+
return tuple(_make_hashable(item) for item in obj)
|
158
|
+
return obj
|
159
|
+
|
160
|
+
|
124
161
|
def merge_configs(
|
125
162
|
static_config: ExtractionConfig | None,
|
126
163
|
query_params: dict[str, Any],
|
@@ -128,7 +165,7 @@ def merge_configs(
|
|
128
165
|
) -> ExtractionConfig:
|
129
166
|
"""Merge configurations with precedence: header > query > static > default."""
|
130
167
|
query_tuple = tuple(sorted(query_params.items())) if query_params else ()
|
131
|
-
header_tuple =
|
168
|
+
header_tuple = _make_hashable(header_config) if header_config else None
|
132
169
|
|
133
170
|
return _merge_configs_cached(static_config, query_tuple, header_tuple)
|
134
171
|
|
@@ -211,8 +248,9 @@ async def get_configuration() -> dict[str, Any]:
|
|
211
248
|
app = Litestar(
|
212
249
|
route_handlers=[handle_files_upload, health_check, get_configuration],
|
213
250
|
plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
|
214
|
-
logging_config=StructLoggingConfig(),
|
251
|
+
logging_config=StructLoggingConfig(), # Use default config
|
215
252
|
exception_handlers={
|
216
253
|
KreuzbergError: exception_handler,
|
254
|
+
Exception: general_exception_handler, # Catch all exceptions for debugging
|
217
255
|
},
|
218
256
|
)
|
kreuzberg/_config.py
CHANGED
@@ -162,6 +162,10 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
|
162
162
|
if config_path.name == "pyproject.toml":
|
163
163
|
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
164
164
|
|
165
|
+
# For any other TOML file, check if it has [tool.kreuzberg] section
|
166
|
+
if "tool" in data and "kreuzberg" in data["tool"]:
|
167
|
+
return data["tool"]["kreuzberg"] # type: ignore[no-any-return]
|
168
|
+
|
165
169
|
return data # type: ignore[no-any-return]
|
166
170
|
|
167
171
|
|
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -34,8 +34,22 @@ CellValue = int | float | str | bool | time | date | datetime | timedelta
|
|
34
34
|
class SpreadSheetExtractor(Extractor):
|
35
35
|
SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES
|
36
36
|
|
37
|
+
def _get_file_extension(self) -> str:
|
38
|
+
"""Get the appropriate file extension based on MIME type."""
|
39
|
+
mime_to_ext = {
|
40
|
+
"application/vnd.ms-excel": ".xls",
|
41
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
|
42
|
+
"application/vnd.ms-excel.sheet.macroEnabled.12": ".xlsm",
|
43
|
+
"application/vnd.ms-excel.sheet.binary.macroEnabled.12": ".xlsb",
|
44
|
+
"application/vnd.ms-excel.addin.macroEnabled.12": ".xlam",
|
45
|
+
"application/vnd.ms-excel.template.macroEnabled.12": ".xltm",
|
46
|
+
"application/vnd.oasis.opendocument.spreadsheet": ".ods",
|
47
|
+
}
|
48
|
+
return mime_to_ext.get(self.mime_type, ".xlsx")
|
49
|
+
|
37
50
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
38
|
-
|
51
|
+
file_extension = self._get_file_extension()
|
52
|
+
xlsx_path, unlink = await create_temp_file(file_extension)
|
39
53
|
await AsyncPath(xlsx_path).write_bytes(content)
|
40
54
|
try:
|
41
55
|
return await self.extract_path_async(xlsx_path)
|
@@ -72,7 +86,8 @@ class SpreadSheetExtractor(Extractor):
|
|
72
86
|
) from e
|
73
87
|
|
74
88
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
75
|
-
|
89
|
+
file_extension = self._get_file_extension()
|
90
|
+
fd, temp_path = tempfile.mkstemp(suffix=file_extension)
|
76
91
|
|
77
92
|
try:
|
78
93
|
with os.fdopen(fd, "wb") as f:
|
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -280,6 +280,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
280
280
|
}
|
281
281
|
|
282
282
|
async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
|
283
|
+
psm_value = run_config["psm"]
|
284
|
+
psm_str = str(psm_value.value) if hasattr(psm_value, "value") else str(psm_value)
|
285
|
+
|
283
286
|
command = [
|
284
287
|
"tesseract",
|
285
288
|
str(path),
|
@@ -287,7 +290,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
287
290
|
"-l",
|
288
291
|
run_config["language"],
|
289
292
|
"--psm",
|
290
|
-
|
293
|
+
psm_str,
|
291
294
|
"--oem",
|
292
295
|
"1",
|
293
296
|
"--loglevel",
|
@@ -1089,8 +1092,16 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1089
1092
|
}
|
1090
1093
|
|
1091
1094
|
def _build_tesseract_command(
|
1092
|
-
self,
|
1095
|
+
self,
|
1096
|
+
path: Path,
|
1097
|
+
output_base: str,
|
1098
|
+
language: str,
|
1099
|
+
psm: PSMMode | int,
|
1100
|
+
output_format: str = "text",
|
1101
|
+
**kwargs: Any,
|
1093
1102
|
) -> list[str]:
|
1103
|
+
psm_str = str(psm.value) if hasattr(psm, "value") else str(psm)
|
1104
|
+
|
1094
1105
|
command = [
|
1095
1106
|
"tesseract",
|
1096
1107
|
str(path),
|
@@ -1098,7 +1109,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
|
|
1098
1109
|
"-l",
|
1099
1110
|
language,
|
1100
1111
|
"--psm",
|
1101
|
-
|
1112
|
+
psm_str,
|
1102
1113
|
"--oem",
|
1103
1114
|
"1",
|
1104
1115
|
"--loglevel",
|
kreuzberg/cli.py
CHANGED
@@ -265,7 +265,7 @@ def cli(ctx: click.Context) -> None:
|
|
265
265
|
@click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
|
266
266
|
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
|
267
267
|
@click.pass_context
|
268
|
-
def extract(ctx: click.Context) -> None:
|
268
|
+
def extract(ctx: click.Context, /, **kwargs: Any) -> None:
|
269
269
|
"""Extract text from a document.
|
270
270
|
|
271
271
|
FILE can be a path to a document or '-' to read from stdin.
|
@@ -279,7 +279,7 @@ def extract(ctx: click.Context) -> None:
|
|
279
279
|
|
280
280
|
extraction_config = build_extraction_config(file_config, cli_args)
|
281
281
|
|
282
|
-
result = _perform_extraction(
|
282
|
+
result = _perform_extraction(kwargs.get("file"), extraction_config, params["verbose"])
|
283
283
|
|
284
284
|
_write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
|
285
285
|
|
{kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.13.1
|
3
|
+
Version: 3.13.3
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -31,15 +31,15 @@ Requires-Python: >=3.10
|
|
31
31
|
Requires-Dist: anyio>=4.10.0
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.10.0
|
35
35
|
Requires-Dist: mcp>=1.13.0
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
37
|
Requires-Dist: numpy>=1.24.0
|
38
38
|
Requires-Dist: playa-pdf>=0.7.0
|
39
|
-
Requires-Dist: polars>=1.33.
|
39
|
+
Requires-Dist: polars>=1.33.1
|
40
40
|
Requires-Dist: psutil>=7.0.0
|
41
41
|
Requires-Dist: pypdfium2==4.30.0
|
42
|
-
Requires-Dist: python-calamine>=0.5.
|
42
|
+
Requires-Dist: python-calamine>=0.5.3
|
43
43
|
Requires-Dist: python-pptx>=1.0.2
|
44
44
|
Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
|
45
45
|
Provides-Extra: additional-extensions
|
@@ -55,17 +55,17 @@ Requires-Dist: keybert>=0.9.0; extra == 'all'
|
|
55
55
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
56
56
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
57
57
|
Requires-Dist: paddleocr>=3.2.0; extra == 'all'
|
58
|
-
Requires-Dist: paddlepaddle>=3.
|
58
|
+
Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
|
59
59
|
Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
|
60
60
|
Requires-Dist: rich>=14.1.0; extra == 'all'
|
61
|
-
Requires-Dist: semantic-text-splitter>=0.
|
61
|
+
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
|
62
62
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
63
63
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
64
64
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
|
65
65
|
Provides-Extra: api
|
66
66
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
|
67
67
|
Provides-Extra: chunking
|
68
|
-
Requires-Dist: semantic-text-splitter>=0.
|
68
|
+
Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
|
69
69
|
Provides-Extra: cli
|
70
70
|
Requires-Dist: click>=8.2.1; extra == 'cli'
|
71
71
|
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
@@ -85,7 +85,7 @@ Provides-Extra: langdetect
|
|
85
85
|
Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
|
86
86
|
Provides-Extra: paddleocr
|
87
87
|
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
88
|
-
Requires-Dist: paddlepaddle>=3.
|
88
|
+
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
89
89
|
Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
|
90
90
|
Description-Content-Type: text/markdown
|
91
91
|
|
{kreuzberg-3.13.1.dist-info → kreuzberg-3.13.3.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
|
2
2
|
kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
|
3
3
|
kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
|
4
|
-
kreuzberg/_config.py,sha256=
|
4
|
+
kreuzberg/_config.py,sha256=T6ASb3N8nPQ4g5B2FxfgK82uE4pesGllezqrmZ0gSdM,12457
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
6
|
kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
|
7
7
|
kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
|
@@ -11,12 +11,12 @@ kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
|
|
11
11
|
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
12
|
kreuzberg/_registry.py,sha256=8cPpz3oZVnMwWDT2v_Q7wf-GHd5YuHmc-nkLtvPfE1I,2433
|
13
13
|
kreuzberg/_types.py,sha256=D-2d_WG8HyByA163izGhjk7t-e4FL_N-_6UzlVso8Dg,36020
|
14
|
-
kreuzberg/cli.py,sha256=
|
14
|
+
kreuzberg/cli.py,sha256=nPH4FDW6WkoF4gtH0s4RWmxjAveJ_-Unb6fev6x0Sko,12752
|
15
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
16
|
kreuzberg/extraction.py,sha256=jiMKiDyTf3sHyk76sMffHR-eH-_yg-DFRMuXEKufRYI,17649
|
17
17
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
kreuzberg/_api/main.py,sha256=
|
19
|
+
kreuzberg/_api/main.py,sha256=q0ygmdAUfTkjlqAa1RdW1KxxzxQ6IX80__UTpoXipp8,8859
|
20
20
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
|
22
22
|
kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
|
@@ -25,7 +25,7 @@ kreuzberg/_extractors/_image.py,sha256=UqPoYfvDRX6Rd1yPhcLHJLDw6d2cUzgkqOGjh2ele
|
|
25
25
|
kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
|
26
26
|
kreuzberg/_extractors/_pdf.py,sha256=Yv_c3xYzrGAjgTbwCGqbiQTDLjIUP_Pu7Z3GmMOqgqg,17865
|
27
27
|
kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
|
28
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
28
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=UgjkLBATirc5FXUFtRN1ArLfOYhLDJxH2wFb1s9E5vA,12784
|
29
29
|
kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
|
30
30
|
kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
|
31
31
|
kreuzberg/_mcp/server.py,sha256=iYJG6g0u7I6mWtC4R1XlxydBrPpgnp5dGJzpm9QAZig,8438
|
@@ -34,7 +34,7 @@ kreuzberg/_ocr/_base.py,sha256=5ef2g8JuSaZF2sDiAmoaODHbeG4MT0LtNzbtW0n9BnU,1445
|
|
34
34
|
kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
|
35
35
|
kreuzberg/_ocr/_paddleocr.py,sha256=58sKOHfKCHGFJNlRLrJwey8G_7xbsAAPBXB4n3hKc7k,14052
|
36
36
|
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
37
|
-
kreuzberg/_ocr/_tesseract.py,sha256=
|
37
|
+
kreuzberg/_ocr/_tesseract.py,sha256=H2T_iuXwa0FGCSQ_ZfXvmvqksxoOdOFAfv3uQA8E4-M,49160
|
38
38
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
39
39
|
kreuzberg/_utils/_cache.py,sha256=S6Oc4TJamiuuWeJ2ABxDFbbQh4o8w38AUyZeBEc1NN8,12767
|
40
40
|
kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
|
@@ -50,8 +50,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
|
|
50
50
|
kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
|
51
51
|
kreuzberg/_utils/_table.py,sha256=R-6owHjvcvHGhem_vDsFH7S2yMHGoUUO2PFcj-Idptk,6361
|
52
52
|
kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
|
53
|
-
kreuzberg-3.13.
|
54
|
-
kreuzberg-3.13.
|
55
|
-
kreuzberg-3.13.
|
56
|
-
kreuzberg-3.13.
|
57
|
-
kreuzberg-3.13.
|
53
|
+
kreuzberg-3.13.3.dist-info/METADATA,sha256=ey7kAlKK8eTER87IiGZZpIPnYoSLwLPX2AGdOPTjj2M,12128
|
54
|
+
kreuzberg-3.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
55
|
+
kreuzberg-3.13.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
56
|
+
kreuzberg-3.13.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
57
|
+
kreuzberg-3.13.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|