kreuzberg 3.13.2__py3-none-any.whl → 3.13.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import traceback
3
4
  from functools import lru_cache
4
5
  from json import dumps, loads
5
6
  from typing import TYPE_CHECKING, Annotated, Any, Literal
@@ -69,6 +70,33 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
69
70
  )
70
71
 
71
72
 
73
+ def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
74
+ """Temporary handler to catch ALL exceptions for debugging."""
75
+ error_type = type(exception).__name__
76
+ error_message = str(exception)
77
+ traceback_str = traceback.format_exc()
78
+
79
+ if request.app.logger:
80
+ request.app.logger.error(
81
+ "Unhandled exception",
82
+ method=request.method,
83
+ url=str(request.url),
84
+ error_type=error_type,
85
+ message=error_message,
86
+ traceback=traceback_str,
87
+ )
88
+
89
+ return Response(
90
+ content={
91
+ "error_type": error_type,
92
+ "message": error_message,
93
+ "traceback": traceback_str,
94
+ "debug": "This is a temporary debug handler",
95
+ },
96
+ status_code=HTTP_500_INTERNAL_SERVER_ERROR,
97
+ )
98
+
99
+
72
100
  def _convert_value_type(current_value: Any, new_value: Any) -> Any:
73
101
  if isinstance(current_value, bool):
74
102
  if isinstance(new_value, str):
@@ -121,6 +149,15 @@ def _merge_configs_cached(
121
149
  return ExtractionConfig(**config_dict)
122
150
 
123
151
 
152
+ def _make_hashable(obj: Any) -> Any:
153
+ """Convert nested dicts/lists to hashable tuples."""
154
+ if isinstance(obj, dict):
155
+ return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
156
+ if isinstance(obj, list):
157
+ return tuple(_make_hashable(item) for item in obj)
158
+ return obj
159
+
160
+
124
161
  def merge_configs(
125
162
  static_config: ExtractionConfig | None,
126
163
  query_params: dict[str, Any],
@@ -128,7 +165,7 @@ def merge_configs(
128
165
  ) -> ExtractionConfig:
129
166
  """Merge configurations with precedence: header > query > static > default."""
130
167
  query_tuple = tuple(sorted(query_params.items())) if query_params else ()
131
- header_tuple = tuple(sorted(header_config.items())) if header_config else None
168
+ header_tuple = _make_hashable(header_config) if header_config else None
132
169
 
133
170
  return _merge_configs_cached(static_config, query_tuple, header_tuple)
134
171
 
@@ -211,8 +248,9 @@ async def get_configuration() -> dict[str, Any]:
211
248
  app = Litestar(
212
249
  route_handlers=[handle_files_upload, health_check, get_configuration],
213
250
  plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
214
- logging_config=StructLoggingConfig(),
251
+ logging_config=StructLoggingConfig(), # Use default config
215
252
  exception_handlers={
216
253
  KreuzbergError: exception_handler,
254
+ Exception: general_exception_handler, # Catch all exceptions for debugging
217
255
  },
218
256
  )
kreuzberg/_config.py CHANGED
@@ -162,6 +162,10 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
162
162
  if config_path.name == "pyproject.toml":
163
163
  return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
164
164
 
165
+ # For any other TOML file, check if it has [tool.kreuzberg] section
166
+ if "tool" in data and "kreuzberg" in data["tool"]:
167
+ return data["tool"]["kreuzberg"] # type: ignore[no-any-return]
168
+
165
169
  return data # type: ignore[no-any-return]
166
170
 
167
171
 
@@ -34,8 +34,22 @@ CellValue = int | float | str | bool | time | date | datetime | timedelta
34
34
  class SpreadSheetExtractor(Extractor):
35
35
  SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES
36
36
 
37
+ def _get_file_extension(self) -> str:
38
+ """Get the appropriate file extension based on MIME type."""
39
+ mime_to_ext = {
40
+ "application/vnd.ms-excel": ".xls",
41
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
42
+ "application/vnd.ms-excel.sheet.macroEnabled.12": ".xlsm",
43
+ "application/vnd.ms-excel.sheet.binary.macroEnabled.12": ".xlsb",
44
+ "application/vnd.ms-excel.addin.macroEnabled.12": ".xlam",
45
+ "application/vnd.ms-excel.template.macroEnabled.12": ".xltm",
46
+ "application/vnd.oasis.opendocument.spreadsheet": ".ods",
47
+ }
48
+ return mime_to_ext.get(self.mime_type, ".xlsx")
49
+
37
50
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
38
- xlsx_path, unlink = await create_temp_file(".xlsx")
51
+ file_extension = self._get_file_extension()
52
+ xlsx_path, unlink = await create_temp_file(file_extension)
39
53
  await AsyncPath(xlsx_path).write_bytes(content)
40
54
  try:
41
55
  return await self.extract_path_async(xlsx_path)
@@ -72,7 +86,8 @@ class SpreadSheetExtractor(Extractor):
72
86
  ) from e
73
87
 
74
88
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
75
- fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
89
+ file_extension = self._get_file_extension()
90
+ fd, temp_path = tempfile.mkstemp(suffix=file_extension)
76
91
 
77
92
  try:
78
93
  with os.fdopen(fd, "wb") as f:
@@ -280,6 +280,9 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
280
280
  }
281
281
 
282
282
  async def _execute_tesseract(self, path: Path, output_base: str, run_config: dict[str, Any]) -> None:
283
+ psm_value = run_config["psm"]
284
+ psm_str = str(psm_value.value) if hasattr(psm_value, "value") else str(psm_value)
285
+
283
286
  command = [
284
287
  "tesseract",
285
288
  str(path),
@@ -287,7 +290,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
287
290
  "-l",
288
291
  run_config["language"],
289
292
  "--psm",
290
- str(run_config["psm"].value),
293
+ psm_str,
291
294
  "--oem",
292
295
  "1",
293
296
  "--loglevel",
@@ -1089,8 +1092,16 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1089
1092
  }
1090
1093
 
1091
1094
  def _build_tesseract_command(
1092
- self, path: Path, output_base: str, language: str, psm: PSMMode, output_format: str = "text", **kwargs: Any
1095
+ self,
1096
+ path: Path,
1097
+ output_base: str,
1098
+ language: str,
1099
+ psm: PSMMode | int,
1100
+ output_format: str = "text",
1101
+ **kwargs: Any,
1093
1102
  ) -> list[str]:
1103
+ psm_str = str(psm.value) if hasattr(psm, "value") else str(psm)
1104
+
1094
1105
  command = [
1095
1106
  "tesseract",
1096
1107
  str(path),
@@ -1098,7 +1109,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
1098
1109
  "-l",
1099
1110
  language,
1100
1111
  "--psm",
1101
- str(psm.value),
1112
+ psm_str,
1102
1113
  "--oem",
1103
1114
  "1",
1104
1115
  "--loglevel",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.13.2
3
+ Version: 3.13.3
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,15 +31,15 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.9.1
34
+ Requires-Dist: html-to-markdown[lxml]>=1.10.0
35
35
  Requires-Dist: mcp>=1.13.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: numpy>=1.24.0
38
38
  Requires-Dist: playa-pdf>=0.7.0
39
- Requires-Dist: polars>=1.33.0
39
+ Requires-Dist: polars>=1.33.1
40
40
  Requires-Dist: psutil>=7.0.0
41
41
  Requires-Dist: pypdfium2==4.30.0
42
- Requires-Dist: python-calamine>=0.5.2
42
+ Requires-Dist: python-calamine>=0.5.3
43
43
  Requires-Dist: python-pptx>=1.0.2
44
44
  Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
45
45
  Provides-Extra: additional-extensions
@@ -55,17 +55,17 @@ Requires-Dist: keybert>=0.9.0; extra == 'all'
55
55
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
56
56
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
57
57
  Requires-Dist: paddleocr>=3.2.0; extra == 'all'
58
- Requires-Dist: paddlepaddle>=3.1.1; extra == 'all'
58
+ Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
59
59
  Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
60
60
  Requires-Dist: rich>=14.1.0; extra == 'all'
61
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
61
+ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
62
62
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
63
63
  Requires-Dist: spacy>=3.8.7; extra == 'all'
64
64
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
65
65
  Provides-Extra: api
66
66
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
67
67
  Provides-Extra: chunking
68
- Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
68
+ Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
69
69
  Provides-Extra: cli
70
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
71
71
  Requires-Dist: rich>=14.1.0; extra == 'cli'
@@ -85,7 +85,7 @@ Provides-Extra: langdetect
85
85
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
86
86
  Provides-Extra: paddleocr
87
87
  Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
88
- Requires-Dist: paddlepaddle>=3.1.1; extra == 'paddleocr'
88
+ Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
89
89
  Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
90
90
  Description-Content-Type: text/markdown
91
91
 
@@ -1,7 +1,7 @@
1
1
  kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
2
2
  kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
3
3
  kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
4
- kreuzberg/_config.py,sha256=Q5oiJE1XRf8ITuYcO8LZAOB3G2zNlXz2458rgPSth-U,12257
4
+ kreuzberg/_config.py,sha256=T6ASb3N8nPQ4g5B2FxfgK82uE4pesGllezqrmZ0gSdM,12457
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
6
  kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
7
7
  kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
@@ -16,7 +16,7 @@ kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
16
  kreuzberg/extraction.py,sha256=jiMKiDyTf3sHyk76sMffHR-eH-_yg-DFRMuXEKufRYI,17649
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=JALYRD0qwyoZloWk5dNNuslBtG4GlVNc0G2oADm6cAc,7578
19
+ kreuzberg/_api/main.py,sha256=q0ygmdAUfTkjlqAa1RdW1KxxzxQ6IX80__UTpoXipp8,8859
20
20
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
22
22
  kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
@@ -25,7 +25,7 @@ kreuzberg/_extractors/_image.py,sha256=UqPoYfvDRX6Rd1yPhcLHJLDw6d2cUzgkqOGjh2ele
25
25
  kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
26
26
  kreuzberg/_extractors/_pdf.py,sha256=Yv_c3xYzrGAjgTbwCGqbiQTDLjIUP_Pu7Z3GmMOqgqg,17865
27
27
  kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
28
- kreuzberg/_extractors/_spread_sheet.py,sha256=x25u2M-ufxpDd7_qrjhMEz1yFftIcOISE1qwPW09Zm0,11962
28
+ kreuzberg/_extractors/_spread_sheet.py,sha256=UgjkLBATirc5FXUFtRN1ArLfOYhLDJxH2wFb1s9E5vA,12784
29
29
  kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
30
30
  kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
31
31
  kreuzberg/_mcp/server.py,sha256=iYJG6g0u7I6mWtC4R1XlxydBrPpgnp5dGJzpm9QAZig,8438
@@ -34,7 +34,7 @@ kreuzberg/_ocr/_base.py,sha256=5ef2g8JuSaZF2sDiAmoaODHbeG4MT0LtNzbtW0n9BnU,1445
34
34
  kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
35
35
  kreuzberg/_ocr/_paddleocr.py,sha256=58sKOHfKCHGFJNlRLrJwey8G_7xbsAAPBXB4n3hKc7k,14052
36
36
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
37
- kreuzberg/_ocr/_tesseract.py,sha256=xGML3ygY5xMN5T3YznrKDVAH_DWfaFiteFBo_-GpjCs,48931
37
+ kreuzberg/_ocr/_tesseract.py,sha256=H2T_iuXwa0FGCSQ_ZfXvmvqksxoOdOFAfv3uQA8E4-M,49160
38
38
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
39
  kreuzberg/_utils/_cache.py,sha256=S6Oc4TJamiuuWeJ2ABxDFbbQh4o8w38AUyZeBEc1NN8,12767
40
40
  kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
@@ -50,8 +50,8 @@ kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4
50
50
  kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
51
51
  kreuzberg/_utils/_table.py,sha256=R-6owHjvcvHGhem_vDsFH7S2yMHGoUUO2PFcj-Idptk,6361
52
52
  kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
53
- kreuzberg-3.13.2.dist-info/METADATA,sha256=c1w8iB_Frnzr0DHY-X-a9rk5S9vQPICPIniPzwfvHV8,12127
54
- kreuzberg-3.13.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
- kreuzberg-3.13.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
56
- kreuzberg-3.13.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
57
- kreuzberg-3.13.2.dist-info/RECORD,,
53
+ kreuzberg-3.13.3.dist-info/METADATA,sha256=ey7kAlKK8eTER87IiGZZpIPnYoSLwLPX2AGdOPTjj2M,12128
54
+ kreuzberg-3.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
+ kreuzberg-3.13.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
56
+ kreuzberg-3.13.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
57
+ kreuzberg-3.13.3.dist-info/RECORD,,