kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -2,13 +2,10 @@ from __future__ import annotations

 import contextlib
 import csv
-import os
 import sys
-import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import polars as pl
 from anyio import Path as AsyncPath
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._table import enhance_table_markdown
-from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError

+if TYPE_CHECKING:
+    from pathlib import Path
+
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

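The `pathlib.Path` import moves under a `TYPE_CHECKING` guard because it is only needed for annotations; combined with `from __future__ import annotations`, annotations stay unevaluated strings and the import is skipped at runtime. A minimal standalone sketch of the pattern (not kreuzberg code):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers; never imported at runtime.
    from pathlib import Path


def describe(path: Path) -> str:  # annotation is a string, never evaluated
    return str(path)
```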
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):

     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        xlsx_path, unlink = await create_temp_file(file_extension)
-        await AsyncPath(xlsx_path).write_bytes(content)
-        try:
+        async with temporary_file(file_extension, content) as xlsx_path:
             return await self.extract_path_async(xlsx_path)
-        finally:
-            await unlink()

     async def extract_path_async(self, path: Path) -> ExtractionResult:
         try:
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        fd, temp_path = tempfile.mkstemp(suffix=file_extension)
-
-        try:
-            with os.fdopen(fd, "wb") as f:
-                f.write(content)
-
-            return self.extract_path_sync(Path(temp_path))
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
+        with temporary_file_sync(file_extension, content) as temp_path:
+            return self.extract_path_sync(temp_path)

     def extract_path_sync(self, path: Path) -> ExtractionResult:
         try:
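Both code paths now delegate the temp-file lifecycle to context managers from `kreuzberg._utils._tmp` (whose implementation is not shown in this diff). A plausible sketch of the sync variant, assuming it wraps the same mkstemp-and-cleanup sequence the old inline code performed:

```python
import contextlib
import os
import tempfile
from collections.abc import Iterator
from pathlib import Path


@contextlib.contextmanager
def temporary_file_sync(extension: str, content: bytes) -> Iterator[Path]:
    # Hypothetical reconstruction: write the payload to a named temp file,
    # yield its path, and always unlink it afterwards.
    fd, temp_path = tempfile.mkstemp(suffix=extension)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(content)
        yield Path(temp_path)
    finally:
        with contextlib.suppress(OSError):
            Path(temp_path).unlink()
```

Centralizing the try/finally in one context manager removes the duplicated cleanup logic from every extractor that needs a scratch file.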
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (datetime, date, time)):
-            return value.isoformat()
-        if isinstance(value, timedelta):
-            return f"{value.total_seconds()} seconds"
-        return str(value)
+        match value:
+            case None:
+                return ""
+            case bool():
+                return str(value).lower()
+            case datetime() | date() | time():
+                return value.isoformat()
+            case timedelta():
+                return f"{value.total_seconds()} seconds"
+            case _:
+                return str(value)

     async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
         values = workbook.get_sheet_by_name(sheet_name).to_python()
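The converter now dispatches via structural pattern matching; note that `case bool()` must come before the catch-all so `True` renders as `true` rather than `True`. Expected behavior for a few representative cells (sample inputs, assuming the module path shown in this diff):

```python
from datetime import date, timedelta

from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor

# Each assertion shows input -> returned string, per the match arms above.
assert SpreadSheetExtractor._convert_cell_to_str(None) == ""
assert SpreadSheetExtractor._convert_cell_to_str(True) == "true"
assert SpreadSheetExtractor._convert_cell_to_str(date(2024, 1, 2)) == "2024-01-02"
assert SpreadSheetExtractor._convert_cell_to_str(timedelta(minutes=2)) == "120.0 seconds"
assert SpreadSheetExtractor._convert_cell_to_str(3.5) == "3.5"
```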
@@ -207,7 +197,11 @@ class SpreadSheetExtractor(Extractor):
         if not data or not any(row for row in data):
             return f"## {sheet_name}\n\n*Empty sheet*"

-        df = pl.DataFrame(data)
+        if data:
+            max_cols = max(len(row) if row else 0 for row in data)
+            data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
+
+        df = pl.DataFrame(data, strict=False)

         df = df.filter(~pl.all_horizontal(pl.all().is_null()))
         df = df.select([col for col in df.columns if not df[col].is_null().all()])
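Calamine sheets can yield ragged rows, and `pl.DataFrame` rejects inner sequences of unequal length; the new padding normalizes widths before construction. A standalone sketch of the effect (assuming polars is installed):

```python
import polars as pl

rows = [["a", "b", "c"], ["d"], []]  # ragged rows, as a sparse sheet may produce
max_cols = max(len(row) if row else 0 for row in rows)
padded = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in rows]

# All inner lists are now length 3, so construction succeeds;
# strict=False lets mixed-type columns coerce instead of raising.
df = pl.DataFrame(padded, strict=False)
print(df.shape)  # (3, 3)
```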
kreuzberg/_extractors/_structured.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations

-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar

@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None

+
 from anyio import Path as AsyncPath

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync

@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }

+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)

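With `extract_schema=True` on `JSONExtractionConfig`, the extractor stores a type outline of the document under `metadata["json_schema"]`. A hand-evaluated trace of the method above on a small document (assuming the default `max_depth` and `array_item_limit` are not hit):

```python
data = {"title": "Report", "tags": ["a", "b"]}

# _extract_json_schema(data) walks the structure and returns:
# {
#     "type": "dict",
#     "properties": {
#         "title": {"type": "str"},
#         "tags": {"type": "list", "items": {"type": "str"}, "length": 2},
#     },
# }
```

Only the first element of a list is sampled for `items`, which keeps the walk cheap on large homogeneous arrays.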
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@ class StructuredDataExtractor(Extractor):
                     )
                 data = yaml.safe_load(text_content)

-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}

+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@ class StructuredDataExtractor(Extractor):
             else:
                 text_parts = [str(data)]

-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))

             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@ class StructuredDataExtractor(Extractor):

         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@ class StructuredDataExtractor(Extractor):
             full_key = f"{prefix}.{key}" if prefix else key

             if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

                 key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value

             elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")

             elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))

             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))

             elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")

         return text_parts

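With `include_type_info` enabled, flattened keys gain a type tag, and with `flatten_nested_objects=False` a dict collapses to a one-line summary instead of being recursed into. Illustrative output for a hypothetical document with both flags set:

```python
# Hypothetical input; flag names come from JSONExtractionConfig as used above.
data = {"name": "Ada", "age": 36, "address": {"city": "London"}}

# The flattened text parts would be:
#   name (string): Ada
#   age (int): 36
#   address: [nested object with 1 properties]
# (the f-string does not pluralize "properties")
```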
@@ -140,7 +201,10 @@ class StructuredDataExtractor(Extractor):
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"

             if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")

             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@ class StructuredDataExtractor(Extractor):
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))

             elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")

         return text_parts
kreuzberg/_gmft.py CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
     from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
     from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415

+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)

     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
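`config_dict` crosses a process boundary, and serialization formats like JSON stringify integer dict keys, so `cell_required_confidence` can arrive with keys `"0"`, `"1"`, ... instead of `0`, `1`, ...; the new guard coerces them back before `GMFTConfig` is built. The hunk does not show which serializer kreuzberg uses for this hop; JSON below is purely to demonstrate the key-type loss:

```python
import json

original = {0: 0.3, 1: 0.5}  # int keys, as the table-confidence mapping expects
round_tripped = json.loads(json.dumps(original))
print(round_tripped)         # {'0': 0.3, '1': 0.5} - keys became str

restored = {int(k): v for k, v in round_tripped.items()}
print(restored == original)  # True
```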