kreuzberg-3.14.1-py3-none-any.whl → kreuzberg-3.16.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -2,13 +2,10 @@ from __future__ import annotations
 
 import contextlib
 import csv
-import os
 import sys
-import tempfile
 from datetime import date, datetime, time, timedelta
 from io import StringIO
-from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import polars as pl
 from anyio import Path as AsyncPath
@@ -21,9 +18,12 @@ from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._table import enhance_table_markdown
-from kreuzberg._utils._tmp import create_temp_file
+from kreuzberg._utils._tmp import create_temp_file, temporary_file, temporary_file_sync
 from kreuzberg.exceptions import ParsingError
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
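Note: the `pathlib.Path` import moves under `if TYPE_CHECKING:` because `Path` is now used only in annotations, which `from __future__ import annotations` keeps unevaluated at runtime. A minimal sketch of the pattern (generic Python, not taken from this package):

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pathlib import Path  # seen by type checkers only; no runtime import


def read_all(path: Path) -> bytes:
    # The annotation stays a string at runtime, so pathlib never loads
    # unless a type checker evaluates it.
    with open(path, "rb") as f:
        return f.read()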
@@ -48,12 +48,8 @@ class SpreadSheetExtractor(Extractor):
 
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        xlsx_path, unlink = await create_temp_file(file_extension)
-        await AsyncPath(xlsx_path).write_bytes(content)
-        try:
+        async with temporary_file(file_extension, content) as xlsx_path:
             return await self.extract_path_async(xlsx_path)
-        finally:
-            await unlink()
 
     async def extract_path_async(self, path: Path) -> ExtractionResult:
         try:
@@ -86,16 +82,8 @@ class SpreadSheetExtractor(Extractor):
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         file_extension = self._get_file_extension()
-        fd, temp_path = tempfile.mkstemp(suffix=file_extension)
-
-        try:
-            with os.fdopen(fd, "wb") as f:
-                f.write(content)
-
-            return self.extract_path_sync(Path(temp_path))
-        finally:
-            with contextlib.suppress(OSError):
-                Path(temp_path).unlink()
+        with temporary_file_sync(file_extension, content) as temp_path:
+            return self.extract_path_sync(temp_path)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         try:
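Both temp-file refactors above delegate to the new `temporary_file` / `temporary_file_sync` helpers in `kreuzberg._utils._tmp` (+37 -1 in this release); their bodies are not shown in this diff. A plausible sketch of such context managers, assuming mkstemp-based implementations:

import contextlib
import os
import tempfile
from collections.abc import AsyncIterator, Iterator
from pathlib import Path


@contextlib.contextmanager
def temporary_file_sync(extension: str, content: bytes) -> Iterator[Path]:
    # Write content to a named temp file, yield its path, always unlink.
    fd, raw_path = tempfile.mkstemp(suffix=extension)
    path = Path(raw_path)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(content)
        yield path
    finally:
        with contextlib.suppress(OSError):
            path.unlink()


@contextlib.asynccontextmanager
async def temporary_file(extension: str, content: bytes) -> AsyncIterator[Path]:
    # Async wrapper over the sync variant; the real helper presumably uses
    # anyio for non-blocking writes.
    with temporary_file_sync(extension, content) as path:
        yield path

Either way, cleanup lives in one place instead of being re-implemented with try/finally in every extractor.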
@@ -122,15 +110,17 @@ class SpreadSheetExtractor(Extractor):
 
     @staticmethod
    def _convert_cell_to_str(value: Any) -> str:
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (datetime, date, time)):
-            return value.isoformat()
-        if isinstance(value, timedelta):
-            return f"{value.total_seconds()} seconds"
-        return str(value)
+        match value:
+            case None:
+                return ""
+            case bool():
+                return str(value).lower()
+            case datetime() | date() | time():
+                return value.isoformat()
+            case timedelta():
+                return f"{value.total_seconds()} seconds"
+            case _:
+                return str(value)
 
     async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
         values = workbook.get_sheet_by_name(sheet_name).to_python()
@@ -207,7 +197,11 @@ class SpreadSheetExtractor(Extractor):
         if not data or not any(row for row in data):
             return f"## {sheet_name}\n\n*Empty sheet*"
 
-        df = pl.DataFrame(data)
+        if data:
+            max_cols = max(len(row) if row else 0 for row in data)
+            data = [row + [None] * (max_cols - len(row)) if row else [None] * max_cols for row in data]  # type: ignore[list-item]
+
+        df = pl.DataFrame(data, strict=False)
 
         df = df.filter(~pl.all_horizontal(pl.all().is_null()))
         df = df.select([col for col in df.columns if not df[col].is_null().all()])
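The padding above matters because `pl.DataFrame` rejects ragged input: calamine can return rows of unequal length for sparsely filled sheets, and constructing a frame from them fails. A small standalone illustration (example data, not package code):

import polars as pl

rows = [["a", 1, True], ["b", 2]]  # ragged: second row is one cell short

max_cols = max(len(row) for row in rows)
padded = [row + [None] * (max_cols - len(row)) for row in rows]

# strict=False additionally lets mixed values within a column be cast to a
# common supertype instead of raising; orient="row" is explicit here.
df = pl.DataFrame(padded, strict=False, orient="row")
print(df.shape)  # (2, 3)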
kreuzberg/_extractors/_structured.py CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import json
 import sys
 from typing import TYPE_CHECKING, Any, ClassVar
 
@@ -17,11 +16,13 @@ try:
 except ImportError:  # pragma: no cover
     yaml = None
 
+
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
-from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._types import ExtractionResult, JSONExtractionConfig, normalize_metadata
+from kreuzberg._utils._serialization import deserialize
 from kreuzberg._utils._string import normalize_spaces, safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -43,6 +44,42 @@ class StructuredDataExtractor(Extractor):
         "text/toml",
     }
 
+    @property
+    def _json_config(self) -> JSONExtractionConfig | None:
+        return self.config.json_config
+
+    def _get_text_field_keywords(self) -> frozenset[str]:
+        json_config = self._json_config
+        if json_config and json_config.custom_text_field_patterns:
+            return _TEXT_FIELD_KEYWORDS | json_config.custom_text_field_patterns
+        return _TEXT_FIELD_KEYWORDS
+
+    def _extract_json_schema(self, data: Any, path: str = "", depth: int = 0) -> dict[str, Any]:
+        json_config = self._json_config
+        if not json_config or not json_config.extract_schema:
+            return {}
+
+        if depth >= json_config.max_depth:
+            return {"max_depth_reached": True}
+
+        schema_info: dict[str, Any] = {"type": type(data).__name__}
+
+        if isinstance(data, dict):
+            schema_info["properties"] = {}
+            for key, value in data.items():
+                key_path = f"{path}.{key}" if path else key
+                schema_info["properties"][key] = self._extract_json_schema(value, key_path, depth + 1)
+        elif isinstance(data, list) and data:
+            if len(data) <= json_config.array_item_limit:
+                schema_info["items"] = self._extract_json_schema(data[0], f"{path}[0]", depth + 1)
+                schema_info["length"] = len(data)
+            else:
+                schema_info["items"] = {"type": "truncated"}
+                schema_info["length"] = len(data)
+                schema_info["truncated"] = True
+
+        return schema_info
+
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
         return await run_sync(self.extract_bytes_sync, content)
 
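To make the new schema walk concrete: for a small JSON payload, `_extract_json_schema` produces nested type descriptors like the following (illustrative, assuming `extract_schema=True` with default depth and array limits):

data = {"title": "Report", "tags": ["a", "b"], "meta": {"pages": 3}}

# metadata["json_schema"] then has roughly this shape:
schema = {
    "type": "dict",
    "properties": {
        "title": {"type": "str"},
        "tags": {"type": "list", "items": {"type": "str"}, "length": 2},
        "meta": {"type": "dict", "properties": {"pages": {"type": "int"}}},
    },
}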
@@ -51,12 +88,12 @@ class StructuredDataExtractor(Extractor):
         return await self.extract_bytes_async(content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        text_content = safe_decode(content)
-
+        text_content: None | str = None
         try:
             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
-                data = json.loads(text_content)
+                data = deserialize(content, dict, json=True)
             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                text_content = safe_decode(content)
                 if tomllib is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -66,6 +103,7 @@
                     )
                 data = tomllib.loads(text_content)
             else:
+                text_content = safe_decode(content)
                 if yaml is None:
                     return ExtractionResult(
                         content=normalize_spaces(text_content),
@@ -75,9 +113,17 @@
                     )
                 data = yaml.safe_load(text_content)
 
-            text_parts: list[str] = []
             metadata: dict[str, Any] = {}
 
+            if (
+                self.mime_type in {JSON_MIME_TYPE, "text/json"}
+                and self._json_config
+                and self._json_config.extract_schema
+            ):
+                schema_info = self._extract_json_schema(data)
+                if schema_info:
+                    metadata["json_schema"] = schema_info
+
             if isinstance(data, dict):
                 text_parts = self._extract_from_dict(data, metadata)
             elif isinstance(data, list):
@@ -85,7 +131,7 @@
             else:
                 text_parts = [str(data)]
 
-            combined_text = "\n".join(text_parts) if text_parts else text_content
+            combined_text = "\n".join(text_parts) if text_parts else (text_content or safe_decode(content))
 
             return ExtractionResult(
                 content=normalize_spaces(combined_text),
@@ -96,7 +142,7 @@
 
         except (ValueError, TypeError) as e:
             return ExtractionResult(
-                content=normalize_spaces(text_content),
+                content=normalize_spaces(text_content or safe_decode(content)),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
                 metadata={"parse_error": str(e)},
                 chunks=[],
@@ -113,23 +159,38 @@
             full_key = f"{prefix}.{key}" if prefix else key
 
             if isinstance(value, str) and value.strip():
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{full_key} (string): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
                 key_lower = key.lower()
-                if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
+                text_field_keywords = self._get_text_field_keywords()
+                if any(keyword in key_lower for keyword in text_field_keywords):
                     metadata[full_key] = value
 
             elif isinstance(value, (int, float, bool)):
-                text_parts.append(f"{full_key}: {value}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value}")
+                else:
+                    text_parts.append(f"{full_key}: {value}")
 
             elif isinstance(value, dict):
-                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+                if self._json_config and not self._json_config.flatten_nested_objects:
+                    text_parts.append(f"{full_key}: [nested object with {len(value)} properties]")
+                else:
+                    text_parts.extend(self._extract_from_dict(value, metadata, full_key))
 
             elif isinstance(value, list):
                 text_parts.extend(self._extract_from_list(value, metadata, full_key))
 
             elif value is not None:
-                text_parts.append(f"{full_key}: {value!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(value).__name__
+                    text_parts.append(f"{full_key} ({type_name}): {value!s}")
+                else:
+                    text_parts.append(f"{full_key}: {value!s}")
 
         return text_parts
 
@@ -140,7 +201,10 @@
             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
 
             if isinstance(item, str) and item.strip():
-                text_parts.append(f"{item_key}: {item}")
+                if self._json_config and self._json_config.include_type_info:
+                    text_parts.append(f"{item_key} (string): {item}")
+                else:
+                    text_parts.append(f"{item_key}: {item}")
 
             elif isinstance(item, dict):
                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
@@ -149,6 +213,10 @@
                 text_parts.extend(self._extract_from_list(item, metadata, item_key))
 
             elif item is not None:
-                text_parts.append(f"{item_key}: {item!s}")
+                if self._json_config and self._json_config.include_type_info:
+                    type_name = type(item).__name__
+                    text_parts.append(f"{item_key} ({type_name}): {item!s}")
+                else:
+                    text_parts.append(f"{item_key}: {item!s}")
 
         return text_parts
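Taken together, the new `JSONExtractionConfig` switches change the flattened text output roughly as follows (illustrative, derived from the branches above):

data = {"title": "Report", "meta": {"pages": 3}}

# Default flattening:
#   title: Report
#   meta.pages: 3
#
# With include_type_info=True:
#   title (string): Report
#   meta.pages (int): 3
#
# With flatten_nested_objects=False:
#   title: Report
#   meta: [nested object with 1 properties]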
kreuzberg/_gmft.py CHANGED
@@ -312,6 +312,11 @@ def _extract_tables_in_process(
     from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
     from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
+    if "cell_required_confidence" in config_dict:
+        cell_config = config_dict["cell_required_confidence"]
+        if isinstance(cell_config, dict) and cell_config:
+            config_dict["cell_required_confidence"] = {int(k): v for k, v in cell_config.items()}
+
     config = GMFTConfig(**config_dict)
 
     formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
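The `cell_required_confidence` fix-up guards against a serialization round-trip: the config dict crosses a process boundary, and common encoders (JSON among them) stringify integer dict keys, which GMFTConfig would then reject. A minimal reproduction with the standard library:

import json

cell_required_confidence = {0: 0.3, 1: 0.5}

# A JSON round-trip turns the int keys into strings...
round_tripped = json.loads(json.dumps(cell_required_confidence))
print(round_tripped)  # {'0': 0.3, '1': 0.5}

# ...so the hunk above coerces them back before building GMFTConfig:
restored = {int(k): v for k, v in round_tripped.items()}
print(restored)  # {0: 0.3, 1: 0.5}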