kreuzberg 3.7.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the registry.
Files changed (42)
  1. kreuzberg/_entity_extraction.py +1 -2
  2. kreuzberg/_extractors/_base.py +39 -1
  3. kreuzberg/_extractors/_email.py +149 -0
  4. kreuzberg/_extractors/_html.py +15 -3
  5. kreuzberg/_extractors/_image.py +21 -36
  6. kreuzberg/_extractors/_pandoc.py +3 -14
  7. kreuzberg/_extractors/_pdf.py +81 -48
  8. kreuzberg/_extractors/_presentation.py +62 -10
  9. kreuzberg/_extractors/_spread_sheet.py +179 -4
  10. kreuzberg/_extractors/_structured.py +148 -0
  11. kreuzberg/_gmft.py +314 -7
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_ocr/__init__.py +10 -1
  14. kreuzberg/_ocr/_base.py +59 -0
  15. kreuzberg/_ocr/_easyocr.py +91 -0
  16. kreuzberg/_ocr/_paddleocr.py +89 -0
  17. kreuzberg/_ocr/_tesseract.py +564 -4
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +52 -4
  21. kreuzberg/_utils/_errors.py +3 -7
  22. kreuzberg/_utils/_process_pool.py +180 -7
  23. kreuzberg/_utils/_quality.py +237 -0
  24. kreuzberg/_utils/_serialization.py +4 -2
  25. kreuzberg/_utils/_string.py +153 -10
  26. kreuzberg/_utils/_sync.py +5 -2
  27. kreuzberg/_utils/_table.py +261 -0
  28. kreuzberg/cli.py +1 -2
  29. kreuzberg/extraction.py +4 -22
  30. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/METADATA +58 -54
  31. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  32. kreuzberg/_multiprocessing/__init__.py +0 -6
  33. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  34. kreuzberg/_multiprocessing/process_manager.py +0 -189
  35. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  36. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  37. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  38. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  39. kreuzberg-3.7.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_presentation.py
@@ -30,6 +30,9 @@ if TYPE_CHECKING:  # pragma: no cover
 
     from kreuzberg._types import Metadata
 
+# Pre-compiled regex patterns for performance
+_NON_WORD_PATTERN = re.compile(r"\W")
+
 
 class PresentationExtractor(Extractor):
     """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
 
-                filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
                 md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
 
             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
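Review note: hoisting the pattern to module level (the two hunks above) compiles it once at import time instead of resolving it on every image shape. A minimal standalone sketch of the trade-off — not from the package, and note that re.sub also caches compiled patterns internally, so the saving is the per-call cache lookup and argument handling:

import re
import timeit

# Compiled once at import time, mirroring _NON_WORD_PATTERN above.
_NON_WORD = re.compile(r"\W")

def sanitize_precompiled(name: str) -> str:
    # Reuses the compiled pattern object directly.
    return _NON_WORD.sub("", name) + ".jpg"

def sanitize_inline(name: str) -> str:
    # re.sub fetches the compiled pattern from re's internal cache on every call.
    return re.sub(r"\W", "", name) + ".jpg"

if __name__ == "__main__":
    print(sanitize_precompiled("Picture 3 (cropped)"))  # Picture3cropped.jpg
    for fn in (sanitize_precompiled, sanitize_inline):
        elapsed = timeit.timeit(lambda f=fn: f("Picture 3"), number=100_000)
        print(f"{fn.__name__}: {elapsed:.3f}s")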
@@ -162,7 +165,10 @@ class PresentationExtractor(Extractor):
                 md_content += "\n" + html_table + "\n"
 
             elif shape.has_text_frame:
-                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
+                if shape == title:
+                    md_content += "# " + shape.text.lstrip() + "\n"
+                else:
+                    md_content += shape.text + "\n"
 
         md_content = md_content.strip()
         if slide.has_notes_slide:
@@ -174,13 +180,15 @@ class PresentationExtractor(Extractor):
 
         md_content = md_content.strip()
 
-        return ExtractionResult(
+        result = ExtractionResult(
             content=normalize_spaces(md_content),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata=self._extract_presentation_metadata(presentation),
             chunks=[],
         )
 
+        return self._apply_quality_processing(result)
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@ class PresentationExtractor(Extractor):
         """
         metadata: Metadata = {}
 
-        for metadata_key, core_property_key in [
+        # Extract core properties
+        PresentationExtractor._extract_core_properties(presentation, metadata)
+
+        # Extract fonts used in presentation
+        fonts = PresentationExtractor._extract_fonts(presentation)
+        if fonts:
+            metadata["fonts"] = list(fonts)
+
+        # Add structural information
+        PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
+
+        return metadata
+
+    @staticmethod
+    def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
+        """Extract core document properties from presentation."""
+        # Property mapping for core metadata
+        property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
             ("status", "content_status"),
@@ -205,17 +230,22 @@ class PresentationExtractor(Extractor):
             ("version", "revision"),
             ("subject", "subject"),
             ("title", "title"),
-            ("version", "version"),
-        ]:
+        ]
+
+        for metadata_key, core_property_key in property_mapping:
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]
 
+        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]
 
         if presentation.core_properties.category:
             metadata["categories"] = [presentation.core_properties.category]
 
+    @staticmethod
+    def _extract_fonts(presentation: Presentation) -> set[str]:
+        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -226,8 +256,30 @@ class PresentationExtractor(Extractor):
                     for run in paragraph.runs:
                         if hasattr(run, "font") and run.font.name:
                             fonts.add(run.font.name)
+        return fonts
 
-        if fonts:
-            metadata["fonts"] = list(fonts)
-
-        return metadata
+    @staticmethod
+    def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
+        """Add structural information about the presentation."""
+        slide_count = len(presentation.slides)
+        if slide_count == 0:
+            return
+
+        # Build description
+        structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
+
+        slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
+        if slides_with_notes > 0:
+            structure_info += f", {slides_with_notes} with notes"
+
+        metadata["description"] = structure_info
+
+        # Build summary if not already present
+        if "summary" not in metadata:
+            summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
+            if slides_with_notes > 0:
+                summary_parts.append(f"{slides_with_notes} slides have notes")
+            if fonts:
+                summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
+
+            metadata["summary"] = f"{'. '.join(summary_parts)}."
kreuzberg/_extractors/_spread_sheet.py
@@ -13,7 +13,7 @@ from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -45,9 +45,14 @@ class SpreadSheetExtractor(Extractor):
         try:
             results: list[str] = await run_taskgroup(*tasks)
 
-            return ExtractionResult(
-                content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[]
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@ class SpreadSheetExtractor(Extractor):
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)
 
-            return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@ class SpreadSheetExtractor(Extractor):
         result = "\n".join(markdown_lines)
 
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."
kreuzberg/_extractors/_structured.py (new file)
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces, safe_decode
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+class StructuredDataExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+        JSON_MIME_TYPE,
+        "text/json",
+        YAML_MIME_TYPE,
+        "text/yaml",
+        "text/x-yaml",
+        "application/yaml",
+        TOML_MIME_TYPE,
+        "text/toml",
+    }
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        text_content = safe_decode(content)
+
+        try:
+            if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                data = json.loads(text_content)
+            elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                try:
+                    import tomllib  # type: ignore[import-not-found]
+                except ImportError:
+                    try:
+                        import tomli as tomllib  # type: ignore[import-not-found]
+                    except ImportError:
+                        return ExtractionResult(
+                            content=normalize_spaces(text_content),
+                            mime_type=PLAIN_TEXT_MIME_TYPE,
+                            metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                            chunks=[],
+                        )
+                data = tomllib.loads(text_content)
+            else:
+                try:
+                    import yaml
+
+                    data = yaml.safe_load(text_content)
+                except ImportError:
+                    return ExtractionResult(
+                        content=normalize_spaces(text_content),
+                        mime_type=PLAIN_TEXT_MIME_TYPE,
+                        metadata={"warning": "PyYAML not available, returning raw text"},
+                        chunks=[],
+                    )
+
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            if isinstance(data, dict):
+                text_parts.extend(self._extract_from_dict(data, metadata))
+            elif isinstance(data, list):
+                text_parts.extend(self._extract_from_list(data, metadata))
+            else:
+                text_parts.append(str(data))
+
+            combined_text = "\n".join(text_parts) if text_parts else text_content
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+            return ExtractionResult(
+                content=normalize_spaces(text_content),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata={"parse_error": str(e)},
+                chunks=[],
+            )
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
+
+    def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for key, value in data.items():
+            full_key = f"{prefix}.{key}" if prefix else key
+
+            if isinstance(value, str) and value.strip():
+                text_parts.append(f"{full_key}: {value}")
+
+                if any(
+                    text_field in key.lower()
+                    for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                ):
+                    metadata[full_key] = value
+
+            elif isinstance(value, (int, float, bool)):
+                text_parts.append(f"{full_key}: {value}")
+
+            elif isinstance(value, dict):
+                text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+            elif isinstance(value, list):
+                text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+            elif value is not None:
+                text_parts.append(f"{full_key}: {value!s}")
+
+        return text_parts
+
+    def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+        text_parts = []
+
+        for i, item in enumerate(data):
+            item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+            if isinstance(item, str) and item.strip():
+                text_parts.append(f"{item_key}: {item}")
+
+            elif isinstance(item, dict):
+                text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+            elif isinstance(item, list):
+                text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+            elif item is not None:
+                text_parts.append(f"{item_key}: {item!s}")
+
+        return text_parts
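Review note: a usage sketch for the new extractor. The Extractor base constructor lives in _base.py and is not shown in this diff, so the keyword argument below is an assumption (the real signature may also require a config object); the traced output follows directly from _extract_from_dict/_extract_from_list above, modulo normalize_spaces:

from kreuzberg._extractors._structured import StructuredDataExtractor

payload = b'{"title": "Q3 Report", "author": {"name": "Ada"}, "tags": ["sales", "emea"]}'

# Hypothetical construction; check Extractor.__init__ for the real signature.
extractor = StructuredDataExtractor(mime_type="application/json")
result = extractor.extract_bytes_sync(payload)

print(result.content)
# title: Q3 Report
# author.name: Ada
# tags[0]: sales
# tags[1]: emea
#
# "title" and "author.name" match the text-field heuristic, so they are also
# offered to normalize_metadata() for inclusion in result.metadata.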