kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,12 +1,3 @@
1
- """This module provides functions to extract textual content from files.
2
-
3
- It includes vendored code:
4
-
5
- - The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
6
- See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
7
- Refer to the markitdown repository for it's license (MIT).
8
- """
9
-
10
1
  from __future__ import annotations
11
2
 
12
3
  import re
@@ -30,99 +21,27 @@ if TYPE_CHECKING: # pragma: no cover
30
21
 
31
22
  from kreuzberg._types import Metadata
32
23
 
33
- # Pre-compiled regex patterns for performance
34
24
  _NON_WORD_PATTERN = re.compile(r"\W")
35
25
 
36
26
 
37
27
  class PresentationExtractor(Extractor):
38
- """Extractor for PowerPoint (.pptx) files.
39
-
40
- This extractor processes PowerPoint presentations and converts their content into Markdown format.
41
- It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
42
- of the presentation in a readable text format.
43
-
44
- The extractor provides both synchronous and asynchronous methods for processing files either
45
- from disk or from bytes in memory.
46
- """
47
-
48
28
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}
49
29
 
50
30
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
51
- """Asynchronously extract content from PowerPoint file bytes.
52
-
53
- Args:
54
- content: Raw bytes of the PowerPoint file to process.
55
-
56
- Returns:
57
- ExtractionResult: Contains the extracted content in Markdown format,
58
- the MIME type, and any additional metadata.
59
- """
60
31
  return self._extract_pptx(content)
61
32
 
62
33
  async def extract_path_async(self, path: Path) -> ExtractionResult:
63
- """Asynchronously extract content from a PowerPoint file on disk.
64
-
65
- Args:
66
- path: Path to the PowerPoint file to process.
67
-
68
- Returns:
69
- ExtractionResult: Contains the extracted content in Markdown format,
70
- the MIME type, and any additional metadata.
71
- """
72
34
  content = await AsyncPath(path).read_bytes()
73
35
  return self._extract_pptx(content)
74
36
 
75
37
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
76
- """Synchronously extract content from PowerPoint file bytes.
77
-
78
- Args:
79
- content: Raw bytes of the PowerPoint file to process.
80
-
81
- Returns:
82
- ExtractionResult: Contains the extracted content in Markdown format,
83
- the MIME type, and any additional metadata.
84
- """
85
38
  return self._extract_pptx(content)
86
39
 
87
40
  def extract_path_sync(self, path: Path) -> ExtractionResult:
88
- """Synchronously extract content from a PowerPoint file on disk.
89
-
90
- Args:
91
- path: Path to the PowerPoint file to process.
92
-
93
- Returns:
94
- ExtractionResult: Contains the extracted content in Markdown format,
95
- the MIME type, and any additional metadata.
96
- """
97
41
  content = Path(path).read_bytes()
98
42
  return self._extract_pptx(content)
99
43
 
100
44
  def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
101
- """Process PowerPoint file contents and convert to Markdown.
102
-
103
- This method handles the core logic of extracting content from a PowerPoint file.
104
- It processes:
105
- - Slide titles and content
106
- - Images (with alt text if available)
107
- - Tables (converted to HTML format)
108
- - Text frames
109
- - Slide notes
110
-
111
- Args:
112
- file_contents: Raw bytes of the PowerPoint file to process.
113
-
114
- Returns:
115
- ExtractionResult: Contains the extracted content in Markdown format,
116
- the MIME type, and any additional metadata.
117
-
118
- Notes:
119
- The extraction preserves the following elements:
120
- - Slide numbers (as HTML comments)
121
- - Images (converted to Markdown image syntax with alt text)
122
- - Tables (converted to HTML table syntax)
123
- - Text content (with titles properly formatted)
124
- - Slide notes (under a dedicated section for each slide)
125
- """
126
45
  md_content = ""
127
46
  presentation = pptx.Presentation(BytesIO(file_contents))
128
47
 
@@ -191,33 +110,20 @@ class PresentationExtractor(Extractor):
191
110
 
192
111
  @staticmethod
193
112
  def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
194
- """Extract metadata from a presentation instance.
195
-
196
- Args:
197
- presentation: A `Presentation` object representing the PowerPoint file.
198
-
199
- Returns:
200
- PresentationMetadata: Object containing presentation-specific metadata fields.
201
- """
202
113
  metadata: Metadata = {}
203
114
 
204
- # Extract core properties
205
115
  PresentationExtractor._extract_core_properties(presentation, metadata)
206
116
 
207
- # Extract fonts used in presentation
208
117
  fonts = PresentationExtractor._extract_fonts(presentation)
209
118
  if fonts:
210
119
  metadata["fonts"] = list(fonts)
211
120
 
212
- # Add structural information
213
121
  PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
214
122
 
215
123
  return metadata
216
124
 
217
125
  @staticmethod
218
126
  def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
219
- """Extract core document properties from presentation."""
220
- # Property mapping for core metadata
221
127
  property_mapping = [
222
128
  ("authors", "author"),
223
129
  ("comments", "comments"),
@@ -236,7 +142,6 @@ class PresentationExtractor(Extractor):
236
142
  if core_property := getattr(presentation.core_properties, core_property_key, None):
237
143
  metadata[metadata_key] = core_property # type: ignore[literal-required]
238
144
 
239
- # Handle special list properties
240
145
  if presentation.core_properties.language:
241
146
  metadata["languages"] = [presentation.core_properties.language]
242
147
 
@@ -245,7 +150,6 @@ class PresentationExtractor(Extractor):
245
150
 
246
151
  @staticmethod
247
152
  def _extract_fonts(presentation: Presentation) -> set[str]:
248
- """Extract all fonts used in the presentation."""
249
153
  fonts = set()
250
154
  for slide in presentation.slides:
251
155
  for shape in slide.shapes:
@@ -260,12 +164,10 @@ class PresentationExtractor(Extractor):
260
164
 
261
165
  @staticmethod
262
166
  def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
263
- """Add structural information about the presentation."""
264
167
  slide_count = len(presentation.slides)
265
168
  if slide_count == 0:
266
169
  return
267
170
 
268
- # Build description
269
171
  structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
270
172
 
271
173
  slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
@@ -274,7 +176,6 @@ class PresentationExtractor(Extractor):
274
176
 
275
177
  metadata["description"] = structure_info
276
178
 
277
- # Build summary if not already present
278
179
  if "summary" not in metadata:
279
180
  summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
280
181
  if slides_with_notes > 0:
@@ -10,15 +10,17 @@ from io import StringIO
10
10
  from pathlib import Path
11
11
  from typing import Any
12
12
 
13
+ import polars as pl
13
14
  from anyio import Path as AsyncPath
14
15
  from PIL import Image
15
16
  from python_calamine import CalamineWorkbook
16
17
 
17
18
  from kreuzberg._extractors._base import Extractor
18
19
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
19
- from kreuzberg._types import ExtractionResult, Metadata
20
+ from kreuzberg._types import ExtractionResult, Metadata, TableData
20
21
  from kreuzberg._utils._string import normalize_spaces
21
22
  from kreuzberg._utils._sync import run_sync, run_taskgroup
23
+ from kreuzberg._utils._table import enhance_table_markdown
22
24
  from kreuzberg._utils._tmp import create_temp_file
23
25
  from kreuzberg.exceptions import ParsingError
24
26
 
@@ -70,7 +72,6 @@ class SpreadSheetExtractor(Extractor):
70
72
  ) from e
71
73
 
72
74
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
73
- """Pure sync implementation of extract_bytes."""
74
75
  fd, temp_path = tempfile.mkstemp(suffix=".xlsx")
75
76
 
76
77
  try:
@@ -83,7 +84,6 @@ class SpreadSheetExtractor(Extractor):
83
84
  Path(temp_path).unlink()
84
85
 
85
86
  def extract_path_sync(self, path: Path) -> ExtractionResult:
86
- """Pure sync implementation of extract_path."""
87
87
  try:
88
88
  workbook = CalamineWorkbook.from_path(str(path))
89
89
  results = []
@@ -108,14 +108,6 @@ class SpreadSheetExtractor(Extractor):
108
108
 
109
109
  @staticmethod
110
110
  def _convert_cell_to_str(value: Any) -> str:
111
- """Convert a cell value to string representation.
112
-
113
- Args:
114
- value: The cell value to convert.
115
-
116
- Returns:
117
- String representation of the cell value.
118
- """
119
111
  if value is None:
120
112
  return ""
121
113
  if isinstance(value, bool):
@@ -139,7 +131,7 @@ class SpreadSheetExtractor(Extractor):
139
131
  csv_buffer.close()
140
132
 
141
133
  csv_path, unlink = await create_temp_file(".csv")
142
- await AsyncPath(csv_path).write_text(csv_data)
134
+ await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
143
135
 
144
136
  csv_reader = csv.reader(StringIO(csv_data))
145
137
  rows = list(csv_reader)
@@ -162,7 +154,6 @@ class SpreadSheetExtractor(Extractor):
162
154
  return f"## {sheet_name}\n\n{normalize_spaces(result)}"
163
155
 
164
156
  def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
165
- """Synchronous version of _convert_sheet_to_text."""
166
157
  values = workbook.get_sheet_by_name(sheet_name).to_python()
167
158
 
168
159
  csv_buffer = StringIO()
@@ -195,82 +186,57 @@ class SpreadSheetExtractor(Extractor):
195
186
  return f"## {sheet_name}\n\n{normalize_spaces(result)}"
196
187
 
197
188
  def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
198
- """Enhanced sheet processing with better table structure preservation."""
199
189
  try:
200
- # pandas is optional dependency
201
- import pandas as pd # noqa: PLC0415
202
-
203
- from kreuzberg._utils._table import enhance_table_markdown # noqa: PLC0415
204
-
205
190
  sheet = workbook.get_sheet_by_name(sheet_name)
206
191
  data = sheet.to_python()
207
192
 
208
193
  if not data or not any(row for row in data):
209
194
  return f"## {sheet_name}\n\n*Empty sheet*"
210
195
 
211
- # Convert to DataFrame
212
- df = pd.DataFrame(data)
196
+ df = pl.DataFrame(data)
213
197
 
214
- # Clean up empty rows and columns
215
- df = df.dropna(how="all").dropna(axis=1, how="all")
198
+ df = df.filter(~pl.all_horizontal(pl.all().is_null()))
199
+ df = df.select([col for col in df.columns if not df[col].is_null().all()])
216
200
 
217
- if df.empty:
201
+ if df.is_empty():
218
202
  return f"## {sheet_name}\n\n*No data*"
219
203
 
220
- # Create a mock TableData for enhanced formatting
221
- from kreuzberg._types import TableData # noqa: PLC0415
222
-
223
- # Create a 1x1 transparent image as placeholder
224
204
  placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
225
205
  mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
226
206
 
227
207
  enhanced_markdown = enhance_table_markdown(mock_table)
228
208
  return f"## {sheet_name}\n\n{enhanced_markdown}"
229
209
 
230
- except (ImportError, AttributeError, ValueError):
231
- # Fallback to original method if pandas/table enhancement fails
210
+ except (AttributeError, ValueError):
232
211
  return self._convert_sheet_to_text_sync(workbook, sheet_name)
233
212
 
234
213
  @staticmethod
235
214
  def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
236
- """Extract metadata from spreadsheet using python-calamine.
237
-
238
- Args:
239
- workbook: CalamineWorkbook instance
240
-
241
- Returns:
242
- Metadata dict using existing metadata keys where possible
243
- """
244
215
  metadata: Metadata = {}
245
216
 
246
- # Extract basic document properties
247
217
  SpreadSheetExtractor._extract_document_properties(workbook, metadata)
248
218
 
249
- # Add structural information
250
219
  SpreadSheetExtractor._add_structure_info(workbook, metadata)
251
220
 
252
- # Analyze content complexity
253
221
  SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
254
222
 
255
223
  return metadata
256
224
 
257
225
  @staticmethod
258
226
  def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
259
- """Extract basic document properties from workbook."""
260
227
  with contextlib.suppress(AttributeError, Exception):
261
228
  if not (hasattr(workbook, "metadata") and workbook.metadata):
262
229
  return
263
230
 
264
231
  props = workbook.metadata
265
232
 
266
- # Basic properties mapping
267
233
  property_mapping = {
268
234
  "title": "title",
269
- "author": "authors", # Convert to list
235
+ "author": "authors",
270
236
  "subject": "subject",
271
237
  "comments": "comments",
272
- "keywords": "keywords", # Process separately
273
- "category": "categories", # Convert to list
238
+ "keywords": "keywords",
239
+ "category": "categories",
274
240
  "company": "organization",
275
241
  "manager": "modified_by",
276
242
  }
@@ -286,12 +252,10 @@ class SpreadSheetExtractor(Extractor):
286
252
  else:
287
253
  metadata[meta_key] = value # type: ignore[literal-required]
288
254
 
289
- # Handle dates separately
290
255
  SpreadSheetExtractor._extract_date_properties(props, metadata)
291
256
 
292
257
  @staticmethod
293
258
  def _extract_date_properties(props: Any, metadata: Metadata) -> None:
294
- """Extract and format date properties."""
295
259
  date_mapping = {"created": "created_at", "modified": "modified_at"}
296
260
 
297
261
  for prop_name, meta_key in date_mapping.items():
@@ -304,14 +268,12 @@ class SpreadSheetExtractor(Extractor):
304
268
 
305
269
  @staticmethod
306
270
  def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
307
- """Add structural information about the spreadsheet."""
308
271
  if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
309
272
  return
310
273
 
311
274
  sheet_count = len(workbook.sheet_names)
312
275
  structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
313
276
 
314
- # Don't list too many sheet names (magic number made constant)
315
277
  max_sheet_names_to_list = 5
316
278
  if sheet_count <= max_sheet_names_to_list:
317
279
  structure_info += f": {', '.join(workbook.sheet_names)}"
@@ -320,12 +282,10 @@ class SpreadSheetExtractor(Extractor):
320
282
 
321
283
  @staticmethod
322
284
  def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
323
- """Analyze spreadsheet content for complexity indicators."""
324
285
  with contextlib.suppress(Exception):
325
286
  has_formulas = False
326
287
  total_cells = 0
327
288
 
328
- # Check only first few sheets for performance
329
289
  max_sheets_to_check = 3
330
290
  max_rows_to_check = 50
331
291
 
@@ -335,17 +295,15 @@ class SpreadSheetExtractor(Extractor):
335
295
  data = sheet.to_python()
336
296
 
337
297
  for row in data[:max_rows_to_check]:
338
- if not row: # Skip empty rows
298
+ if not row:
339
299
  continue
340
300
 
341
301
  total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
342
302
 
343
- # Check for formulas (simple heuristic)
344
303
  if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
345
304
  has_formulas = True
346
305
  break
347
306
 
348
- # Build summary
349
307
  summary_parts = []
350
308
  if total_cells > 0:
351
309
  summary_parts.append(f"Contains {total_cells}+ data cells")
@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
28
28
  if TYPE_CHECKING:
29
29
  from pathlib import Path
30
30
 
31
- # Define text field keywords as a set for O(1) membership testing
32
31
  _TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
33
32
 
34
33
 
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
79
78
  text_parts: list[str] = []
80
79
  metadata: dict[str, Any] = {}
81
80
 
82
- # Use match statement for cleaner code and avoid multiple isinstance calls
83
81
  if isinstance(data, dict):
84
82
  text_parts = self._extract_from_dict(data, metadata)
85
83
  elif isinstance(data, list):
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
96
94
  chunks=[],
97
95
  )
98
96
 
99
- except (json.JSONDecodeError, ValueError, TypeError) as e:
97
+ except (ValueError, TypeError) as e:
100
98
  return ExtractionResult(
101
99
  content=normalize_spaces(text_content),
102
100
  mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
117
115
  if isinstance(value, str) and value.strip():
118
116
  text_parts.append(f"{full_key}: {value}")
119
117
 
120
- # Check if key contains any text field keywords efficiently
121
118
  key_lower = key.lower()
122
119
  if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
123
120
  metadata[full_key] = value