kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
@@ -18,11 +18,8 @@ from playa import parse
18
18
  from kreuzberg._extractors._base import Extractor
19
19
  from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
20
20
  from kreuzberg._ocr import get_ocr_backend
21
- from kreuzberg._ocr._easyocr import EasyOCRConfig
22
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
- from kreuzberg._ocr._tesseract import TesseractConfig
24
21
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
25
- from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
22
+ from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
26
23
  from kreuzberg._utils._errors import create_error_context, should_retry
27
24
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
28
25
  from kreuzberg._utils._string import normalize_spaces
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
65
62
  if self._validate_extracted_text(content):
66
63
  result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
67
64
  except ParsingError:
68
- # If searchable text extraction fails, continue to OCR or empty result
69
65
  pass
70
66
 
71
67
  if not result and self.config.ocr_backend is not None:
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
77
73
  result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
78
74
 
79
75
  if self.config.extract_tables:
80
- # GMFT is optional dependency
76
+ # GMFT is optional dependency ~keep
81
77
  try:
82
78
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
83
79
 
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
85
81
  except ImportError: # pragma: no cover
86
82
  result.tables = []
87
83
 
88
- # Enhance metadata with table information
89
84
  if result.tables:
90
85
  table_summary = generate_table_summary(result.tables)
91
86
  result.metadata = result.metadata | {
@@ -126,7 +121,7 @@ class PDFExtractor(Extractor):
126
121
 
127
122
  tables = []
128
123
  if self.config.extract_tables:
129
- # GMFT is optional dependency
124
+ # GMFT is optional dependency ~keep
130
125
  try:
131
126
  from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
132
127
 
@@ -134,7 +129,6 @@ class PDFExtractor(Extractor):
134
129
  except ImportError:
135
130
  tables = []
136
131
 
137
- # Use playa for better text structure preservation when not using OCR
138
132
  if not self.config.force_ocr and self._validate_extracted_text(text):
139
133
  text = self._extract_with_playa_sync(path, fallback_text=text)
140
134
 
@@ -148,7 +142,6 @@ class PDFExtractor(Extractor):
148
142
  chunks=[],
149
143
  )
150
144
 
151
- # Enhance metadata with table information
152
145
  if tables:
153
146
  table_summary = generate_table_summary(tables)
154
147
  result.metadata = result.metadata | {
@@ -158,25 +151,9 @@ class PDFExtractor(Extractor):
158
151
  f"{table_summary['total_rows']} total rows",
159
152
  }
160
153
 
161
- # Apply quality processing
162
154
  return self._apply_quality_processing(result)
163
155
 
164
156
  def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
165
- """Check if text extracted from PDF is valid or corrupted.
166
-
167
- This checks for indicators of corrupted PDF text extraction:
168
- 1. Empty or whitespace-only text
169
- 2. High concentration of control characters and null bytes
170
- 3. High concentration of Unicode replacement characters
171
-
172
- Args:
173
- text: The extracted text to validate
174
- corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
175
- characters (default: 0.05 or 5%)
176
-
177
- Returns:
178
- True if the text appears valid, False if it seems corrupted
179
- """
180
157
  if not text or not text.strip():
181
158
  return False
182
159
 
@@ -188,17 +165,6 @@ class PDFExtractor(Extractor):
188
165
  return (len(corruption_matches) / len(text)) < corruption_threshold
189
166
 
190
167
  async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
191
- """Convert a PDF file to images.
192
-
193
- Args:
194
- input_file: The path to the PDF file.
195
-
196
- Raises:
197
- ParsingError: If the PDF file could not be converted to images.
198
-
199
- Returns:
200
- A list of Pillow Images.
201
- """
202
168
  document: pypdfium2.PdfDocument | None = None
203
169
  last_error = None
204
170
 
@@ -206,7 +172,7 @@ class PDFExtractor(Extractor):
206
172
  try:
207
173
  with pypdfium_file_lock(input_file):
208
174
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
209
- return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
175
+ return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
210
176
  except pypdfium2.PdfiumError as e: # noqa: PERF203
211
177
  last_error = e
212
178
  if not should_retry(e, attempt + 1):
@@ -238,39 +204,18 @@ class PDFExtractor(Extractor):
238
204
  ) from last_error
239
205
 
240
206
  async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
241
- """Extract text from a scanned PDF file using OCR.
242
-
243
- Args:
244
- input_file: The path to the PDF file.
245
- ocr_backend: The OCR backend to use.
246
-
247
- Returns:
248
- The extraction result with text content and metadata.
249
- """
250
207
  images = await self._convert_pdf_to_images(input_file)
251
208
  backend = get_ocr_backend(ocr_backend)
252
209
  ocr_results = await run_taskgroup_batched(
253
210
  *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
254
211
  batch_size=cpu_count(),
255
212
  )
256
- # Use list comprehension and join for efficient string building
257
213
  content = "\n".join(result.content for result in ocr_results)
258
214
 
259
215
  return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
260
216
 
261
217
  @staticmethod
262
218
  async def _extract_pdf_searchable_text(input_file: Path) -> str:
263
- """Extract text from a searchable PDF file using pypdfium2.
264
-
265
- Args:
266
- input_file: The path to the PDF file.
267
-
268
- Raises:
269
- ParsingError: If the text could not be extracted from the PDF file.
270
-
271
- Returns:
272
- The extracted text.
273
- """
274
219
  document: pypdfium2.PdfDocument | None = None
275
220
  try:
276
221
  with pypdfium_file_lock(input_file):
@@ -318,7 +263,6 @@ class PDFExtractor(Extractor):
318
263
  await run_sync(document.close)
319
264
 
320
265
  def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
321
- """Extract searchable text from PDF using pypdfium2 (sync version)."""
322
266
  pdf = None
323
267
  try:
324
268
  with pypdfium_file_lock(path):
@@ -339,7 +283,6 @@ class PDFExtractor(Extractor):
339
283
  pdf.close()
340
284
 
341
285
  def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
342
- """Extract text from PDF using OCR (sync version)."""
343
286
  pdf = None
344
287
  try:
345
288
  images = []
@@ -352,23 +295,7 @@ class PDFExtractor(Extractor):
352
295
  bitmap.close()
353
296
  page.close()
354
297
 
355
- image_paths = []
356
- temp_files = []
357
-
358
- try:
359
- for i, img in enumerate(images):
360
- fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
361
- temp_files.append((fd, temp_path))
362
- img.save(temp_path, format="PNG")
363
- os.close(fd)
364
- image_paths.append(temp_path)
365
-
366
- return self._process_pdf_images_with_ocr(image_paths)
367
-
368
- finally:
369
- for _, temp_path in temp_files:
370
- with contextlib.suppress(OSError):
371
- Path(temp_path).unlink()
298
+ return self._process_pdf_images_with_ocr_direct(images)
372
299
 
373
300
  except Exception as e:
374
301
  raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -378,7 +305,6 @@ class PDFExtractor(Extractor):
378
305
  pdf.close()
379
306
 
380
307
  def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
381
- """Process PDF images with the configured OCR backend."""
382
308
  backend = get_ocr_backend(self.config.ocr_backend)
383
309
  paths = [Path(p) for p in image_paths]
384
310
 
@@ -401,18 +327,48 @@ class PDFExtractor(Extractor):
401
327
  case _:
402
328
  raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
403
329
 
404
- # Use list comprehension and join for efficient string building
330
+ return "\n\n".join(result.content for result in results)
331
+
332
+ def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
333
+ """Process PIL images directly without temp files."""
334
+ backend = get_ocr_backend(self.config.ocr_backend)
335
+
336
+ match self.config.ocr_backend:
337
+ case "tesseract":
338
+ config = (
339
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
340
+ )
341
+ results = []
342
+ for image in images:
343
+ result = backend.process_image_sync(image, **asdict(config))
344
+ results.append(result)
345
+ case "paddleocr":
346
+ paddle_config = (
347
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
348
+ )
349
+ results = []
350
+ for image in images:
351
+ result = backend.process_image_sync(image, **asdict(paddle_config))
352
+ results.append(result)
353
+ case "easyocr":
354
+ easy_config = (
355
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
356
+ )
357
+ results = []
358
+ for image in images:
359
+ result = backend.process_image_sync(image, **asdict(easy_config))
360
+ results.append(result)
361
+ case _:
362
+ raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
363
+
405
364
  return "\n\n".join(result.content for result in results)
406
365
 
407
366
  def _parse_with_password_attempts(self, content: bytes) -> Document:
408
- """Parse PDF with password attempts."""
409
- # Normalize password to list
410
367
  if isinstance(self.config.pdf_password, str):
411
368
  passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
412
369
  else:
413
370
  passwords = list(self.config.pdf_password)
414
371
 
415
- # Try each password in sequence
416
372
  last_exception = None
417
373
  for password in passwords:
418
374
  try:
@@ -421,21 +377,17 @@ class PDFExtractor(Extractor):
421
377
  last_exception = e
422
378
  continue
423
379
 
424
- # If all passwords failed, raise the last exception
425
380
  if last_exception:
426
381
  raise last_exception from None
427
382
 
428
- # Fallback to no password
429
383
  return parse(content, max_workers=1, password="")
430
384
 
431
385
  def _get_passwords_to_try(self) -> list[str]:
432
- """Get list of passwords to try in sequence."""
433
386
  if isinstance(self.config.pdf_password, str):
434
387
  return [self.config.pdf_password] if self.config.pdf_password else [""]
435
388
  return list(self.config.pdf_password) if self.config.pdf_password else [""]
436
389
 
437
390
  async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
438
- """Extract PDF metadata with password attempts."""
439
391
  passwords = self._get_passwords_to_try()
440
392
 
441
393
  last_exception = None
@@ -446,7 +398,6 @@ class PDFExtractor(Extractor):
446
398
  last_exception = e
447
399
  continue
448
400
 
449
- # If all passwords failed, try with empty password as fallback
450
401
  try:
451
402
  return await extract_pdf_metadata(content, password="")
452
403
  except Exception:
@@ -455,7 +406,6 @@ class PDFExtractor(Extractor):
455
406
  raise
456
407
 
457
408
  def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
458
- """Extract PDF metadata with password attempts (sync version)."""
459
409
  passwords = self._get_passwords_to_try()
460
410
 
461
411
  last_exception = None
@@ -466,7 +416,6 @@ class PDFExtractor(Extractor):
466
416
  last_exception = e
467
417
  continue
468
418
 
469
- # If all passwords failed, try with empty password as fallback
470
419
  try:
471
420
  return extract_pdf_metadata_sync(content, password="")
472
421
  except Exception:
@@ -475,12 +424,10 @@ class PDFExtractor(Extractor):
475
424
  raise
476
425
 
477
426
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
478
- """Extract text using playa for better structure preservation."""
479
427
  with contextlib.suppress(Exception):
480
428
  content = path.read_bytes()
481
429
  document = self._parse_with_password_attempts(content)
482
430
 
483
- # Extract text while preserving structure
484
431
  pages_text = []
485
432
  for page in document.pages:
486
433
  page_text = page.extract_text()
@@ -1,12 +1,3 @@
1
- """This module provides functions to extract textual content from files.
2
-
3
- It includes vendored code:
4
-
5
- - The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
6
- See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
7
- Refer to the markitdown repository for it's license (MIT).
8
- """
9
-
10
1
  from __future__ import annotations
11
2
 
12
3
  import re
@@ -30,7 +21,6 @@ if TYPE_CHECKING: # pragma: no cover
30
21
 
31
22
  from kreuzberg._types import Metadata
32
23
 
33
- # Pre-compiled regex patterns for performance
34
24
  _NON_WORD_PATTERN = re.compile(r"\W")
35
25
 
36
26
 
@@ -201,15 +191,12 @@ class PresentationExtractor(Extractor):
201
191
  """
202
192
  metadata: Metadata = {}
203
193
 
204
- # Extract core properties
205
194
  PresentationExtractor._extract_core_properties(presentation, metadata)
206
195
 
207
- # Extract fonts used in presentation
208
196
  fonts = PresentationExtractor._extract_fonts(presentation)
209
197
  if fonts:
210
198
  metadata["fonts"] = list(fonts)
211
199
 
212
- # Add structural information
213
200
  PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
214
201
 
215
202
  return metadata
@@ -217,7 +204,6 @@ class PresentationExtractor(Extractor):
217
204
  @staticmethod
218
205
  def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
219
206
  """Extract core document properties from presentation."""
220
- # Property mapping for core metadata
221
207
  property_mapping = [
222
208
  ("authors", "author"),
223
209
  ("comments", "comments"),
@@ -236,7 +222,6 @@ class PresentationExtractor(Extractor):
236
222
  if core_property := getattr(presentation.core_properties, core_property_key, None):
237
223
  metadata[metadata_key] = core_property # type: ignore[literal-required]
238
224
 
239
- # Handle special list properties
240
225
  if presentation.core_properties.language:
241
226
  metadata["languages"] = [presentation.core_properties.language]
242
227
 
@@ -265,7 +250,6 @@ class PresentationExtractor(Extractor):
265
250
  if slide_count == 0:
266
251
  return
267
252
 
268
- # Build description
269
253
  structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
270
254
 
271
255
  slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
@@ -274,7 +258,6 @@ class PresentationExtractor(Extractor):
274
258
 
275
259
  metadata["description"] = structure_info
276
260
 
277
- # Build summary if not already present
278
261
  if "summary" not in metadata:
279
262
  summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
280
263
  if slides_with_notes > 0:
@@ -10,15 +10,17 @@ from io import StringIO
10
10
  from pathlib import Path
11
11
  from typing import Any
12
12
 
13
+ import polars as pl
13
14
  from anyio import Path as AsyncPath
14
15
  from PIL import Image
15
16
  from python_calamine import CalamineWorkbook
16
17
 
17
18
  from kreuzberg._extractors._base import Extractor
18
19
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
19
- from kreuzberg._types import ExtractionResult, Metadata
20
+ from kreuzberg._types import ExtractionResult, Metadata, TableData
20
21
  from kreuzberg._utils._string import normalize_spaces
21
22
  from kreuzberg._utils._sync import run_sync, run_taskgroup
23
+ from kreuzberg._utils._table import enhance_table_markdown
22
24
  from kreuzberg._utils._tmp import create_temp_file
23
25
  from kreuzberg.exceptions import ParsingError
24
26
 
@@ -108,14 +110,6 @@ class SpreadSheetExtractor(Extractor):
108
110
 
109
111
  @staticmethod
110
112
  def _convert_cell_to_str(value: Any) -> str:
111
- """Convert a cell value to string representation.
112
-
113
- Args:
114
- value: The cell value to convert.
115
-
116
- Returns:
117
- String representation of the cell value.
118
- """
119
113
  if value is None:
120
114
  return ""
121
115
  if isinstance(value, bool):
@@ -139,7 +133,7 @@ class SpreadSheetExtractor(Extractor):
139
133
  csv_buffer.close()
140
134
 
141
135
  csv_path, unlink = await create_temp_file(".csv")
142
- await AsyncPath(csv_path).write_text(csv_data)
136
+ await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
143
137
 
144
138
  csv_reader = csv.reader(StringIO(csv_data))
145
139
  rows = list(csv_reader)
@@ -162,7 +156,6 @@ class SpreadSheetExtractor(Extractor):
162
156
  return f"## {sheet_name}\n\n{normalize_spaces(result)}"
163
157
 
164
158
  def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
165
- """Synchronous version of _convert_sheet_to_text."""
166
159
  values = workbook.get_sheet_by_name(sheet_name).to_python()
167
160
 
168
161
  csv_buffer = StringIO()
@@ -195,82 +188,57 @@ class SpreadSheetExtractor(Extractor):
195
188
  return f"## {sheet_name}\n\n{normalize_spaces(result)}"
196
189
 
197
190
  def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
198
- """Enhanced sheet processing with better table structure preservation."""
199
191
  try:
200
- # pandas is optional dependency
201
- import pandas as pd # noqa: PLC0415
202
-
203
- from kreuzberg._utils._table import enhance_table_markdown # noqa: PLC0415
204
-
205
192
  sheet = workbook.get_sheet_by_name(sheet_name)
206
193
  data = sheet.to_python()
207
194
 
208
195
  if not data or not any(row for row in data):
209
196
  return f"## {sheet_name}\n\n*Empty sheet*"
210
197
 
211
- # Convert to DataFrame
212
- df = pd.DataFrame(data)
198
+ df = pl.DataFrame(data)
213
199
 
214
- # Clean up empty rows and columns
215
- df = df.dropna(how="all").dropna(axis=1, how="all")
200
+ df = df.filter(~pl.all_horizontal(pl.all().is_null()))
201
+ df = df.select([col for col in df.columns if not df[col].is_null().all()])
216
202
 
217
- if df.empty:
203
+ if df.is_empty():
218
204
  return f"## {sheet_name}\n\n*No data*"
219
205
 
220
- # Create a mock TableData for enhanced formatting
221
- from kreuzberg._types import TableData # noqa: PLC0415
222
-
223
- # Create a 1x1 transparent image as placeholder
224
206
  placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
225
207
  mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
226
208
 
227
209
  enhanced_markdown = enhance_table_markdown(mock_table)
228
210
  return f"## {sheet_name}\n\n{enhanced_markdown}"
229
211
 
230
- except (ImportError, AttributeError, ValueError):
231
- # Fallback to original method if pandas/table enhancement fails
212
+ except (AttributeError, ValueError):
232
213
  return self._convert_sheet_to_text_sync(workbook, sheet_name)
233
214
 
234
215
  @staticmethod
235
216
  def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
236
- """Extract metadata from spreadsheet using python-calamine.
237
-
238
- Args:
239
- workbook: CalamineWorkbook instance
240
-
241
- Returns:
242
- Metadata dict using existing metadata keys where possible
243
- """
244
217
  metadata: Metadata = {}
245
218
 
246
- # Extract basic document properties
247
219
  SpreadSheetExtractor._extract_document_properties(workbook, metadata)
248
220
 
249
- # Add structural information
250
221
  SpreadSheetExtractor._add_structure_info(workbook, metadata)
251
222
 
252
- # Analyze content complexity
253
223
  SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
254
224
 
255
225
  return metadata
256
226
 
257
227
  @staticmethod
258
228
  def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
259
- """Extract basic document properties from workbook."""
260
229
  with contextlib.suppress(AttributeError, Exception):
261
230
  if not (hasattr(workbook, "metadata") and workbook.metadata):
262
231
  return
263
232
 
264
233
  props = workbook.metadata
265
234
 
266
- # Basic properties mapping
267
235
  property_mapping = {
268
236
  "title": "title",
269
- "author": "authors", # Convert to list
237
+ "author": "authors",
270
238
  "subject": "subject",
271
239
  "comments": "comments",
272
- "keywords": "keywords", # Process separately
273
- "category": "categories", # Convert to list
240
+ "keywords": "keywords",
241
+ "category": "categories",
274
242
  "company": "organization",
275
243
  "manager": "modified_by",
276
244
  }
@@ -286,12 +254,10 @@ class SpreadSheetExtractor(Extractor):
286
254
  else:
287
255
  metadata[meta_key] = value # type: ignore[literal-required]
288
256
 
289
- # Handle dates separately
290
257
  SpreadSheetExtractor._extract_date_properties(props, metadata)
291
258
 
292
259
  @staticmethod
293
260
  def _extract_date_properties(props: Any, metadata: Metadata) -> None:
294
- """Extract and format date properties."""
295
261
  date_mapping = {"created": "created_at", "modified": "modified_at"}
296
262
 
297
263
  for prop_name, meta_key in date_mapping.items():
@@ -304,14 +270,12 @@ class SpreadSheetExtractor(Extractor):
304
270
 
305
271
  @staticmethod
306
272
  def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
307
- """Add structural information about the spreadsheet."""
308
273
  if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
309
274
  return
310
275
 
311
276
  sheet_count = len(workbook.sheet_names)
312
277
  structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
313
278
 
314
- # Don't list too many sheet names (magic number made constant)
315
279
  max_sheet_names_to_list = 5
316
280
  if sheet_count <= max_sheet_names_to_list:
317
281
  structure_info += f": {', '.join(workbook.sheet_names)}"
@@ -320,12 +284,10 @@ class SpreadSheetExtractor(Extractor):
320
284
 
321
285
  @staticmethod
322
286
  def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
323
- """Analyze spreadsheet content for complexity indicators."""
324
287
  with contextlib.suppress(Exception):
325
288
  has_formulas = False
326
289
  total_cells = 0
327
290
 
328
- # Check only first few sheets for performance
329
291
  max_sheets_to_check = 3
330
292
  max_rows_to_check = 50
331
293
 
@@ -335,17 +297,15 @@ class SpreadSheetExtractor(Extractor):
335
297
  data = sheet.to_python()
336
298
 
337
299
  for row in data[:max_rows_to_check]:
338
- if not row: # Skip empty rows
300
+ if not row:
339
301
  continue
340
302
 
341
303
  total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
342
304
 
343
- # Check for formulas (simple heuristic)
344
305
  if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
345
306
  has_formulas = True
346
307
  break
347
308
 
348
- # Build summary
349
309
  summary_parts = []
350
310
  if total_cells > 0:
351
311
  summary_parts.append(f"Contains {total_cells}+ data cells")
@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
28
28
  if TYPE_CHECKING:
29
29
  from pathlib import Path
30
30
 
31
- # Define text field keywords as a set for O(1) membership testing
32
31
  _TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})
33
32
 
34
33
 
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
79
78
  text_parts: list[str] = []
80
79
  metadata: dict[str, Any] = {}
81
80
 
82
- # Use match statement for cleaner code and avoid multiple isinstance calls
83
81
  if isinstance(data, dict):
84
82
  text_parts = self._extract_from_dict(data, metadata)
85
83
  elif isinstance(data, list):
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
96
94
  chunks=[],
97
95
  )
98
96
 
99
- except (json.JSONDecodeError, ValueError, TypeError) as e:
97
+ except (ValueError, TypeError) as e:
100
98
  return ExtractionResult(
101
99
  content=normalize_spaces(text_content),
102
100
  mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
117
115
  if isinstance(value, str) and value.strip():
118
116
  text_parts.append(f"{full_key}: {value}")
119
117
 
120
- # Check if key contains any text field keywords efficiently
121
118
  key_lower = key.lower()
122
119
  if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
123
120
  metadata[full_key] = value