kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.8.1-py3-none-any.whl

This diff shows the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py

@@ -1,8 +1,11 @@
 from __future__ import annotations

 import contextlib
+import os
 import re
+import subprocess
 import sys
+import tempfile
 from json import JSONDecodeError, loads
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
         Returns:
             ExtractionResult with the extracted text and metadata.
         """
-        import os
-        import tempfile
-        from pathlib import Path
-
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")

@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):

     def _validate_pandoc_version_sync(self) -> None:
         """Synchronous version of _validate_pandoc_version."""
-        import subprocess
-
         try:
             if self._checked_version:
                 return
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):

     def _extract_metadata_sync(self, path: Path) -> Metadata:
         """Synchronous version of _handle_extract_metadata."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):

     def _extract_file_sync(self, path: Path) -> str:
         """Synchronous version of _handle_extract_file."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
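All four sync Pandoc paths share the same scratch-file idiom, which is why `os` and `tempfile` now live at module scope instead of being imported inside each method. A minimal standalone sketch of that idiom (the `with_temp_output` helper is illustrative, not part of the package):

import os
import tempfile
from pathlib import Path


def with_temp_output(suffix: str) -> str:
    """Create a named temp file and return its path; close the fd right away."""
    fd, temp_path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)  # mkstemp hands back an open fd; close it so another tool can write the file
    return temp_path


output_path = with_temp_output(".md")
try:
    Path(output_path).write_text("# converted output")
    print(Path(output_path).read_text())
finally:
    os.unlink(output_path)  # always remove the scratch file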
kreuzberg/_extractors/_pdf.py

@@ -1,6 +1,8 @@
 from __future__ import annotations

 import contextlib
+import os
+import tempfile
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -10,15 +12,21 @@ from typing import TYPE_CHECKING, ClassVar, cast
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
+from playa import parse

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._playa import extract_pdf_metadata
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
+from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
+from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError

@@ -63,17 +71,30 @@ class PDFExtractor(Extractor):
             result.metadata = await extract_pdf_metadata(content_bytes)

         if self.config.extract_tables:
-            from kreuzberg._gmft import extract_tables
-
-            result.tables = await extract_tables(path, self.config.gmft_config)
+            # GMFT is optional dependency
+            try:
+                from kreuzberg._gmft import extract_tables

-        return result
+                result.tables = await extract_tables(path, self.config.gmft_config)
+            except ImportError:
+                result.tables = []
+
+        # Enhance metadata with table information
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        return self._apply_quality_processing(result)

     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
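The async path above folds a table summary into `result.metadata`. `generate_table_summary` comes from the new `kreuzberg/_utils/_table.py` (file 34 above), whose body is not part of these hunks; a minimal sketch consistent with the three keys consumed here (`table_count`, `pages_with_tables`, `total_rows`), treating each table as the dict-like `TableData` seen later in this diff, might look like:

from typing import Any


def table_summary_sketch(tables: list[dict[str, Any]]) -> dict[str, int]:
    """Hypothetical stand-in for generate_table_summary."""
    total_rows = 0
    for table in tables:
        df = table.get("df")
        if df is not None:
            total_rows += len(df)  # pandas DataFrame: len() is the row count
    return {
        "table_count": len(tables),
        "pages_with_tables": len({t.get("page_number") for t in tables}),
        "total_rows": total_rows,
    }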
@@ -81,8 +102,6 @@ class PDFExtractor(Extractor):

         result = self.extract_path_sync(Path(temp_path))

-        from kreuzberg._playa import extract_pdf_metadata_sync
-
         metadata = extract_pdf_metadata_sync(content)
         result.metadata = metadata

@@ -100,16 +119,21 @@ class PDFExtractor(Extractor):

         tables = []
         if self.config.extract_tables:
+            # GMFT is optional dependency
             try:
                 from kreuzberg._gmft import extract_tables_sync

                 tables = extract_tables_sync(path)
             except ImportError:
-                pass
+                tables = []
+
+        # Use playa for better text structure preservation when not using OCR
+        if not self.config.force_ocr and self._validate_extracted_text(text):
+            text = self._extract_with_playa_sync(path, fallback_text=text)

         text = normalize_spaces(text)

-        return ExtractionResult(
+        result = ExtractionResult(
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
@@ -117,6 +141,21 @@ class PDFExtractor(Extractor):
             chunks=[],
         )

+        # Enhance metadata with table information
+        if tables:
+            table_summary = generate_table_summary(tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        # Apply quality processing
+        return self._apply_quality_processing(result)
+
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.

@@ -155,8 +194,6 @@ class PDFExtractor(Extractor):
         Returns:
             A list of Pillow Images.
         """
-        from kreuzberg._utils._errors import create_error_context, should_retry
-
         document: pypdfium2.PdfDocument | None = None
         last_error = None

@@ -228,8 +265,6 @@ class PDFExtractor(Extractor):
         Returns:
             The extracted text.
         """
-        from kreuzberg._utils._errors import create_error_context
-
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
@@ -283,7 +318,7 @@ class PDFExtractor(Extractor):
             text_parts = []
             for page in pdf:
                 text_page = page.get_textpage()
-                text = text_page.get_text_range()
+                text = text_page.get_text_bounded()
                 text_parts.append(text)
                 text_page.close()
                 page.close()
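The `get_text_range()` call is swapped for `get_text_bounded()`, the API that newer pypdfium2 releases recommend; with no boundary box it extracts the whole page. The surrounding loop, reduced to a runnable sketch (the file name is a placeholder):

import pypdfium2

pdf = pypdfium2.PdfDocument("sample.pdf")  # any local PDF
try:
    parts = []
    for page in pdf:
        text_page = page.get_textpage()
        parts.append(text_page.get_text_bounded())  # full page when no box is given
        text_page.close()
        page.close()
    print("\n\n".join(parts))
finally:
    pdf.close()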
@@ -299,8 +334,6 @@ class PDFExtractor(Extractor):
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
             images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -311,9 +344,6 @@ class PDFExtractor(Extractor):
                 bitmap.close()
                 page.close()

-            import os
-            import tempfile
-
             image_paths = []
             temp_files = []

@@ -325,18 +355,7 @@ class PDFExtractor(Extractor):
                 os.close(fd)
                 image_paths.append(temp_path)

-            if self.config.ocr_backend == "tesseract":
-                from kreuzberg._ocr._tesseract import TesseractConfig
-
-                if isinstance(self.config.ocr_config, TesseractConfig):
-                    config = self.config.ocr_config
-                else:
-                    config = TesseractConfig()
-                results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-                text_parts = [r.content for r in results]
-                return "\n\n".join(text_parts)
-
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            return self._process_pdf_images_with_ocr(image_paths)

         finally:
             for _, temp_path in temp_files:
@@ -349,3 +368,47 @@ class PDFExtractor(Extractor):
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        backend = get_ocr_backend(self.config.ocr_backend)
+        paths = [Path(p) for p in image_paths]
+
+        if self.config.ocr_backend == "tesseract":
+            config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = backend.process_batch_sync(paths, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+        text_parts = [r.content for r in results]
+        return "\n\n".join(text_parts)
+
+    def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
+        """Extract text using playa for better structure preservation."""
+        with contextlib.suppress(Exception):
+            content = path.read_bytes()
+            document = parse(content, max_workers=1)
+
+            text_parts = []
+            for page in document.pages:
+                # Extract text while preserving structure
+                page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+
+            if text_parts:
+                return "\n\n".join(text_parts)
+
+        return fallback_text
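With `_process_pdf_images_with_ocr`, the sync path now dispatches to whichever OCR backend the caller configured; previously anything but Tesseract raised `NotImplementedError`. A hedged usage sketch, assuming kreuzberg's top-level `extract_file_sync`/`ExtractionConfig` exports and a local scanned PDF:

from kreuzberg import ExtractionConfig, extract_file_sync
from kreuzberg._ocr._tesseract import TesseractConfig

# "scan.pdf" is a placeholder path; force_ocr routes it through the OCR branch.
config = ExtractionConfig(
    force_ocr=True,
    ocr_backend="tesseract",
    ocr_config=TesseractConfig(),  # backend-specific options go here
)
result = extract_file_sync("scan.pdf", config=config)
print(result.content[:200])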
kreuzberg/_extractors/_presentation.py

@@ -30,6 +30,9 @@ if TYPE_CHECKING: # pragma: no cover

     from kreuzberg._types import Metadata

+# Pre-compiled regex patterns for performance
+_NON_WORD_PATTERN = re.compile(r"\W")
+

 class PresentationExtractor(Extractor):
     """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

-                filename = re.sub(r"\W", "", shape.name) + ".jpg"
+                filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
                 md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
@@ -162,7 +165,10 @@ class PresentationExtractor(Extractor):
                 md_content += "\n" + html_table + "\n"

             elif shape.has_text_frame:
-                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
+                if shape == title:
+                    md_content += "# " + shape.text.lstrip() + "\n"
+                else:
+                    md_content += shape.text + "\n"

         md_content = md_content.strip()
         if slide.has_notes_slide:
@@ -174,13 +180,15 @@ class PresentationExtractor(Extractor):

         md_content = md_content.strip()

-        return ExtractionResult(
+        result = ExtractionResult(
             content=normalize_spaces(md_content),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata=self._extract_presentation_metadata(presentation),
             chunks=[],
         )

+        return self._apply_quality_processing(result)
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@ class PresentationExtractor(Extractor):
         """
         metadata: Metadata = {}

-        for metadata_key, core_property_key in [
+        # Extract core properties
+        PresentationExtractor._extract_core_properties(presentation, metadata)
+
+        # Extract fonts used in presentation
+        fonts = PresentationExtractor._extract_fonts(presentation)
+        if fonts:
+            metadata["fonts"] = list(fonts)
+
+        # Add structural information
+        PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
+
+        return metadata
+
+    @staticmethod
+    def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
+        """Extract core document properties from presentation."""
+        # Property mapping for core metadata
+        property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
             ("status", "content_status"),
@@ -205,17 +230,22 @@ class PresentationExtractor(Extractor):
             ("version", "revision"),
             ("subject", "subject"),
             ("title", "title"),
-            ("version", "version"),
-        ]:
+        ]
+
+        for metadata_key, core_property_key in property_mapping:
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]

+        # Handle special list properties
         if presentation.core_properties.language:
            metadata["languages"] = [presentation.core_properties.language]

         if presentation.core_properties.category:
             metadata["categories"] = [presentation.core_properties.category]

+    @staticmethod
+    def _extract_fonts(presentation: Presentation) -> set[str]:
+        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -226,8 +256,30 @@ class PresentationExtractor(Extractor):
                     for run in paragraph.runs:
                         if hasattr(run, "font") and run.font.name:
                             fonts.add(run.font.name)
+        return fonts

-        if fonts:
-            metadata["fonts"] = list(fonts)
-
-        return metadata
+    @staticmethod
+    def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
+        """Add structural information about the presentation."""
+        slide_count = len(presentation.slides)
+        if slide_count == 0:
+            return
+
+        # Build description
+        structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
+
+        slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
+        if slides_with_notes > 0:
+            structure_info += f", {slides_with_notes} with notes"
+
+        metadata["description"] = structure_info
+
+        # Build summary if not already present
+        if "summary" not in metadata:
+            summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
+            if slides_with_notes > 0:
+                summary_parts.append(f"{slides_with_notes} slides have notes")
+            if fonts:
+                summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
+
+            metadata["summary"] = f"{'. '.join(summary_parts)}."
kreuzberg/_extractors/_spread_sheet.py

@@ -6,14 +6,14 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any, Union
+from typing import Any

 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11): # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]


-CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
+CellValue = int | float | str | bool | time | date | datetime | timedelta


 class SpreadSheetExtractor(Extractor):
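One portability note on the `CellValue` change: `from __future__ import annotations` defers only annotations, while this alias is an ordinary assignment evaluated at import time, so the `X | Y` syntax here needs Python 3.10+ at runtime:

from __future__ import annotations

# Deferred: stored as a string, so this annotation is fine even on 3.9
def cell_len(value: int | str) -> int:
    return len(str(value))

# Evaluated eagerly at import: builds a types.UnionType, requires Python 3.10+
CellValue = int | float | str | bool
print(CellValue)      # int | float | str | bool
print(cell_len(1234)) # 4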
@@ -45,9 +45,14 @@ class SpreadSheetExtractor(Extractor):
         try:
             results: list[str] = await run_taskgroup(*tasks)

-            return ExtractionResult(
-                content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[]
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@ class SpreadSheetExtractor(Extractor):
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)

-            return ExtractionResult(content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@ class SpreadSheetExtractor(Extractor):
         result = "\n".join(markdown_lines)

         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."