kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 import contextlib
+import os
 import re
+import subprocess
 import sys
+import tempfile
 from json import JSONDecodeError, loads
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
@@ -203,10 +206,6 @@ class PandocExtractor(Extractor):
         Returns:
             ExtractionResult with the extracted text and metadata.
         """
-        import os
-        import tempfile
-        from pathlib import Path
-
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -579,8 +578,6 @@ class PandocExtractor(Extractor):
 
     def _validate_pandoc_version_sync(self) -> None:
         """Synchronous version of _validate_pandoc_version."""
-        import subprocess
-
         try:
             if self._checked_version:
                 return
@@ -625,10 +622,6 @@ class PandocExtractor(Extractor):
 
     def _extract_metadata_sync(self, path: Path) -> Metadata:
         """Synchronous version of _handle_extract_metadata."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -663,10 +656,6 @@ class PandocExtractor(Extractor):
 
     def _extract_file_sync(self, path: Path) -> str:
         """Synchronous version of _handle_extract_file."""
-        import os
-        import subprocess
-        import tempfile
-
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
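The pandoc changes are a straight import hoist: `os`, `subprocess`, and `tempfile` move from the bodies of the sync helpers to module scope, so the lookup cost is paid once at import time. The temp-file handling those helpers keep follows the usual mkstemp pattern; here is a minimal standalone sketch of that pattern (the pandoc invocation and file names are illustrative, not copied from the package):

```python
import os
import subprocess
import tempfile

# Create a named temp file, close our descriptor right away (pandoc opens
# the path itself), and always unlink in the finally block.
fd, output_path = tempfile.mkstemp(suffix=".md")
os.close(fd)
try:
    subprocess.run(["pandoc", "input.docx", "-o", output_path], check=True)
    with open(output_path, encoding="utf-8") as f:
        markdown = f.read()
finally:
    os.unlink(output_path)
```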
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
 import contextlib
+import os
+import tempfile
 from multiprocessing import cpu_count
 from pathlib import Path
 from re import Pattern
@@ -10,15 +12,21 @@ from typing import TYPE_CHECKING, ClassVar, cast
 import anyio
 import pypdfium2
 from anyio import Path as AsyncPath
+from playa import parse
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg.
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
+from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
 from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
+from kreuzberg._utils._table import generate_table_summary
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError
 
@@ -63,17 +71,30 @@ class PDFExtractor(Extractor):
             result.metadata = await extract_pdf_metadata(content_bytes)
 
         if self.config.extract_tables:
-
-
-
+            # GMFT is optional dependency
+            try:
+                from kreuzberg._gmft import extract_tables
 
-
+                result.tables = await extract_tables(path, self.config.gmft_config)
+            except ImportError:
+                result.tables = []
+
+        # Enhance metadata with table information
+        if result.tables:
+            table_summary = generate_table_summary(result.tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of PDF extraction from bytes."""
-        import os
-        import tempfile
-
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
@@ -81,8 +102,6 @@ class PDFExtractor(Extractor):
 
         result = self.extract_path_sync(Path(temp_path))
 
-        from kreuzberg._playa import extract_pdf_metadata_sync
-
         metadata = extract_pdf_metadata_sync(content)
         result.metadata = metadata
 
@@ -100,16 +119,21 @@
 
         tables = []
         if self.config.extract_tables:
+            # GMFT is optional dependency
            try:
                 from kreuzberg._gmft import extract_tables_sync
 
                 tables = extract_tables_sync(path)
             except ImportError:
-
+                tables = []
+
+        # Use playa for better text structure preservation when not using OCR
+        if not self.config.force_ocr and self._validate_extracted_text(text):
+            text = self._extract_with_playa_sync(path, fallback_text=text)
 
         text = normalize_spaces(text)
 
-
+        result = ExtractionResult(
             content=text,
             mime_type=PLAIN_TEXT_MIME_TYPE,
             metadata={},
@@ -117,6 +141,21 @@
             chunks=[],
         )
 
+        # Enhance metadata with table information
+        if tables:
+            table_summary = generate_table_summary(tables)
+            result.metadata.update(
+                {
+                    "table_count": table_summary["table_count"],
+                    "tables_summary": f"Document contains {table_summary['table_count']} tables "
+                    f"across {table_summary['pages_with_tables']} pages with "
+                    f"{table_summary['total_rows']} total rows",
+                }
+            )
+
+        # Apply quality processing
+        return self._apply_quality_processing(result)
+
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
         """Check if text extracted from PDF is valid or corrupted.
 
@@ -155,8 +194,6 @@
         Returns:
             A list of Pillow Images.
         """
-        from kreuzberg._utils._errors import create_error_context, should_retry
-
         document: pypdfium2.PdfDocument | None = None
         last_error = None
 
@@ -228,8 +265,6 @@
         Returns:
             The extracted text.
         """
-        from kreuzberg._utils._errors import create_error_context
-
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
@@ -283,7 +318,7 @@
         text_parts = []
         for page in pdf:
             text_page = page.get_textpage()
-            text = text_page.
+            text = text_page.get_text_bounded()
             text_parts.append(text)
             text_page.close()
             page.close()
@@ -299,8 +334,6 @@
         """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
-            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure
-
            images = []
             with pypdfium_file_lock(path):
                 pdf = pypdfium2.PdfDocument(str(path))
@@ -311,9 +344,6 @@
                     bitmap.close()
                     page.close()
 
-            import os
-            import tempfile
-
             image_paths = []
             temp_files = []
 
@@ -325,18 +355,7 @@
                 os.close(fd)
                 image_paths.append(temp_path)
 
-
-            from kreuzberg._ocr._tesseract import TesseractConfig
-
-            if isinstance(self.config.ocr_config, TesseractConfig):
-                config = self.config.ocr_config
-            else:
-                config = TesseractConfig()
-            results = process_batch_images_sync_pure([str(p) for p in image_paths], config)
-            text_parts = [r.content for r in results]
-            return "\n\n".join(text_parts)
-
-            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+            return self._process_pdf_images_with_ocr(image_paths)
 
         finally:
             for _, temp_path in temp_files:
@@ -349,3 +368,47 @@
             if pdf:
                 with pypdfium_file_lock(path), contextlib.suppress(Exception):
                     pdf.close()
+
+    def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
+        """Process PDF images with the configured OCR backend."""
+        backend = get_ocr_backend(self.config.ocr_backend)
+        paths = [Path(p) for p in image_paths]
+
+        if self.config.ocr_backend == "tesseract":
+            config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            results = backend.process_batch_sync(paths, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            results = backend.process_batch_sync(paths, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+
+        text_parts = [r.content for r in results]
+        return "\n\n".join(text_parts)
+
+    def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
+        """Extract text using playa for better structure preservation."""
+        with contextlib.suppress(Exception):
+            content = path.read_bytes()
+            document = parse(content, max_workers=1)
+
+            text_parts = []
+            for page in document.pages:
+                # Extract text while preserving structure
+                page_text = page.extract_text()
+                if page_text and page_text.strip():
+                    text_parts.append(page_text)
+
+            if text_parts:
+                return "\n\n".join(text_parts)
+
+        return fallback_text
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -30,6 +30,9 @@ if TYPE_CHECKING:  # pragma: no cover
 
     from kreuzberg._types import Metadata
 
+# Pre-compiled regex patterns for performance
+_NON_WORD_PATTERN = re.compile(r"\W")
+
 
 class PresentationExtractor(Extractor):
     """Extractor for PowerPoint (.pptx) files.
@@ -141,7 +144,7 @@ class PresentationExtractor(Extractor):
                 with suppress(AttributeError):
                     alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001
 
-                filename =
+                filename = _NON_WORD_PATTERN.sub("", shape.name) + ".jpg"
                 md_content += f"\n\n"
 
             elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
@@ -162,7 +165,10 @@
                 md_content += "\n" + html_table + "\n"
 
             elif shape.has_text_frame:
-
+                if shape == title:
+                    md_content += "# " + shape.text.lstrip() + "\n"
+                else:
+                    md_content += shape.text + "\n"
 
         md_content = md_content.strip()
         if slide.has_notes_slide:
@@ -174,13 +180,15 @@
 
         md_content = md_content.strip()
 
-
+        result = ExtractionResult(
             content=normalize_spaces(md_content),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata=self._extract_presentation_metadata(presentation),
             chunks=[],
         )
 
+        return self._apply_quality_processing(result)
+
     @staticmethod
     def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
         """Extract metadata from a presentation instance.
@@ -193,7 +201,24 @@
         """
         metadata: Metadata = {}
 
-
+        # Extract core properties
+        PresentationExtractor._extract_core_properties(presentation, metadata)
+
+        # Extract fonts used in presentation
+        fonts = PresentationExtractor._extract_fonts(presentation)
+        if fonts:
+            metadata["fonts"] = list(fonts)
+
+        # Add structural information
+        PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)
+
+        return metadata
+
+    @staticmethod
+    def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
+        """Extract core document properties from presentation."""
+        # Property mapping for core metadata
+        property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
             ("status", "content_status"),
@@ -205,17 +230,22 @@
             ("version", "revision"),
             ("subject", "subject"),
             ("title", "title"),
-
-
+        ]
+
+        for metadata_key, core_property_key in property_mapping:
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]
 
+        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]
 
         if presentation.core_properties.category:
             metadata["categories"] = [presentation.core_properties.category]
 
+    @staticmethod
+    def _extract_fonts(presentation: Presentation) -> set[str]:
+        """Extract all fonts used in the presentation."""
         fonts = set()
         for slide in presentation.slides:
             for shape in slide.shapes:
@@ -226,8 +256,30 @@
                     for run in paragraph.runs:
                         if hasattr(run, "font") and run.font.name:
                             fonts.add(run.font.name)
+        return fonts
 
-
-
-
-
+    @staticmethod
+    def _add_presentation_structure_info(presentation: Presentation, metadata: Metadata, fonts: set[str]) -> None:
+        """Add structural information about the presentation."""
+        slide_count = len(presentation.slides)
+        if slide_count == 0:
+            return
+
+        # Build description
+        structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"
+
+        slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
+        if slides_with_notes > 0:
+            structure_info += f", {slides_with_notes} with notes"
+
+        metadata["description"] = structure_info
+
+        # Build summary if not already present
+        if "summary" not in metadata:
+            summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
+            if slides_with_notes > 0:
+                summary_parts.append(f"{slides_with_notes} slides have notes")
+            if fonts:
+                summary_parts.append(f"uses {len(fonts)} font{'s' if len(fonts) != 1 else ''}")
+
+            metadata["summary"] = f"{'. '.join(summary_parts)}."
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -6,14 +6,14 @@ import sys
 from datetime import date, datetime, time, timedelta
 from io import StringIO
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 from anyio import Path as AsyncPath
 from python_calamine import CalamineWorkbook
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, Metadata
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
 from kreuzberg._utils._tmp import create_temp_file
@@ -23,7 +23,7 @@ if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
 
-CellValue =
+CellValue = int | float | str | bool | time | date | datetime | timedelta
 
 
 class SpreadSheetExtractor(Extractor):
@@ -45,9 +45,14 @@
         try:
             results: list[str] = await run_taskgroup(*tasks)
 
-
-                content="\n\n".join(results),
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
             )
+
+            return self._apply_quality_processing(result)
         except ExceptionGroup as eg:
             raise ParsingError(
                 "Failed to extract file data",
@@ -87,7 +92,14 @@
                 sheet_text = self._convert_sheet_to_text_sync(workbook, sheet_name)
                 results.append(sheet_text)
 
-
+            result = ExtractionResult(
+                content="\n\n".join(results),
+                mime_type=MARKDOWN_MIME_TYPE,
+                metadata=self._extract_spreadsheet_metadata(workbook),
+                chunks=[],
+            )
+
+            return self._apply_quality_processing(result)
         except Exception as e:
             raise ParsingError(
                 "Failed to extract file data",
@@ -181,3 +193,166 @@
         result = "\n".join(markdown_lines)
 
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"
+
+    def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
+        """Enhanced sheet processing with better table structure preservation."""
+        try:
+            # pandas is optional dependency
+            import pandas as pd
+
+            from kreuzberg._utils._table import enhance_table_markdown
+
+            sheet = workbook.get_sheet_by_name(sheet_name)
+            data = sheet.to_python()
+
+            if not data or not any(row for row in data):
+                return f"## {sheet_name}\n\n*Empty sheet*"
+
+            # Convert to DataFrame
+            df = pd.DataFrame(data)
+
+            # Clean up empty rows and columns
+            df = df.dropna(how="all").dropna(axis=1, how="all")
+
+            if df.empty:
+                return f"## {sheet_name}\n\n*No data*"
+
+            # Create a mock TableData for enhanced formatting
+            from PIL import Image
+
+            from kreuzberg._types import TableData
+
+            # Create a 1x1 transparent image as placeholder
+            placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
+            mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}
+
+            enhanced_markdown = enhance_table_markdown(mock_table)
+            return f"## {sheet_name}\n\n{enhanced_markdown}"
+
+        except (ImportError, AttributeError, ValueError):
+            # Fallback to original method if pandas/table enhancement fails
+            return self._convert_sheet_to_text_sync(workbook, sheet_name)
+
+    @staticmethod
+    def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
+        """Extract metadata from spreadsheet using python-calamine.
+
+        Args:
+            workbook: CalamineWorkbook instance
+
+        Returns:
+            Metadata dict using existing metadata keys where possible
+        """
+        metadata: Metadata = {}
+
+        # Extract basic document properties
+        SpreadSheetExtractor._extract_document_properties(workbook, metadata)
+
+        # Add structural information
+        SpreadSheetExtractor._add_structure_info(workbook, metadata)
+
+        # Analyze content complexity
+        SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)
+
+        return metadata
+
+    @staticmethod
+    def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Extract basic document properties from workbook."""
+        with contextlib.suppress(AttributeError, Exception):
+            if not (hasattr(workbook, "metadata") and workbook.metadata):
+                return
+
+            props = workbook.metadata
+
+            # Basic properties mapping
+            property_mapping = {
+                "title": "title",
+                "author": "authors",  # Convert to list
+                "subject": "subject",
+                "comments": "comments",
+                "keywords": "keywords",  # Process separately
+                "category": "categories",  # Convert to list
+                "company": "organization",
+                "manager": "modified_by",
+            }
+
+            for prop_name, meta_key in property_mapping.items():
+                if hasattr(props, prop_name) and (value := getattr(props, prop_name)):
+                    if meta_key in ("authors", "categories"):
+                        metadata[meta_key] = [value]  # type: ignore[literal-required]
+                    elif meta_key == "keywords":
+                        keywords = [k.strip() for k in value.replace(";", ",").split(",") if k.strip()]
+                        if keywords:
+                            metadata[meta_key] = keywords  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = value  # type: ignore[literal-required]
+
+            # Handle dates separately
+            SpreadSheetExtractor._extract_date_properties(props, metadata)
+
+    @staticmethod
+    def _extract_date_properties(props: Any, metadata: Metadata) -> None:
+        """Extract and format date properties."""
+        date_mapping = {"created": "created_at", "modified": "modified_at"}
+
+        for prop_name, meta_key in date_mapping.items():
+            if hasattr(props, prop_name) and (date_value := getattr(props, prop_name)):
+                with contextlib.suppress(Exception):
+                    if hasattr(date_value, "isoformat"):
+                        metadata[meta_key] = date_value.isoformat()  # type: ignore[literal-required]
+                    else:
+                        metadata[meta_key] = str(date_value)  # type: ignore[literal-required]
+
+    @staticmethod
+    def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Add structural information about the spreadsheet."""
+        if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
+            return
+
+        sheet_count = len(workbook.sheet_names)
+        structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"
+
+        # Don't list too many sheet names (magic number made constant)
+        max_sheet_names_to_list = 5
+        if sheet_count <= max_sheet_names_to_list:
+            structure_info += f": {', '.join(workbook.sheet_names)}"
+
+        metadata["description"] = structure_info
+
+    @staticmethod
+    def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
+        """Analyze spreadsheet content for complexity indicators."""
+        with contextlib.suppress(Exception):
+            has_formulas = False
+            total_cells = 0
+
+            # Check only first few sheets for performance
+            max_sheets_to_check = 3
+            max_rows_to_check = 50
+
+            for sheet_name in workbook.sheet_names[:max_sheets_to_check]:
+                with contextlib.suppress(Exception):
+                    sheet = workbook.get_sheet_by_name(sheet_name)
+                    data = sheet.to_python()
+
+                    for row in data[:max_rows_to_check]:
+                        if not row:  # Skip empty rows
+                            continue
+
+                        total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())
+
+                        # Check for formulas (simple heuristic)
+                        if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
+                            has_formulas = True
+                            break
+
+            # Build summary
+            summary_parts = []
+            if total_cells > 0:
+                summary_parts.append(f"Contains {total_cells}+ data cells")
+            if has_formulas:
+                summary_parts.append("includes formulas")
+
+            if summary_parts and "summary" not in metadata:
+                metadata["summary"] = f"Spreadsheet that {', '.join(summary_parts)}."