kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -18,11 +18,8 @@ from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
+from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
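Note: the OCR configuration classes (TesseractConfig, PaddleOCRConfig, EasyOCRConfig) are now imported from kreuzberg._types rather than the per-backend kreuzberg._ocr._* modules. A minimal sketch of the updated import path, taken directly from the + line above:

    # Config classes are consolidated in kreuzberg._types as of 3.13.0.
    from kreuzberg._types import EasyOCRConfig, PaddleOCRConfig, TesseractConfig

    config = TesseractConfig()  # defaults; pass fields to customize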
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
             if self._validate_extracted_text(content):
                 result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
         except ParsingError:
-            # If searchable text extraction fails, continue to OCR or empty result
             pass

         if not result and self.config.ocr_backend is not None:
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
         result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)

         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
             except ImportError:  # pragma: no cover
                 result.tables = []

-        # Enhance metadata with table information
         if result.tables:
             table_summary = generate_table_summary(result.tables)
             result.metadata = result.metadata | {
@@ -126,7 +121,7 @@ class PDFExtractor(Extractor):

         tables = []
         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
@@ -134,7 +129,6 @@ class PDFExtractor(Extractor):
             except ImportError:
                 tables = []

-        # Use playa for better text structure preservation when not using OCR
         if not self.config.force_ocr and self._validate_extracted_text(text):
             text = self._extract_with_playa_sync(path, fallback_text=text)
@@ -148,7 +142,6 @@ class PDFExtractor(Extractor):
             chunks=[],
         )

-        # Enhance metadata with table information
         if tables:
            table_summary = generate_table_summary(tables)
            result.metadata = result.metadata | {
@@ -158,25 +151,9 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }

-        # Apply quality processing
         return self._apply_quality_processing(result)

     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
-        """Check if text extracted from PDF is valid or corrupted.
-
-        This checks for indicators of corrupted PDF text extraction:
-        1. Empty or whitespace-only text
-        2. High concentration of control characters and null bytes
-        3. High concentration of Unicode replacement characters
-
-        Args:
-            text: The extracted text to validate
-            corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
-                characters (default: 0.05 or 5%)
-
-        Returns:
-            True if the text appears valid, False if it seems corrupted
-        """
         if not text or not text.strip():
             return False

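The deleted docstring described the heuristic the retained code still implements: reject empty or whitespace-only text, and reject text where control characters, null bytes, or Unicode replacement characters exceed corruption_threshold (default 5%) of all characters. A standalone sketch of that ratio check; the exact character class is an assumption, since the compiled pattern sits outside this hunk:

    import re

    # Assumed corruption pattern: C0 control chars, null bytes, U+FFFD.
    _CORRUPTION = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ufffd]")

    def looks_valid(text: str, threshold: float = 0.05) -> bool:
        if not text or not text.strip():
            return False
        return len(_CORRUPTION.findall(text)) / len(text) < threshold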
@@ -188,17 +165,6 @@ class PDFExtractor(Extractor):
         return (len(corruption_matches) / len(text)) < corruption_threshold

     async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
-        """Convert a PDF file to images.
-
-        Args:
-            input_file: The path to the PDF file.
-
-        Raises:
-            ParsingError: If the PDF file could not be converted to images.
-
-        Returns:
-            A list of Pillow Images.
-        """
         document: pypdfium2.PdfDocument | None = None
         last_error = None
@@ -206,7 +172,7 @@ class PDFExtractor(Extractor):
            try:
                with pypdfium_file_lock(input_file):
                    document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-                    return [page.render(scale=
+                    return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
            except pypdfium2.PdfiumError as e:  # noqa: PERF203
                last_error = e
                if not should_retry(e, attempt + 1):
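pypdfium2 treats scale as a multiple of the PDF's native 72 DPI, so scale=200 / 72 renders pages at roughly 200 DPI. A minimal standalone sketch of the same call, using only public pypdfium2 APIs:

    import pypdfium2

    pdf = pypdfium2.PdfDocument("sample.pdf")  # illustrative path
    # scale is relative to 72 DPI; 200 / 72 targets ~200 DPI output.
    images = [page.render(scale=200 / 72).to_pil() for page in pdf]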
@@ -238,39 +204,18 @@ class PDFExtractor(Extractor):
         ) from last_error

     async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
-        """Extract text from a scanned PDF file using OCR.
-
-        Args:
-            input_file: The path to the PDF file.
-            ocr_backend: The OCR backend to use.
-
-        Returns:
-            The extraction result with text content and metadata.
-        """
         images = await self._convert_pdf_to_images(input_file)
         backend = get_ocr_backend(ocr_backend)
         ocr_results = await run_taskgroup_batched(
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-        # Use list comprehension and join for efficient string building
         content = "\n".join(result.content for result in ocr_results)

         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])

     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
-        """Extract text from a searchable PDF file using pypdfium2.
-
-        Args:
-            input_file: The path to the PDF file.
-
-        Raises:
-            ParsingError: If the text could not be extracted from the PDF file.
-
-        Returns:
-            The extracted text.
-        """
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
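run_taskgroup_batched caps OCR concurrency by running at most cpu_count() page tasks at a time. A hypothetical helper with the same contract (kreuzberg's own implementation appears to live in kreuzberg._utils._sync and is anyio-based; this asyncio sketch only illustrates the batching idea):

    import asyncio

    async def run_batched(coros, batch_size):
        # Run awaitables in fixed-size batches to bound concurrency.
        results = []
        for i in range(0, len(coros), batch_size):
            results.extend(await asyncio.gather(*coros[i : i + batch_size]))
        return results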
@@ -318,7 +263,6 @@ class PDFExtractor(Extractor):
                 await run_sync(document.close)

     def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
-        """Extract searchable text from PDF using pypdfium2 (sync version)."""
         pdf = None
         try:
             with pypdfium_file_lock(path):
@@ -339,7 +283,6 @@ class PDFExtractor(Extractor):
                 pdf.close()

     def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
-        """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
             images = []
@@ -352,23 +295,7 @@ class PDFExtractor(Extractor):
                     bitmap.close()
                     page.close()

-
-            temp_files = []
-
-            try:
-                for i, img in enumerate(images):
-                    fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
-                    temp_files.append((fd, temp_path))
-                    img.save(temp_path, format="PNG")
-                    os.close(fd)
-                    image_paths.append(temp_path)
-
-                return self._process_pdf_images_with_ocr(image_paths)
-
-            finally:
-                for _, temp_path in temp_files:
-                    with contextlib.suppress(OSError):
-                        Path(temp_path).unlink()
+            return self._process_pdf_images_with_ocr_direct(images)

         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -378,7 +305,6 @@ class PDFExtractor(Extractor):
                 pdf.close()

     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        """Process PDF images with the configured OCR backend."""
         backend = get_ocr_backend(self.config.ocr_backend)
         paths = [Path(p) for p in image_paths]

@@ -401,18 +327,48 @@ class PDFExtractor(Extractor):
             case _:
                 raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")

-
+        return "\n\n".join(result.content for result in results)
+
+    def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+        """Process PIL images directly without temp files."""
+        backend = get_ocr_backend(self.config.ocr_backend)
+
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(config))
+                    results.append(result)
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(paddle_config))
+                    results.append(result)
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(easy_config))
+                    results.append(result)
+            case _:
+                raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
+
         return "\n\n".join(result.content for result in results)

     def _parse_with_password_attempts(self, content: bytes) -> Document:
-        """Parse PDF with password attempts."""
-        # Normalize password to list
         if isinstance(self.config.pdf_password, str):
             passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
         else:
             passwords = list(self.config.pdf_password)

-        # Try each password in sequence
         last_exception = None
         for password in passwords:
             try:
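The new _process_pdf_images_with_ocr_direct hands PIL images straight to the backend's process_image_sync, replacing the old path that wrote each page to a temporary PNG and deleted it afterwards. A rough usage sketch under the same assumptions as the hunk (backend registry and config classes as imported above; asdict is dataclasses.asdict):

    from dataclasses import asdict

    from kreuzberg._ocr import get_ocr_backend
    from kreuzberg._types import TesseractConfig

    backend = get_ocr_backend("tesseract")
    # images: PIL.Image.Image pages rendered from the PDF
    text = "\n\n".join(
        backend.process_image_sync(img, **asdict(TesseractConfig())).content
        for img in images
    )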
@@ -421,21 +377,17 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, raise the last exception
         if last_exception:
             raise last_exception from None

-        # Fallback to no password
         return parse(content, max_workers=1, password="")

     def _get_passwords_to_try(self) -> list[str]:
-        """Get list of passwords to try in sequence."""
         if isinstance(self.config.pdf_password, str):
             return [self.config.pdf_password] if self.config.pdf_password else [""]
         return list(self.config.pdf_password) if self.config.pdf_password else [""]

     async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts."""
         passwords = self._get_passwords_to_try()

         last_exception = None
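As the removed comments spelled out, pdf_password may be a single string or an iterable of candidates; each is tried in order, with an empty password as the final fallback. A hypothetical configuration sketch (the pdf_password field name comes from this diff; the surrounding ExtractionConfig usage is assumed):

    from kreuzberg import ExtractionConfig

    # One password, or several candidates tried in sequence:
    config = ExtractionConfig(pdf_password="hunter2")
    config = ExtractionConfig(pdf_password=["hunter2", "fallback-pass"])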
@@ -446,7 +398,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, try with empty password as fallback
         try:
             return await extract_pdf_metadata(content, password="")
         except Exception:
@@ -455,7 +406,6 @@ class PDFExtractor(Extractor):
             raise

     def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts (sync version)."""
         passwords = self._get_passwords_to_try()

         last_exception = None
@@ -466,7 +416,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue

-        # If all passwords failed, try with empty password as fallback
         try:
             return extract_pdf_metadata_sync(content, password="")
         except Exception:
@@ -475,12 +424,10 @@ class PDFExtractor(Extractor):
             raise

     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
-        """Extract text using playa for better structure preservation."""
         with contextlib.suppress(Exception):
             content = path.read_bytes()
             document = self._parse_with_password_attempts(content)

-            # Extract text while preserving structure
             pages_text = []
             for page in document.pages:
                 page_text = page.extract_text()
kreuzberg/_extractors/_presentation.py
CHANGED
@@ -1,12 +1,3 @@
-"""This module provides functions to extract textual content from files.
-
-It includes vendored code:
-
-- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
-  See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
-  Refer to the markitdown repository for it's license (MIT).
-"""
-
 from __future__ import annotations

 import re
@@ -30,7 +21,6 @@ if TYPE_CHECKING:  # pragma: no cover

     from kreuzberg._types import Metadata

-# Pre-compiled regex patterns for performance
 _NON_WORD_PATTERN = re.compile(r"\W")

@@ -201,15 +191,12 @@ class PresentationExtractor(Extractor):
         """
         metadata: Metadata = {}

-        # Extract core properties
         PresentationExtractor._extract_core_properties(presentation, metadata)

-        # Extract fonts used in presentation
         fonts = PresentationExtractor._extract_fonts(presentation)
         if fonts:
             metadata["fonts"] = list(fonts)

-        # Add structural information
         PresentationExtractor._add_presentation_structure_info(presentation, metadata, fonts)

         return metadata
@@ -217,7 +204,6 @@ class PresentationExtractor(Extractor):
     @staticmethod
     def _extract_core_properties(presentation: Presentation, metadata: Metadata) -> None:
         """Extract core document properties from presentation."""
-        # Property mapping for core metadata
         property_mapping = [
             ("authors", "author"),
             ("comments", "comments"),
@@ -236,7 +222,6 @@ class PresentationExtractor(Extractor):
             if core_property := getattr(presentation.core_properties, core_property_key, None):
                 metadata[metadata_key] = core_property  # type: ignore[literal-required]

-        # Handle special list properties
         if presentation.core_properties.language:
             metadata["languages"] = [presentation.core_properties.language]

@@ -265,7 +250,6 @@ class PresentationExtractor(Extractor):
         if slide_count == 0:
             return

-        # Build description
         structure_info = f"Presentation with {slide_count} slide{'s' if slide_count != 1 else ''}"

         slides_with_notes = sum(1 for slide in presentation.slides if slide.has_notes_slide)
@@ -274,7 +258,6 @@ class PresentationExtractor(Extractor):

         metadata["description"] = structure_info

-        # Build summary if not already present
         if "summary" not in metadata:
             summary_parts = [f"PowerPoint presentation with {slide_count} slides"]
             if slides_with_notes > 0:
kreuzberg/_extractors/_spread_sheet.py
CHANGED
@@ -10,15 +10,17 @@ from io import StringIO
 from pathlib import Path
 from typing import Any

+import polars as pl
 from anyio import Path as AsyncPath
 from PIL import Image
 from python_calamine import CalamineWorkbook

 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
-from kreuzberg._types import ExtractionResult, Metadata
+from kreuzberg._types import ExtractionResult, Metadata, TableData
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync, run_taskgroup
+from kreuzberg._utils._table import enhance_table_markdown
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ParsingError

@@ -108,14 +110,6 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _convert_cell_to_str(value: Any) -> str:
-        """Convert a cell value to string representation.
-
-        Args:
-            value: The cell value to convert.
-
-        Returns:
-            String representation of the cell value.
-        """
         if value is None:
             return ""
         if isinstance(value, bool):
@@ -139,7 +133,7 @@ class SpreadSheetExtractor(Extractor):
         csv_buffer.close()

         csv_path, unlink = await create_temp_file(".csv")
-        await AsyncPath(csv_path).write_text(csv_data)
+        await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")

         csv_reader = csv.reader(StringIO(csv_data))
         rows = list(csv_reader)
@@ -162,7 +156,6 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"

     def _convert_sheet_to_text_sync(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Synchronous version of _convert_sheet_to_text."""
         values = workbook.get_sheet_by_name(sheet_name).to_python()

         csv_buffer = StringIO()
@@ -195,82 +188,57 @@ class SpreadSheetExtractor(Extractor):
         return f"## {sheet_name}\n\n{normalize_spaces(result)}"

     def _enhance_sheet_with_table_data(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
-        """Enhanced sheet processing with better table structure preservation."""
         try:
-            # pandas is optional dependency
-            import pandas as pd  # noqa: PLC0415
-
-            from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
-
             sheet = workbook.get_sheet_by_name(sheet_name)
             data = sheet.to_python()

             if not data or not any(row for row in data):
                 return f"## {sheet_name}\n\n*Empty sheet*"

-
-            df = pd.DataFrame(data)
+            df = pl.DataFrame(data)

-
-            df = df.
+            df = df.filter(~pl.all_horizontal(pl.all().is_null()))
+            df = df.select([col for col in df.columns if not df[col].is_null().all()])

-            if df.
+            if df.is_empty():
                 return f"## {sheet_name}\n\n*No data*"

-            # Create a mock TableData for enhanced formatting
-            from kreuzberg._types import TableData  # noqa: PLC0415
-
-            # Create a 1x1 transparent image as placeholder
             placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
             mock_table: TableData = {"df": df, "text": "", "page_number": 0, "cropped_image": placeholder_image}

             enhanced_markdown = enhance_table_markdown(mock_table)
             return f"## {sheet_name}\n\n{enhanced_markdown}"

-        except (
-            # Fallback to original method if pandas/table enhancement fails
+        except (AttributeError, ValueError):
             return self._convert_sheet_to_text_sync(workbook, sheet_name)

     @staticmethod
     def _extract_spreadsheet_metadata(workbook: CalamineWorkbook) -> Metadata:
-        """Extract metadata from spreadsheet using python-calamine.
-
-        Args:
-            workbook: CalamineWorkbook instance
-
-        Returns:
-            Metadata dict using existing metadata keys where possible
-        """
         metadata: Metadata = {}

-        # Extract basic document properties
         SpreadSheetExtractor._extract_document_properties(workbook, metadata)

-        # Add structural information
         SpreadSheetExtractor._add_structure_info(workbook, metadata)

-        # Analyze content complexity
         SpreadSheetExtractor._analyze_content_complexity(workbook, metadata)

         return metadata

     @staticmethod
     def _extract_document_properties(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Extract basic document properties from workbook."""
         with contextlib.suppress(AttributeError, Exception):
             if not (hasattr(workbook, "metadata") and workbook.metadata):
                 return

             props = workbook.metadata

-            # Basic properties mapping
             property_mapping = {
                 "title": "title",
-                "author": "authors",
+                "author": "authors",
                 "subject": "subject",
                 "comments": "comments",
-                "keywords": "keywords",
-                "category": "categories",
+                "keywords": "keywords",
+                "category": "categories",
                 "company": "organization",
                 "manager": "modified_by",
             }
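The pandas cleanup was rewritten with polars expressions: pl.all_horizontal(pl.all().is_null()) marks rows whose every column is null, the negated filter drops them, and the select keeps only columns with at least one non-null value. A self-contained sketch of the same idiom (the sample frame is illustrative):

    import polars as pl

    df = pl.DataFrame({"a": [1, None, None], "b": ["x", None, "y"]})
    # Drop rows where every column is null (the middle row here).
    df = df.filter(~pl.all_horizontal(pl.all().is_null()))
    # Drop columns that are entirely null (none in this sample).
    df = df.select([c for c in df.columns if not df[c].is_null().all()])
    print(df.shape)  # (2, 2)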
@@ -286,12 +254,10 @@ class SpreadSheetExtractor(Extractor):
                 else:
                     metadata[meta_key] = value  # type: ignore[literal-required]

-            # Handle dates separately
             SpreadSheetExtractor._extract_date_properties(props, metadata)

     @staticmethod
     def _extract_date_properties(props: Any, metadata: Metadata) -> None:
-        """Extract and format date properties."""
         date_mapping = {"created": "created_at", "modified": "modified_at"}

         for prop_name, meta_key in date_mapping.items():
@@ -304,14 +270,12 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _add_structure_info(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Add structural information about the spreadsheet."""
         if not (hasattr(workbook, "sheet_names") and workbook.sheet_names):
             return

         sheet_count = len(workbook.sheet_names)
         structure_info = f"Spreadsheet with {sheet_count} sheet{'s' if sheet_count != 1 else ''}"

-        # Don't list too many sheet names (magic number made constant)
         max_sheet_names_to_list = 5
         if sheet_count <= max_sheet_names_to_list:
             structure_info += f": {', '.join(workbook.sheet_names)}"
@@ -320,12 +284,10 @@ class SpreadSheetExtractor(Extractor):

     @staticmethod
     def _analyze_content_complexity(workbook: CalamineWorkbook, metadata: Metadata) -> None:
-        """Analyze spreadsheet content for complexity indicators."""
         with contextlib.suppress(Exception):
             has_formulas = False
             total_cells = 0

-            # Check only first few sheets for performance
             max_sheets_to_check = 3
             max_rows_to_check = 50

@@ -335,17 +297,15 @@ class SpreadSheetExtractor(Extractor):
                 data = sheet.to_python()

                 for row in data[:max_rows_to_check]:
-                    if not row:
+                    if not row:
                         continue

                     total_cells += sum(1 for cell in row if cell is not None and str(cell).strip())

-                    # Check for formulas (simple heuristic)
                     if any(isinstance(cell, str) and cell.startswith("=") for cell in row):
                         has_formulas = True
                         break

-            # Build summary
             summary_parts = []
             if total_cells > 0:
                 summary_parts.append(f"Contains {total_cells}+ data cells")
kreuzberg/_extractors/_structured.py
CHANGED
@@ -28,7 +28,6 @@ from kreuzberg._utils._sync import run_sync
 if TYPE_CHECKING:
     from pathlib import Path

-# Define text field keywords as a set for O(1) membership testing
 _TEXT_FIELD_KEYWORDS = frozenset({"title", "name", "subject", "description", "content", "body", "text", "message"})

|
@@ -79,7 +78,6 @@ class StructuredDataExtractor(Extractor):
|
|
79
78
|
text_parts: list[str] = []
|
80
79
|
metadata: dict[str, Any] = {}
|
81
80
|
|
82
|
-
# Use match statement for cleaner code and avoid multiple isinstance calls
|
83
81
|
if isinstance(data, dict):
|
84
82
|
text_parts = self._extract_from_dict(data, metadata)
|
85
83
|
elif isinstance(data, list):
|
@@ -96,7 +94,7 @@ class StructuredDataExtractor(Extractor):
                 chunks=[],
             )

-        except (
+        except (ValueError, TypeError) as e:
             return ExtractionResult(
                 content=normalize_spaces(text_content),
                 mime_type=PLAIN_TEXT_MIME_TYPE,
@@ -117,7 +115,6 @@ class StructuredDataExtractor(Extractor):
            if isinstance(value, str) and value.strip():
                text_parts.append(f"{full_key}: {value}")

-           # Check if key contains any text field keywords efficiently
            key_lower = key.lower()
            if any(keyword in key_lower for keyword in _TEXT_FIELD_KEYWORDS):
                metadata[full_key] = value