kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -84,8 +84,6 @@ NodeType = Literal[
|
|
84
84
|
|
85
85
|
|
86
86
|
class PandocExtractor(Extractor):
|
87
|
-
"""Extractor for documents supported by Pandoc."""
|
88
|
-
|
89
87
|
_checked_version: bool = False
|
90
88
|
|
91
89
|
MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
|
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
|
|
153
151
|
}
|
154
152
|
|
155
153
|
async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
|
156
|
-
"""Extract text and metadata from bytes content using Pandoc.
|
157
|
-
|
158
|
-
Args:
|
159
|
-
content: The content bytes to process.
|
160
|
-
|
161
|
-
Returns:
|
162
|
-
ExtractionResult with the extracted text and metadata.
|
163
|
-
"""
|
164
154
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
165
155
|
input_file, unlink = await create_temp_file(f".{extension}")
|
166
156
|
|
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
|
|
171
161
|
await unlink()
|
172
162
|
|
173
163
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
174
|
-
"""Extract text and metadata from a file using Pandoc.
|
175
|
-
|
176
|
-
Args:
|
177
|
-
path: The path to the file to process.
|
178
|
-
|
179
|
-
Raises:
|
180
|
-
ParsingError: If the file data could not be extracted.
|
181
|
-
|
182
|
-
Returns:
|
183
|
-
ExtractionResult with the extracted text and metadata.
|
184
|
-
"""
|
185
164
|
await self._validate_pandoc_version()
|
186
165
|
self._get_pandoc_type_from_mime_type(self.mime_type)
|
187
166
|
|
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
|
|
198
177
|
raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
|
199
178
|
|
200
179
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
201
|
-
"""Pure sync implementation of extract_bytes.
|
202
|
-
|
203
|
-
Args:
|
204
|
-
content: The content bytes to process.
|
205
|
-
|
206
|
-
Returns:
|
207
|
-
ExtractionResult with the extracted text and metadata.
|
208
|
-
"""
|
209
180
|
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
210
181
|
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
211
182
|
|
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
|
|
219
190
|
Path(temp_path).unlink()
|
220
191
|
|
221
192
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
222
|
-
"""Pure sync implementation of extract_path.
|
223
|
-
|
224
|
-
Args:
|
225
|
-
path: The path to the file to process.
|
226
|
-
|
227
|
-
Returns:
|
228
|
-
ExtractionResult with the extracted text and metadata.
|
229
|
-
|
230
|
-
Raises:
|
231
|
-
ParsingError: When file processing fails.
|
232
|
-
"""
|
233
193
|
self._validate_pandoc_version_sync()
|
234
194
|
self._get_pandoc_type_from_mime_type(self.mime_type)
|
235
195
|
|
@@ -244,18 +204,13 @@ class PandocExtractor(Extractor):
|
|
244
204
|
raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
|
245
205
|
|
246
206
|
async def _validate_pandoc_version(self) -> None:
|
247
|
-
"""Validate that the installed Pandoc version meets the minimum requirement.
|
248
|
-
|
249
|
-
Raises:
|
250
|
-
MissingDependencyError: If Pandoc is not installed or version is too low
|
251
|
-
"""
|
252
207
|
try:
|
253
208
|
if self._checked_version:
|
254
209
|
return
|
255
210
|
|
256
211
|
command = ["pandoc", "--version"]
|
257
212
|
result = await run_process(command)
|
258
|
-
stdout = result.stdout.decode()
|
213
|
+
stdout = result.stdout.decode("utf-8")
|
259
214
|
|
260
215
|
version_match = re.search(
|
261
216
|
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
@@ -299,14 +254,6 @@ class PandocExtractor(Extractor):
|
|
299
254
|
|
300
255
|
@staticmethod
|
301
256
|
def _get_pandoc_key(key: str) -> str | None:
|
302
|
-
"""Map Pandoc metadata keys to our standard metadata keys.
|
303
|
-
|
304
|
-
Args:
|
305
|
-
key: The key from Pandoc metadata
|
306
|
-
|
307
|
-
Returns:
|
308
|
-
The mapped key name for our system, or None if not mapped
|
309
|
-
"""
|
310
257
|
if key == "abstract":
|
311
258
|
return "summary"
|
312
259
|
|
@@ -325,17 +272,6 @@ class PandocExtractor(Extractor):
|
|
325
272
|
return key
|
326
273
|
|
327
274
|
def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
|
328
|
-
"""Get Pandoc format type from MIME type.
|
329
|
-
|
330
|
-
Args:
|
331
|
-
mime_type: The MIME type to look up
|
332
|
-
|
333
|
-
Returns:
|
334
|
-
The corresponding Pandoc type
|
335
|
-
|
336
|
-
Raises:
|
337
|
-
ValidationError: If mime_type is not supported
|
338
|
-
"""
|
339
275
|
if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
|
340
276
|
return pandoc_type
|
341
277
|
|
@@ -349,17 +285,6 @@ class PandocExtractor(Extractor):
|
|
349
285
|
raise ValidationError(f"Unsupported mime type: {mime_type}")
|
350
286
|
|
351
287
|
async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
|
352
|
-
"""Extract metadata from a file using Pandoc.
|
353
|
-
|
354
|
-
Args:
|
355
|
-
input_file: The file to extract metadata from
|
356
|
-
|
357
|
-
Returns:
|
358
|
-
The extracted metadata
|
359
|
-
|
360
|
-
Raises:
|
361
|
-
ParsingError: If metadata extraction fails
|
362
|
-
"""
|
363
288
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
364
289
|
metadata_file, unlink = await create_temp_file(".json")
|
365
290
|
try:
|
@@ -389,17 +314,6 @@ class PandocExtractor(Extractor):
|
|
389
314
|
await unlink()
|
390
315
|
|
391
316
|
async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
|
392
|
-
"""Extract text content from a file using Pandoc.
|
393
|
-
|
394
|
-
Args:
|
395
|
-
input_file: The file to extract content from
|
396
|
-
|
397
|
-
Returns:
|
398
|
-
The extracted text content
|
399
|
-
|
400
|
-
Raises:
|
401
|
-
ParsingError: If content extraction fails
|
402
|
-
"""
|
403
317
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
404
318
|
output_path, unlink = await create_temp_file(".md")
|
405
319
|
try:
|
@@ -431,14 +345,6 @@ class PandocExtractor(Extractor):
|
|
431
345
|
await unlink()
|
432
346
|
|
433
347
|
def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
|
434
|
-
"""Extract structured metadata from Pandoc JSON metadata.
|
435
|
-
|
436
|
-
Args:
|
437
|
-
raw_meta: The raw metadata from Pandoc
|
438
|
-
|
439
|
-
Returns:
|
440
|
-
Structured metadata
|
441
|
-
"""
|
442
348
|
meta: Metadata = {}
|
443
349
|
|
444
350
|
if (
|
@@ -485,16 +391,6 @@ class PandocExtractor(Extractor):
|
|
485
391
|
return meta
|
486
392
|
|
487
393
|
def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
|
488
|
-
"""Extract text from an inline node in a document structure.
|
489
|
-
|
490
|
-
Args:
|
491
|
-
node: The node to extract text from
|
492
|
-
type_field: The field name for the node type
|
493
|
-
content_field: The field name for the node content
|
494
|
-
|
495
|
-
Returns:
|
496
|
-
The extracted text or None if no text could be extracted
|
497
|
-
"""
|
498
394
|
if node_type := node.get(type_field):
|
499
395
|
if node_type == "Str":
|
500
396
|
return node.get(content_field)
|
@@ -505,29 +401,11 @@ class PandocExtractor(Extractor):
|
|
505
401
|
return None
|
506
402
|
|
507
403
|
def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
|
508
|
-
"""Extract text from a list of inline nodes.
|
509
|
-
|
510
|
-
Args:
|
511
|
-
nodes: The list of nodes to extract text from
|
512
|
-
|
513
|
-
Returns:
|
514
|
-
The extracted text or None if no text could be extracted
|
515
|
-
"""
|
516
404
|
texts = [text for node in nodes if (text := self._extract_inline_text(node))]
|
517
405
|
result = "".join(texts).strip()
|
518
406
|
return result if result else None
|
519
407
|
|
520
408
|
def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
|
521
|
-
"""Extract a metadata value from a node.
|
522
|
-
|
523
|
-
Args:
|
524
|
-
node: The node to extract metadata from
|
525
|
-
type_field: The field name for the node type
|
526
|
-
content_field: The field name for the node content
|
527
|
-
|
528
|
-
Returns:
|
529
|
-
The extracted metadata value or None if no metadata could be extracted
|
530
|
-
"""
|
531
409
|
if not isinstance(node, dict) or type_field not in node:
|
532
410
|
return None
|
533
411
|
|
@@ -577,12 +455,17 @@ class PandocExtractor(Extractor):
|
|
577
455
|
return None
|
578
456
|
|
579
457
|
def _validate_pandoc_version_sync(self) -> None:
|
580
|
-
"""Synchronous version of _validate_pandoc_version."""
|
581
458
|
try:
|
582
459
|
if self._checked_version:
|
583
460
|
return
|
584
461
|
|
585
|
-
result = subprocess.run(
|
462
|
+
result = subprocess.run(
|
463
|
+
["pandoc", "--version"], # noqa: S607
|
464
|
+
capture_output=True,
|
465
|
+
text=True,
|
466
|
+
check=False,
|
467
|
+
encoding="utf-8",
|
468
|
+
)
|
586
469
|
|
587
470
|
if result.returncode != 0:
|
588
471
|
raise MissingDependencyError(
|
@@ -621,7 +504,6 @@ class PandocExtractor(Extractor):
|
|
621
504
|
) from e
|
622
505
|
|
623
506
|
def _extract_metadata_sync(self, path: Path) -> Metadata:
|
624
|
-
"""Synchronous version of _handle_extract_metadata."""
|
625
507
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
626
508
|
fd, metadata_file = tempfile.mkstemp(suffix=".json")
|
627
509
|
os.close(fd)
|
@@ -638,7 +520,7 @@ class PandocExtractor(Extractor):
|
|
638
520
|
str(metadata_file),
|
639
521
|
]
|
640
522
|
|
641
|
-
result = subprocess.run(command, capture_output=True, text=True, check=False)
|
523
|
+
result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
|
642
524
|
|
643
525
|
if result.returncode != 0:
|
644
526
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|
@@ -655,7 +537,6 @@ class PandocExtractor(Extractor):
|
|
655
537
|
Path(metadata_file).unlink()
|
656
538
|
|
657
539
|
def _extract_file_sync(self, path: Path) -> str:
|
658
|
-
"""Synchronous version of _handle_extract_file."""
|
659
540
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
660
541
|
fd, output_path = tempfile.mkstemp(suffix=".md")
|
661
542
|
os.close(fd)
|
@@ -673,7 +554,7 @@ class PandocExtractor(Extractor):
|
|
673
554
|
str(output_path),
|
674
555
|
]
|
675
556
|
|
676
|
-
result = subprocess.run(command, capture_output=True, text=True, check=False)
|
557
|
+
result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
|
677
558
|
|
678
559
|
if result.returncode != 0:
|
679
560
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|
@@ -691,8 +572,6 @@ class PandocExtractor(Extractor):
|
|
691
572
|
|
692
573
|
|
693
574
|
class MarkdownExtractor(PandocExtractor):
|
694
|
-
"""Extractor for Markdown-based document formats."""
|
695
|
-
|
696
575
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
697
576
|
"text/x-markdown",
|
698
577
|
"text/x-commonmark",
|
@@ -704,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
|
|
704
583
|
|
705
584
|
|
706
585
|
class OfficeDocumentExtractor(PandocExtractor):
|
707
|
-
"""Extractor for Office document formats (Word, ODT)."""
|
708
|
-
|
709
586
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
710
587
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
711
588
|
"application/vnd.oasis.opendocument.text",
|
@@ -713,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
|
|
713
590
|
|
714
591
|
|
715
592
|
class EbookExtractor(PandocExtractor):
|
716
|
-
"""Extractor for e-book formats (EPUB, FB2)."""
|
717
|
-
|
718
593
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
719
594
|
"application/epub+zip",
|
720
595
|
"application/x-fictionbook+xml",
|
@@ -722,8 +597,6 @@ class EbookExtractor(PandocExtractor):
|
|
722
597
|
|
723
598
|
|
724
599
|
class StructuredTextExtractor(PandocExtractor):
|
725
|
-
"""Extractor for structured text formats (RST, Org, etc.)."""
|
726
|
-
|
727
600
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
728
601
|
"text/x-rst",
|
729
602
|
"text/x-org",
|
@@ -733,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
|
|
733
606
|
|
734
607
|
|
735
608
|
class LaTeXExtractor(PandocExtractor):
|
736
|
-
"""Extractor for LaTeX and Typst documents."""
|
737
|
-
|
738
609
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
739
610
|
"application/x-latex",
|
740
611
|
"application/x-typst",
|
@@ -742,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
|
|
742
613
|
|
743
614
|
|
744
615
|
class BibliographyExtractor(PandocExtractor):
|
745
|
-
"""Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
|
746
|
-
|
747
616
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
748
617
|
"application/x-bibtex",
|
749
618
|
"application/x-biblatex",
|
@@ -754,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
|
|
754
623
|
|
755
624
|
|
756
625
|
class XMLBasedExtractor(PandocExtractor):
|
757
|
-
"""Extractor for XML-based document formats (DocBook, JATS, OPML)."""
|
758
|
-
|
759
626
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
760
627
|
"application/docbook+xml",
|
761
628
|
"application/x-jats+xml",
|
@@ -764,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
|
|
764
631
|
|
765
632
|
|
766
633
|
class TabularDataExtractor(PandocExtractor):
|
767
|
-
"""Extractor for tabular data formats (CSV, TSV)."""
|
768
|
-
|
769
634
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
770
635
|
"text/csv",
|
771
636
|
"text/tab-separated-values",
|
@@ -773,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
|
|
773
638
|
|
774
639
|
|
775
640
|
class MiscFormatExtractor(PandocExtractor):
|
776
|
-
"""Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
|
777
|
-
|
778
641
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
|
779
642
|
"application/rtf",
|
780
643
|
"text/troff",
|
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -18,11 +18,8 @@ from playa import parse
|
|
18
18
|
from kreuzberg._extractors._base import Extractor
|
19
19
|
from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
20
20
|
from kreuzberg._ocr import get_ocr_backend
|
21
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
22
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
23
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
24
21
|
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
25
|
-
from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
|
22
|
+
from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
|
26
23
|
from kreuzberg._utils._errors import create_error_context, should_retry
|
27
24
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
28
25
|
from kreuzberg._utils._string import normalize_spaces
|
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
|
|
65
62
|
if self._validate_extracted_text(content):
|
66
63
|
result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
67
64
|
except ParsingError:
|
68
|
-
# If searchable text extraction fails, continue to OCR or empty result
|
69
65
|
pass
|
70
66
|
|
71
67
|
if not result and self.config.ocr_backend is not None:
|
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
|
|
77
73
|
result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
|
78
74
|
|
79
75
|
if self.config.extract_tables:
|
80
|
-
# GMFT is optional dependency
|
76
|
+
# GMFT is optional dependency ~keep
|
81
77
|
try:
|
82
78
|
from kreuzberg._gmft import extract_tables # noqa: PLC0415
|
83
79
|
|
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
|
|
85
81
|
except ImportError: # pragma: no cover
|
86
82
|
result.tables = []
|
87
83
|
|
88
|
-
# Enhance metadata with table information
|
89
84
|
if result.tables:
|
90
85
|
table_summary = generate_table_summary(result.tables)
|
91
86
|
result.metadata = result.metadata | {
|
@@ -98,7 +93,6 @@ class PDFExtractor(Extractor):
|
|
98
93
|
return self._apply_quality_processing(result)
|
99
94
|
|
100
95
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
101
|
-
"""Pure sync implementation of PDF extraction from bytes."""
|
102
96
|
fd, temp_path = tempfile.mkstemp(suffix=".pdf")
|
103
97
|
try:
|
104
98
|
with os.fdopen(fd, "wb") as f:
|
@@ -115,7 +109,6 @@ class PDFExtractor(Extractor):
|
|
115
109
|
Path(temp_path).unlink()
|
116
110
|
|
117
111
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
118
|
-
"""Pure sync implementation of PDF extraction from path."""
|
119
112
|
try:
|
120
113
|
text = self._extract_pdf_searchable_text_sync(path)
|
121
114
|
except ParsingError:
|
@@ -126,7 +119,7 @@ class PDFExtractor(Extractor):
|
|
126
119
|
|
127
120
|
tables = []
|
128
121
|
if self.config.extract_tables:
|
129
|
-
# GMFT is optional dependency
|
122
|
+
# GMFT is optional dependency ~keep
|
130
123
|
try:
|
131
124
|
from kreuzberg._gmft import extract_tables_sync # noqa: PLC0415
|
132
125
|
|
@@ -134,7 +127,6 @@ class PDFExtractor(Extractor):
|
|
134
127
|
except ImportError:
|
135
128
|
tables = []
|
136
129
|
|
137
|
-
# Use playa for better text structure preservation when not using OCR
|
138
130
|
if not self.config.force_ocr and self._validate_extracted_text(text):
|
139
131
|
text = self._extract_with_playa_sync(path, fallback_text=text)
|
140
132
|
|
@@ -148,7 +140,6 @@ class PDFExtractor(Extractor):
|
|
148
140
|
chunks=[],
|
149
141
|
)
|
150
142
|
|
151
|
-
# Enhance metadata with table information
|
152
143
|
if tables:
|
153
144
|
table_summary = generate_table_summary(tables)
|
154
145
|
result.metadata = result.metadata | {
|
@@ -158,25 +149,9 @@ class PDFExtractor(Extractor):
|
|
158
149
|
f"{table_summary['total_rows']} total rows",
|
159
150
|
}
|
160
151
|
|
161
|
-
# Apply quality processing
|
162
152
|
return self._apply_quality_processing(result)
|
163
153
|
|
164
154
|
def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
|
165
|
-
"""Check if text extracted from PDF is valid or corrupted.
|
166
|
-
|
167
|
-
This checks for indicators of corrupted PDF text extraction:
|
168
|
-
1. Empty or whitespace-only text
|
169
|
-
2. High concentration of control characters and null bytes
|
170
|
-
3. High concentration of Unicode replacement characters
|
171
|
-
|
172
|
-
Args:
|
173
|
-
text: The extracted text to validate
|
174
|
-
corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
|
175
|
-
characters (default: 0.05 or 5%)
|
176
|
-
|
177
|
-
Returns:
|
178
|
-
True if the text appears valid, False if it seems corrupted
|
179
|
-
"""
|
180
155
|
if not text or not text.strip():
|
181
156
|
return False
|
182
157
|
|
@@ -188,17 +163,6 @@ class PDFExtractor(Extractor):
|
|
188
163
|
return (len(corruption_matches) / len(text)) < corruption_threshold
|
189
164
|
|
190
165
|
async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
|
191
|
-
"""Convert a PDF file to images.
|
192
|
-
|
193
|
-
Args:
|
194
|
-
input_file: The path to the PDF file.
|
195
|
-
|
196
|
-
Raises:
|
197
|
-
ParsingError: If the PDF file could not be converted to images.
|
198
|
-
|
199
|
-
Returns:
|
200
|
-
A list of Pillow Images.
|
201
|
-
"""
|
202
166
|
document: pypdfium2.PdfDocument | None = None
|
203
167
|
last_error = None
|
204
168
|
|
@@ -206,7 +170,7 @@ class PDFExtractor(Extractor):
|
|
206
170
|
try:
|
207
171
|
with pypdfium_file_lock(input_file):
|
208
172
|
document = await run_sync(pypdfium2.PdfDocument, str(input_file))
|
209
|
-
return [page.render(scale=
|
173
|
+
return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
|
210
174
|
except pypdfium2.PdfiumError as e: # noqa: PERF203
|
211
175
|
last_error = e
|
212
176
|
if not should_retry(e, attempt + 1):
|
@@ -238,39 +202,18 @@ class PDFExtractor(Extractor):
|
|
238
202
|
) from last_error
|
239
203
|
|
240
204
|
async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
|
241
|
-
"""Extract text from a scanned PDF file using OCR.
|
242
|
-
|
243
|
-
Args:
|
244
|
-
input_file: The path to the PDF file.
|
245
|
-
ocr_backend: The OCR backend to use.
|
246
|
-
|
247
|
-
Returns:
|
248
|
-
The extraction result with text content and metadata.
|
249
|
-
"""
|
250
205
|
images = await self._convert_pdf_to_images(input_file)
|
251
206
|
backend = get_ocr_backend(ocr_backend)
|
252
207
|
ocr_results = await run_taskgroup_batched(
|
253
208
|
*[backend.process_image(image, **self.config.get_config_dict()) for image in images],
|
254
209
|
batch_size=cpu_count(),
|
255
210
|
)
|
256
|
-
# Use list comprehension and join for efficient string building
|
257
211
|
content = "\n".join(result.content for result in ocr_results)
|
258
212
|
|
259
213
|
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
260
214
|
|
261
215
|
@staticmethod
|
262
216
|
async def _extract_pdf_searchable_text(input_file: Path) -> str:
|
263
|
-
"""Extract text from a searchable PDF file using pypdfium2.
|
264
|
-
|
265
|
-
Args:
|
266
|
-
input_file: The path to the PDF file.
|
267
|
-
|
268
|
-
Raises:
|
269
|
-
ParsingError: If the text could not be extracted from the PDF file.
|
270
|
-
|
271
|
-
Returns:
|
272
|
-
The extracted text.
|
273
|
-
"""
|
274
217
|
document: pypdfium2.PdfDocument | None = None
|
275
218
|
try:
|
276
219
|
with pypdfium_file_lock(input_file):
|
@@ -318,7 +261,6 @@ class PDFExtractor(Extractor):
|
|
318
261
|
await run_sync(document.close)
|
319
262
|
|
320
263
|
def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
|
321
|
-
"""Extract searchable text from PDF using pypdfium2 (sync version)."""
|
322
264
|
pdf = None
|
323
265
|
try:
|
324
266
|
with pypdfium_file_lock(path):
|
@@ -339,7 +281,6 @@ class PDFExtractor(Extractor):
|
|
339
281
|
pdf.close()
|
340
282
|
|
341
283
|
def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
|
342
|
-
"""Extract text from PDF using OCR (sync version)."""
|
343
284
|
pdf = None
|
344
285
|
try:
|
345
286
|
images = []
|
@@ -352,23 +293,7 @@ class PDFExtractor(Extractor):
|
|
352
293
|
bitmap.close()
|
353
294
|
page.close()
|
354
295
|
|
355
|
-
|
356
|
-
temp_files = []
|
357
|
-
|
358
|
-
try:
|
359
|
-
for i, img in enumerate(images):
|
360
|
-
fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
|
361
|
-
temp_files.append((fd, temp_path))
|
362
|
-
img.save(temp_path, format="PNG")
|
363
|
-
os.close(fd)
|
364
|
-
image_paths.append(temp_path)
|
365
|
-
|
366
|
-
return self._process_pdf_images_with_ocr(image_paths)
|
367
|
-
|
368
|
-
finally:
|
369
|
-
for _, temp_path in temp_files:
|
370
|
-
with contextlib.suppress(OSError):
|
371
|
-
Path(temp_path).unlink()
|
296
|
+
return self._process_pdf_images_with_ocr_direct(images)
|
372
297
|
|
373
298
|
except Exception as e:
|
374
299
|
raise ParsingError(f"Failed to OCR PDF: {e}") from e
|
@@ -378,7 +303,6 @@ class PDFExtractor(Extractor):
|
|
378
303
|
pdf.close()
|
379
304
|
|
380
305
|
def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
|
381
|
-
"""Process PDF images with the configured OCR backend."""
|
382
306
|
backend = get_ocr_backend(self.config.ocr_backend)
|
383
307
|
paths = [Path(p) for p in image_paths]
|
384
308
|
|
@@ -401,18 +325,47 @@ class PDFExtractor(Extractor):
|
|
401
325
|
case _:
|
402
326
|
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
403
327
|
|
404
|
-
|
328
|
+
return "\n\n".join(result.content for result in results)
|
329
|
+
|
330
|
+
def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
|
331
|
+
backend = get_ocr_backend(self.config.ocr_backend)
|
332
|
+
|
333
|
+
match self.config.ocr_backend:
|
334
|
+
case "tesseract":
|
335
|
+
config = (
|
336
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
337
|
+
)
|
338
|
+
results = []
|
339
|
+
for image in images:
|
340
|
+
result = backend.process_image_sync(image, **asdict(config))
|
341
|
+
results.append(result)
|
342
|
+
case "paddleocr":
|
343
|
+
paddle_config = (
|
344
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
345
|
+
)
|
346
|
+
results = []
|
347
|
+
for image in images:
|
348
|
+
result = backend.process_image_sync(image, **asdict(paddle_config))
|
349
|
+
results.append(result)
|
350
|
+
case "easyocr":
|
351
|
+
easy_config = (
|
352
|
+
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
353
|
+
)
|
354
|
+
results = []
|
355
|
+
for image in images:
|
356
|
+
result = backend.process_image_sync(image, **asdict(easy_config))
|
357
|
+
results.append(result)
|
358
|
+
case _:
|
359
|
+
raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
|
360
|
+
|
405
361
|
return "\n\n".join(result.content for result in results)
|
406
362
|
|
407
363
|
def _parse_with_password_attempts(self, content: bytes) -> Document:
|
408
|
-
"""Parse PDF with password attempts."""
|
409
|
-
# Normalize password to list
|
410
364
|
if isinstance(self.config.pdf_password, str):
|
411
365
|
passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
|
412
366
|
else:
|
413
367
|
passwords = list(self.config.pdf_password)
|
414
368
|
|
415
|
-
# Try each password in sequence
|
416
369
|
last_exception = None
|
417
370
|
for password in passwords:
|
418
371
|
try:
|
@@ -421,21 +374,17 @@ class PDFExtractor(Extractor):
|
|
421
374
|
last_exception = e
|
422
375
|
continue
|
423
376
|
|
424
|
-
# If all passwords failed, raise the last exception
|
425
377
|
if last_exception:
|
426
378
|
raise last_exception from None
|
427
379
|
|
428
|
-
# Fallback to no password
|
429
380
|
return parse(content, max_workers=1, password="")
|
430
381
|
|
431
382
|
def _get_passwords_to_try(self) -> list[str]:
|
432
|
-
"""Get list of passwords to try in sequence."""
|
433
383
|
if isinstance(self.config.pdf_password, str):
|
434
384
|
return [self.config.pdf_password] if self.config.pdf_password else [""]
|
435
385
|
return list(self.config.pdf_password) if self.config.pdf_password else [""]
|
436
386
|
|
437
387
|
async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
|
438
|
-
"""Extract PDF metadata with password attempts."""
|
439
388
|
passwords = self._get_passwords_to_try()
|
440
389
|
|
441
390
|
last_exception = None
|
@@ -446,7 +395,6 @@ class PDFExtractor(Extractor):
|
|
446
395
|
last_exception = e
|
447
396
|
continue
|
448
397
|
|
449
|
-
# If all passwords failed, try with empty password as fallback
|
450
398
|
try:
|
451
399
|
return await extract_pdf_metadata(content, password="")
|
452
400
|
except Exception:
|
@@ -455,7 +403,6 @@ class PDFExtractor(Extractor):
|
|
455
403
|
raise
|
456
404
|
|
457
405
|
def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
|
458
|
-
"""Extract PDF metadata with password attempts (sync version)."""
|
459
406
|
passwords = self._get_passwords_to_try()
|
460
407
|
|
461
408
|
last_exception = None
|
@@ -466,7 +413,6 @@ class PDFExtractor(Extractor):
|
|
466
413
|
last_exception = e
|
467
414
|
continue
|
468
415
|
|
469
|
-
# If all passwords failed, try with empty password as fallback
|
470
416
|
try:
|
471
417
|
return extract_pdf_metadata_sync(content, password="")
|
472
418
|
except Exception:
|
@@ -475,12 +421,10 @@ class PDFExtractor(Extractor):
|
|
475
421
|
raise
|
476
422
|
|
477
423
|
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
478
|
-
"""Extract text using playa for better structure preservation."""
|
479
424
|
with contextlib.suppress(Exception):
|
480
425
|
content = path.read_bytes()
|
481
426
|
document = self._parse_with_password_attempts(content)
|
482
427
|
|
483
|
-
# Extract text while preserving structure
|
484
428
|
pages_text = []
|
485
429
|
for page in document.pages:
|
486
430
|
page_text = page.extract_text()
|