kreuzberg 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_pdf.py +76 -5
- kreuzberg/_playa.py +6 -4
- kreuzberg/_types.py +2 -0
- {kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/METADATA +14 -11
- {kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/RECORD +8 -8
- {kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_pdf.py
CHANGED
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
|
|
22
22
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
23
23
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
24
24
|
from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
|
25
|
-
from kreuzberg._types import ExtractionResult, OcrBackendType
|
25
|
+
from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
|
26
26
|
from kreuzberg._utils._errors import create_error_context, should_retry
|
27
27
|
from kreuzberg._utils._pdf_lock import pypdfium_file_lock
|
28
28
|
from kreuzberg._utils._string import normalize_spaces
|
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
|
|
33
33
|
|
34
34
|
if TYPE_CHECKING: # pragma: no cover
|
35
35
|
from PIL.Image import Image
|
36
|
+
from playa.document import Document
|
36
37
|
|
37
38
|
|
38
39
|
class PDFExtractor(Extractor):
|
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
|
|
45
46
|
file_path, unlink = await create_temp_file(".pdf")
|
46
47
|
await AsyncPath(file_path).write_bytes(content)
|
47
48
|
try:
|
48
|
-
metadata = await
|
49
|
+
metadata = await self._extract_metadata_with_password_attempts(content)
|
49
50
|
result = await self.extract_path_async(file_path)
|
50
51
|
|
51
52
|
result.metadata = metadata
|
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
|
|
73
74
|
if not result:
|
74
75
|
result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
75
76
|
|
76
|
-
result.metadata = await
|
77
|
+
result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
|
77
78
|
|
78
79
|
if self.config.extract_tables:
|
79
80
|
# GMFT is optional dependency
|
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
|
|
107
108
|
|
108
109
|
result = self.extract_path_sync(Path(temp_path))
|
109
110
|
|
110
|
-
metadata =
|
111
|
+
metadata = self._extract_metadata_with_password_attempts_sync(content)
|
111
112
|
result.metadata = metadata
|
112
113
|
|
113
114
|
return result
|
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
|
|
406
407
|
# Use list comprehension and join for efficient string building
|
407
408
|
return "\n\n".join(result.content for result in results)
|
408
409
|
|
410
|
+
def _parse_with_password_attempts(self, content: bytes) -> Document:
    """Parse a PDF, trying each configured password in sequence.

    The candidate passwords come from ``_get_passwords_to_try`` so that this
    method and the metadata helpers normalise ``config.pdf_password`` in
    exactly the same way (a single string, a list, or nothing at all).

    Args:
        content: The raw bytes of the PDF document.

    Returns:
        The document produced by the first ``parse`` call that succeeds.

    Raises:
        Exception: Whatever ``parse`` raised for the last password attempted,
            re-raised without exception chaining when every attempt failed.
    """
    last_exception: Exception | None = None
    for password in self._get_passwords_to_try():
        try:
            return parse(content, max_workers=1, password=password)
        except Exception as e:  # noqa: PERF203, BLE001
            # Remember the failure and move on to the next candidate.
            last_exception = e

    # Every candidate failed: surface the most recent error.
    if last_exception is not None:
        raise last_exception from None

    # Defensive fallback, only reachable if no candidate was produced at all.
    return parse(content, max_workers=1, password="")
|
433
|
+
|
434
|
+
def _get_passwords_to_try(self) -> list[str]:
|
435
|
+
"""Get list of passwords to try in sequence."""
|
436
|
+
if isinstance(self.config.pdf_password, str):
|
437
|
+
return [self.config.pdf_password] if self.config.pdf_password else [""]
|
438
|
+
return list(self.config.pdf_password) if self.config.pdf_password else [""]
|
439
|
+
|
440
|
+
async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
    """Extract PDF metadata, trying each configured password in sequence.

    Each candidate from ``_get_passwords_to_try`` is attempted in order. If
    all of them fail and the empty password was not among them, one final
    attempt is made with the empty password.

    Args:
        content: The raw bytes of the PDF document.

    Returns:
        The metadata from the first extraction attempt that succeeds.

    Raises:
        Exception: The error raised for the last configured password when
            every attempt (including the empty-password fallback) failed.
    """
    passwords = self._get_passwords_to_try()

    last_exception: Exception | None = None
    for password in passwords:
        try:
            return await extract_pdf_metadata(content, password=password)
        except Exception as e:  # noqa: PERF203, BLE001
            # Remember the failure and move on to the next candidate.
            last_exception = e

    # The empty password has already been tried and failed; repeating the
    # attempt would only parse the document a second time for the same error.
    if last_exception is not None and "" in passwords:
        raise last_exception from None

    # Final fallback: try the empty password.
    try:
        return await extract_pdf_metadata(content, password="")
    except Exception:
        # Prefer the error from the configured passwords over the fallback's.
        if last_exception is not None:
            raise last_exception from None
        raise
|
459
|
+
|
460
|
+
def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
    """Extract PDF metadata with password attempts (sync version).

    Each candidate from ``_get_passwords_to_try`` is attempted in order. If
    all of them fail and the empty password was not among them, one final
    attempt is made with the empty password.

    Args:
        content: The raw bytes of the PDF document.

    Returns:
        The metadata from the first extraction attempt that succeeds.

    Raises:
        Exception: The error raised for the last configured password when
            every attempt (including the empty-password fallback) failed.
    """
    passwords = self._get_passwords_to_try()

    last_exception: Exception | None = None
    for password in passwords:
        try:
            return extract_pdf_metadata_sync(content, password=password)
        except Exception as e:  # noqa: PERF203, BLE001
            # Remember the failure and move on to the next candidate.
            last_exception = e

    # The empty password has already been tried and failed; repeating the
    # attempt would only parse the document a second time for the same error.
    if last_exception is not None and "" in passwords:
        raise last_exception from None

    # Final fallback: try the empty password.
    try:
        return extract_pdf_metadata_sync(content, password="")
    except Exception:
        # Prefer the error from the configured passwords over the fallback's.
        if last_exception is not None:
            raise last_exception from None
        raise
|
479
|
+
|
409
480
|
def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
|
410
481
|
"""Extract text using playa for better structure preservation."""
|
411
482
|
with contextlib.suppress(Exception):
|
412
483
|
content = path.read_bytes()
|
413
|
-
document =
|
484
|
+
document = self._parse_with_password_attempts(content)
|
414
485
|
|
415
486
|
# Extract text while preserving structure
|
416
487
|
pages_text = []
|
kreuzberg/_playa.py
CHANGED
@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
|
|
24
24
|
BOM_CHAR = "\ufeff"
|
25
25
|
|
26
26
|
|
27
|
-
async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
|
27
|
+
async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
|
28
28
|
"""Extract metadata from a PDF document.
|
29
29
|
|
30
30
|
Args:
|
31
31
|
pdf_content: The bytes of the PDF document.
|
32
|
+
password: Password for encrypted PDF files.
|
32
33
|
|
33
34
|
Raises:
|
34
35
|
ParsingError: If the PDF metadata could not be extracted.
|
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
|
|
37
38
|
A dictionary of metadata extracted from the PDF.
|
38
39
|
"""
|
39
40
|
try:
|
40
|
-
document = parse(pdf_content, max_workers=1)
|
41
|
+
document = parse(pdf_content, max_workers=1, password=password)
|
41
42
|
metadata: Metadata = {}
|
42
43
|
|
43
44
|
for raw_info in document.info:
|
@@ -275,13 +276,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
275
276
|
result["subtitle"] = subtitle
|
276
277
|
|
277
278
|
|
278
|
-
def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
|
279
|
+
def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
|
279
280
|
"""Synchronous version of extract_pdf_metadata.
|
280
281
|
|
281
282
|
Extract metadata from a PDF document without using async/await.
|
282
283
|
|
283
284
|
Args:
|
284
285
|
pdf_content: The bytes of the PDF document.
|
286
|
+
password: Password for encrypted PDF files.
|
285
287
|
|
286
288
|
Raises:
|
287
289
|
ParsingError: If the PDF metadata could not be extracted.
|
@@ -290,7 +292,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
|
|
290
292
|
A dictionary of metadata extracted from the PDF.
|
291
293
|
"""
|
292
294
|
try:
|
293
|
-
document = parse(pdf_content, max_workers=1)
|
295
|
+
document = parse(pdf_content, max_workers=1, password=password)
|
294
296
|
metadata: Metadata = {}
|
295
297
|
|
296
298
|
for raw_info in document.info:
|
kreuzberg/_types.py
CHANGED
@@ -357,6 +357,8 @@ class ExtractionConfig:
|
|
357
357
|
"""The mode to use for document classification."""
|
358
358
|
enable_quality_processing: bool = True
|
359
359
|
"""Whether to apply quality post-processing to improve extraction results."""
|
360
|
+
pdf_password: str | list[str] = ""
|
361
|
+
"""Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
|
360
362
|
|
361
363
|
def __post_init__(self) -> None:
|
362
364
|
if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.9.0
|
3
|
+
Version: 3.10.0
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
|
|
29
29
|
Classifier: Typing :: Typed
|
30
30
|
Requires-Python: >=3.10
|
31
31
|
Requires-Dist: anyio>=4.9.0
|
32
|
-
Requires-Dist: chardetng-py>=0.3.
|
32
|
+
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
|
-
Requires-Dist: html-to-markdown[lxml]>=1.
|
35
|
-
Requires-Dist: mcp>=1.
|
34
|
+
Requires-Dist: html-to-markdown[lxml]>=1.9.0
|
35
|
+
Requires-Dist: mcp>=1.12.2
|
36
36
|
Requires-Dist: msgspec>=0.18.0
|
37
|
-
Requires-Dist: playa-pdf>=0.6.
|
37
|
+
Requires-Dist: playa-pdf>=0.6.4
|
38
38
|
Requires-Dist: psutil>=7.0.0
|
39
39
|
Requires-Dist: pypdfium2==4.30.0
|
40
40
|
Requires-Dist: python-calamine>=0.3.2
|
@@ -53,7 +53,8 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
|
|
53
53
|
Requires-Dist: mailparse>=1.0.15; extra == 'all'
|
54
54
|
Requires-Dist: paddleocr>=3.1.0; extra == 'all'
|
55
55
|
Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
|
56
|
-
Requires-Dist:
|
56
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
|
57
|
+
Requires-Dist: rich>=14.1.0; extra == 'all'
|
57
58
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
|
58
59
|
Requires-Dist: setuptools>=80.9.0; extra == 'all'
|
59
60
|
Requires-Dist: spacy>=3.8.7; extra == 'all'
|
@@ -67,8 +68,10 @@ Provides-Extra: chunking
|
|
67
68
|
Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
|
68
69
|
Provides-Extra: cli
|
69
70
|
Requires-Dist: click>=8.2.1; extra == 'cli'
|
70
|
-
Requires-Dist: rich>=14.
|
71
|
+
Requires-Dist: rich>=14.1.0; extra == 'cli'
|
71
72
|
Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
|
73
|
+
Provides-Extra: crypto
|
74
|
+
Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
|
72
75
|
Provides-Extra: easyocr
|
73
76
|
Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
|
74
77
|
Provides-Extra: entity-extraction
|
@@ -130,14 +133,14 @@ Kreuzberg leverages established open source technologies:
|
|
130
133
|
### Extract Text with CLI
|
131
134
|
|
132
135
|
```bash
|
133
|
-
# Extract text from any file to
|
134
|
-
uvx kreuzberg extract document.pdf > output.
|
136
|
+
# Extract text from any file to text format
|
137
|
+
uvx kreuzberg extract document.pdf > output.txt
|
135
138
|
|
136
139
|
# With all features (OCR, table extraction, etc.)
|
137
|
-
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format
|
140
|
+
uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
|
138
141
|
|
139
142
|
# Extract with rich metadata
|
140
|
-
uvx kreuzberg extract report.pdf --show-metadata --format json
|
143
|
+
uvx kreuzberg extract report.pdf --show-metadata --output-format json
|
141
144
|
```
|
142
145
|
|
143
146
|
### Python Usage
|
@@ -8,9 +8,9 @@ kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR
|
|
8
8
|
kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
|
9
9
|
kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
|
10
10
|
kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
|
11
|
-
kreuzberg/_playa.py,sha256=
|
11
|
+
kreuzberg/_playa.py,sha256=cJ000ZPHRhbpbP7odRuzMKn38teR6RbodoHgksbfjGE,12059
|
12
12
|
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
13
|
-
kreuzberg/_types.py,sha256=
|
13
|
+
kreuzberg/_types.py,sha256=ecT2dRg7dr06p7Dxv23YJ7Ur2m4FUCt6xGtuoS7MQaI,15259
|
14
14
|
kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
|
15
15
|
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
16
|
kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
|
@@ -23,7 +23,7 @@ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO
|
|
23
23
|
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
24
24
|
kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
|
25
25
|
kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
|
26
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
26
|
+
kreuzberg/_extractors/_pdf.py,sha256=pn45qKYkMcmG-PzeeF5jRjrw1NwaKU3589dhpn7HvE8,19918
|
27
27
|
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
28
28
|
kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
|
29
29
|
kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
|
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
|
|
47
47
|
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
48
48
|
kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
|
49
49
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
50
|
-
kreuzberg-3.
|
51
|
-
kreuzberg-3.
|
52
|
-
kreuzberg-3.
|
53
|
-
kreuzberg-3.
|
54
|
-
kreuzberg-3.
|
50
|
+
kreuzberg-3.10.0.dist-info/METADATA,sha256=4U1mSEAbT3zRir--SPZmYy09LfEfu5vUz6CUhQL8uzA,12047
|
51
|
+
kreuzberg-3.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
52
|
+
kreuzberg-3.10.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
53
|
+
kreuzberg-3.10.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
54
|
+
kreuzberg-3.10.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|