kreuzberg 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
22
22
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
23
  from kreuzberg._ocr._tesseract import TesseractConfig
24
24
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
25
- from kreuzberg._types import ExtractionResult, OcrBackendType
25
+ from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
26
26
  from kreuzberg._utils._errors import create_error_context, should_retry
27
27
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
28
28
  from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
33
33
 
34
34
  if TYPE_CHECKING: # pragma: no cover
35
35
  from PIL.Image import Image
36
+ from playa.document import Document
36
37
 
37
38
 
38
39
  class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
45
46
  file_path, unlink = await create_temp_file(".pdf")
46
47
  await AsyncPath(file_path).write_bytes(content)
47
48
  try:
48
- metadata = await extract_pdf_metadata(content)
49
+ metadata = await self._extract_metadata_with_password_attempts(content)
49
50
  result = await self.extract_path_async(file_path)
50
51
 
51
52
  result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
73
74
  if not result:
74
75
  result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
75
76
 
76
- result.metadata = await extract_pdf_metadata(content_bytes)
77
+ result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
77
78
 
78
79
  if self.config.extract_tables:
79
80
  # GMFT is optional dependency
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
107
108
 
108
109
  result = self.extract_path_sync(Path(temp_path))
109
110
 
110
- metadata = extract_pdf_metadata_sync(content)
111
+ metadata = self._extract_metadata_with_password_attempts_sync(content)
111
112
  result.metadata = metadata
112
113
 
113
114
  return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
406
407
  # Use list comprehension and join for efficient string building
407
408
  return "\n\n".join(result.content for result in results)
408
409
 
410
+ def _parse_with_password_attempts(self, content: bytes) -> Document:
411
+ """Parse PDF with password attempts."""
412
+ # Normalize password to list
413
+ if isinstance(self.config.pdf_password, str):
414
+ passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
415
+ else:
416
+ passwords = list(self.config.pdf_password)
417
+
418
+ # Try each password in sequence
419
+ last_exception = None
420
+ for password in passwords:
421
+ try:
422
+ return parse(content, max_workers=1, password=password)
423
+ except Exception as e: # noqa: PERF203, BLE001
424
+ last_exception = e
425
+ continue
426
+
427
+ # If all passwords failed, raise the last exception
428
+ if last_exception:
429
+ raise last_exception from None
430
+
431
+ # Fallback to no password
432
+ return parse(content, max_workers=1, password="")
433
+
434
+ def _get_passwords_to_try(self) -> list[str]:
435
+ """Get list of passwords to try in sequence."""
436
+ if isinstance(self.config.pdf_password, str):
437
+ return [self.config.pdf_password] if self.config.pdf_password else [""]
438
+ return list(self.config.pdf_password) if self.config.pdf_password else [""]
439
+
440
+ async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
441
+ """Extract PDF metadata with password attempts."""
442
+ passwords = self._get_passwords_to_try()
443
+
444
+ last_exception = None
445
+ for password in passwords:
446
+ try:
447
+ return await extract_pdf_metadata(content, password=password)
448
+ except Exception as e: # noqa: PERF203, BLE001
449
+ last_exception = e
450
+ continue
451
+
452
+ # If all passwords failed, try with empty password as fallback
453
+ try:
454
+ return await extract_pdf_metadata(content, password="")
455
+ except Exception:
456
+ if last_exception:
457
+ raise last_exception from None
458
+ raise
459
+
460
+ def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
461
+ """Extract PDF metadata with password attempts (sync version)."""
462
+ passwords = self._get_passwords_to_try()
463
+
464
+ last_exception = None
465
+ for password in passwords:
466
+ try:
467
+ return extract_pdf_metadata_sync(content, password=password)
468
+ except Exception as e: # noqa: PERF203, BLE001
469
+ last_exception = e
470
+ continue
471
+
472
+ # If all passwords failed, try with empty password as fallback
473
+ try:
474
+ return extract_pdf_metadata_sync(content, password="")
475
+ except Exception:
476
+ if last_exception:
477
+ raise last_exception from None
478
+ raise
479
+
409
480
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
410
481
  """Extract text using playa for better structure preservation."""
411
482
  with contextlib.suppress(Exception):
412
483
  content = path.read_bytes()
413
- document = parse(content, max_workers=1)
484
+ document = self._parse_with_password_attempts(content)
414
485
 
415
486
  # Extract text while preserving structure
416
487
  pages_text = []
kreuzberg/_playa.py CHANGED
@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
24
24
  BOM_CHAR = "\ufeff"
25
25
 
26
26
 
27
- async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
27
+ async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
28
28
  """Extract metadata from a PDF document.
29
29
 
30
30
  Args:
31
31
  pdf_content: The bytes of the PDF document.
32
+ password: Password for encrypted PDF files.
32
33
 
33
34
  Raises:
34
35
  ParsingError: If the PDF metadata could not be extracted.
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
37
38
  A dictionary of metadata extracted from the PDF.
38
39
  """
39
40
  try:
40
- document = parse(pdf_content, max_workers=1)
41
+ document = parse(pdf_content, max_workers=1, password=password)
41
42
  metadata: Metadata = {}
42
43
 
43
44
  for raw_info in document.info:
@@ -275,13 +276,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
275
276
  result["subtitle"] = subtitle
276
277
 
277
278
 
278
- def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
279
+ def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
279
280
  """Synchronous version of extract_pdf_metadata.
280
281
 
281
282
  Extract metadata from a PDF document without using async/await.
282
283
 
283
284
  Args:
284
285
  pdf_content: The bytes of the PDF document.
286
+ password: Password for encrypted PDF files.
285
287
 
286
288
  Raises:
287
289
  ParsingError: If the PDF metadata could not be extracted.
@@ -290,7 +292,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
290
292
  A dictionary of metadata extracted from the PDF.
291
293
  """
292
294
  try:
293
- document = parse(pdf_content, max_workers=1)
295
+ document = parse(pdf_content, max_workers=1, password=password)
294
296
  metadata: Metadata = {}
295
297
 
296
298
  for raw_info in document.info:
kreuzberg/_types.py CHANGED
@@ -357,6 +357,8 @@ class ExtractionConfig:
357
357
  """The mode to use for document classification."""
358
358
  enable_quality_processing: bool = True
359
359
  """Whether to apply quality post-processing to improve extraction results."""
360
+ pdf_password: str | list[str] = ""
361
+ """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
360
362
 
361
363
  def __post_init__(self) -> None:
362
364
  if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.9.0
3
+ Version: 3.10.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
29
29
  Classifier: Typing :: Typed
30
30
  Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.9.0
32
- Requires-Dist: chardetng-py>=0.3.4
32
+ Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.8.0
35
- Requires-Dist: mcp>=1.11.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
+ Requires-Dist: mcp>=1.12.2
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: playa-pdf>=0.6.1
37
+ Requires-Dist: playa-pdf>=0.6.4
38
38
  Requires-Dist: psutil>=7.0.0
39
39
  Requires-Dist: pypdfium2==4.30.0
40
40
  Requires-Dist: python-calamine>=0.3.2
@@ -53,7 +53,8 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
53
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
54
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
55
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
- Requires-Dist: rich>=14.0.0; extra == 'all'
56
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
57
+ Requires-Dist: rich>=14.1.0; extra == 'all'
57
58
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
59
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
59
60
  Requires-Dist: spacy>=3.8.7; extra == 'all'
@@ -67,8 +68,10 @@ Provides-Extra: chunking
67
68
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
68
69
  Provides-Extra: cli
69
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
70
- Requires-Dist: rich>=14.0.0; extra == 'cli'
71
+ Requires-Dist: rich>=14.1.0; extra == 'cli'
71
72
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
+ Provides-Extra: crypto
74
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
72
75
  Provides-Extra: easyocr
73
76
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
74
77
  Provides-Extra: entity-extraction
@@ -130,14 +133,14 @@ Kreuzberg leverages established open source technologies:
130
133
  ### Extract Text with CLI
131
134
 
132
135
  ```bash
133
- # Extract text from any file to markdown
134
- uvx kreuzberg extract document.pdf > output.md
136
+ # Extract text from any file to text format
137
+ uvx kreuzberg extract document.pdf > output.txt
135
138
 
136
139
  # With all features (OCR, table extraction, etc.)
137
- uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
140
+ uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
138
141
 
139
142
  # Extract with rich metadata
140
- uvx kreuzberg extract report.pdf --show-metadata --format json
143
+ uvx kreuzberg extract report.pdf --show-metadata --output-format json
141
144
  ```
142
145
 
143
146
  ### Python Usage
@@ -8,9 +8,9 @@ kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR
8
8
  kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
9
9
  kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
10
10
  kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
11
- kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
11
+ kreuzberg/_playa.py,sha256=cJ000ZPHRhbpbP7odRuzMKn38teR6RbodoHgksbfjGE,12059
12
12
  kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
13
- kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
13
+ kreuzberg/_types.py,sha256=ecT2dRg7dr06p7Dxv23YJ7Ur2m4FUCt6xGtuoS7MQaI,15259
14
14
  kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
16
  kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
@@ -23,7 +23,7 @@ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO
23
23
  kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
24
24
  kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
25
25
  kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
26
- kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
26
+ kreuzberg/_extractors/_pdf.py,sha256=pn45qKYkMcmG-PzeeF5jRjrw1NwaKU3589dhpn7HvE8,19918
27
27
  kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
28
28
  kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
29
29
  kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
47
47
  kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
48
48
  kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
49
49
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
- kreuzberg-3.9.0.dist-info/METADATA,sha256=C83JYzqxhGHhrqWDUmo0eJwK_2szx9ZQt3cnkocgwBY,11876
51
- kreuzberg-3.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
- kreuzberg-3.9.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
- kreuzberg-3.9.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
- kreuzberg-3.9.0.dist-info/RECORD,,
50
+ kreuzberg-3.10.0.dist-info/METADATA,sha256=4U1mSEAbT3zRir--SPZmYy09LfEfu5vUz6CUhQL8uzA,12047
51
+ kreuzberg-3.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.10.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
+ kreuzberg-3.10.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.10.0.dist-info/RECORD,,