PyPI - kreuzberg - Versions diffs - 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl - Mend

kreuzberg 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (8)

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
 if TYPE_CHECKING:  # pragma: no cover
     from PIL.Image import Image
+    from playa.document import Document
 class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
         file_path, unlink = await create_temp_file(".pdf")
         await AsyncPath(file_path).write_bytes(content)
         try:
-            metadata = await extract_pdf_metadata(content)
+            metadata = await self._extract_metadata_with_password_attempts(content)
             result = await self.extract_path_async(file_path)
             result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
         if not result:
             result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
-        result.metadata = await extract_pdf_metadata(content_bytes)
+        result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
         if self.config.extract_tables:
             # GMFT is optional dependency
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
             result = self.extract_path_sync(Path(temp_path))
-            metadata = extract_pdf_metadata_sync(content)
+            metadata = self._extract_metadata_with_password_attempts_sync(content)
             result.metadata = metadata
             return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
         # Use list comprehension and join for efficient string building
         return "\n\n".join(result.content for result in results)
+    def _parse_with_password_attempts(self, content: bytes) -> Document:
+        """Parse PDF with password attempts."""
+        # Normalize password to list
+        if isinstance(self.config.pdf_password, str):
+            passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
+        else:
+            passwords = list(self.config.pdf_password)
+        # Try each password in sequence
+        last_exception = None
+        for password in passwords:
+            try:
+                return parse(content, max_workers=1, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, raise the last exception
+        if last_exception:
+            raise last_exception from None
+        # Fallback to no password
+        return parse(content, max_workers=1, password="")
+    def _get_passwords_to_try(self) -> list[str]:
+        """Get list of passwords to try in sequence."""
+        if isinstance(self.config.pdf_password, str):
+            return [self.config.pdf_password] if self.config.pdf_password else [""]
+        return list(self.config.pdf_password) if self.config.pdf_password else [""]
+    async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
+        """Extract PDF metadata with password attempts."""
+        passwords = self._get_passwords_to_try()
+        last_exception = None
+        for password in passwords:
+            try:
+                return await extract_pdf_metadata(content, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, try with empty password as fallback
+        try:
+            return await extract_pdf_metadata(content, password="")
+        except Exception:
+            if last_exception:
+                raise last_exception from None
+            raise
+    def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
+        """Extract PDF metadata with password attempts (sync version)."""
+        passwords = self._get_passwords_to_try()
+        last_exception = None
+        for password in passwords:
+            try:
+                return extract_pdf_metadata_sync(content, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, try with empty password as fallback
+        try:
+            return extract_pdf_metadata_sync(content, password="")
+        except Exception:
+            if last_exception:
+                raise last_exception from None
+            raise
     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
         """Extract text using playa for better structure preservation."""
         with contextlib.suppress(Exception):
             content = path.read_bytes()
-            document = parse(content, max_workers=1)
+            document = self._parse_with_password_attempts(content)
             # Extract text while preserving structure
             pages_text = []

kreuzberg/_playa.py CHANGED Viewed

@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
 BOM_CHAR = "\ufeff"
-async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
+async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
     """Extract metadata from a PDF document.
     Args:
         pdf_content: The bytes of the PDF document.
+        password: Password for encrypted PDF files.
     Raises:
         ParsingError: If the PDF metadata could not be extracted.
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
         A dictionary of metadata extracted from the PDF.
     """
     try:
-        document = parse(pdf_content, max_workers=1)
+        document = parse(pdf_content, max_workers=1, password=password)
         metadata: Metadata = {}
         for raw_info in document.info:
@@ -275,13 +276,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
             result["subtitle"] = subtitle
-def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
+def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
     """Synchronous version of extract_pdf_metadata.
     Extract metadata from a PDF document without using async/await.
     Args:
         pdf_content: The bytes of the PDF document.
+        password: Password for encrypted PDF files.
     Raises:
         ParsingError: If the PDF metadata could not be extracted.
@@ -290,7 +292,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
         A dictionary of metadata extracted from the PDF.
     """
     try:
-        document = parse(pdf_content, max_workers=1)
+        document = parse(pdf_content, max_workers=1, password=password)
         metadata: Metadata = {}
         for raw_info in document.info:

kreuzberg/_types.py CHANGED Viewed

@@ -357,6 +357,8 @@ class ExtractionConfig:
     """The mode to use for document classification."""
     enable_quality_processing: bool = True
     """Whether to apply quality post-processing to improve extraction results."""
+    pdf_password: str | list[str] = ""
+    """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
     def __post_init__(self) -> None:
         if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):

{kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.9.0
+Version: 3.10.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -29,12 +29,12 @@ Classifier: Topic :: Text Processing :: General
 Classifier: Typing :: Typed
 Requires-Python: >=3.10
 Requires-Dist: anyio>=4.9.0
-Requires-Dist: chardetng-py>=0.3.4
+Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.8.0
-Requires-Dist: mcp>=1.11.0
+Requires-Dist: html-to-markdown[lxml]>=1.9.0
+Requires-Dist: mcp>=1.12.2
 Requires-Dist: msgspec>=0.18.0
-Requires-Dist: playa-pdf>=0.6.1
+Requires-Dist: playa-pdf>=0.6.4
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
 Requires-Dist: python-calamine>=0.3.2
@@ -53,7 +53,8 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
-Requires-Dist: rich>=14.0.0; extra == 'all'
+Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
+Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
@@ -67,8 +68,10 @@ Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
 Requires-Dist: click>=8.2.1; extra == 'cli'
-Requires-Dist: rich>=14.0.0; extra == 'cli'
+Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
+Provides-Extra: crypto
+Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction
@@ -130,14 +133,14 @@ Kreuzberg leverages established open source technologies:
 ### Extract Text with CLI
 ```bash
-# Extract text from any file to markdown
-uvx kreuzberg extract document.pdf > output.md
+# Extract text from any file to text format
+uvx kreuzberg extract document.pdf > output.txt
 # With all features (OCR, table extraction, etc.)
-uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr --format markdown
+uvx --from "kreuzberg[all]" kreuzberg extract invoice.pdf --ocr-backend tesseract --output-format text
 # Extract with rich metadata
-uvx kreuzberg extract report.pdf --show-metadata --format json
+uvx kreuzberg extract report.pdf --show-metadata --output-format json
 ```
 ### Python Usage

{kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/RECORD RENAMED Viewed

@@ -8,9 +8,9 @@ kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR
 kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
 kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
 kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
-kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
+kreuzberg/_playa.py,sha256=cJ000ZPHRhbpbP7odRuzMKn38teR6RbodoHgksbfjGE,12059
 kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
-kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
+kreuzberg/_types.py,sha256=ecT2dRg7dr06p7Dxv23YJ7Ur2m4FUCt6xGtuoS7MQaI,15259
 kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
 kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
 kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
@@ -23,7 +23,7 @@ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO
 kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
 kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
 kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
-kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
+kreuzberg/_extractors/_pdf.py,sha256=pn45qKYkMcmG-PzeeF5jRjrw1NwaKU3589dhpn7HvE8,19918
 kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
 kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
 kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
 kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
 kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
 kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
-kreuzberg-3.9.0.dist-info/METADATA,sha256=C83JYzqxhGHhrqWDUmo0eJwK_2szx9ZQt3cnkocgwBY,11876
-kreuzberg-3.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-kreuzberg-3.9.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
-kreuzberg-3.9.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-3.9.0.dist-info/RECORD,,
+kreuzberg-3.10.0.dist-info/METADATA,sha256=4U1mSEAbT3zRir--SPZmYy09LfEfu5vUz6CUhQL8uzA,12047
+kreuzberg-3.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.10.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.10.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.10.0.dist-info/RECORD,,

{kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{kreuzberg-3.9.0.dist-info → kreuzberg-3.10.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

kreuzberg 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl

kreuzberg 3.9.0__py3-none-any.whl → 3.10.0__py3-none-any.whl