PyPI - kreuzberg - Versions diffs - 3.9.1__tar.gz → 3.10.1__tar.gz - Mend

kreuzberg 3.9.1__tar.gz → 3.10.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (239)

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/.github/workflows/ci.yaml RENAMED Viewed

@@ -51,12 +51,103 @@ jobs:
       - name: Execute Pre-Commit
         run: uv run pre-commit run --show-diff-on-failure --color=always --all-files
+  # Coverage job runs first, only on Python 3.13 Ubuntu
+  coverage:
+    needs: validate
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install uv
+        uses: astral-sh/setup-uv@v6
+        with:
+          enable-cache: true
+      - name: Install Python
+        uses: actions/setup-python@v5
+        id: setup-python
+        with:
+          python-version: "3.13"
+      - name: Cache Python Dependencies
+        id: python-cache
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cache/uv
+            .venv
+          key: python-dependencies-ubuntu-latest-3.13-${{ hashFiles('uv.lock') }}
+          restore-keys: |
+            python-dependencies-ubuntu-latest-3.13-
+      - name: Install Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            uv sync --all-packages --all-extras --dev
+          shell: bash
+      - name: Install System Dependencies
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 5
+          max_attempts: 3
+          retry_wait_seconds: 30
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y tesseract-ocr tesseract-ocr-deu pandoc
+          shell: bash
+      - name: Run Tests with Coverage
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          retry_wait_seconds: 10
+          command: |
+            uv run coverage erase
+            uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml --reruns 2 --reruns-delay 1
+            uv run coverage report --precision=2
+          shell: bash
+      - name: Upload Coverage to DeepSource
+        if: always() && github.event_name == 'push'
+        env:
+          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
+        run: |
+          # Install DeepSource CLI
+          curl -fsSL https://deepsource.io/cli | sh
+          # Upload coverage report
+          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov
+      - name: Upload Coverage Artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report-${{ github.sha }}
+          path: |
+            coverage.lcov
+            .coverage
+          retention-days: 7
+  # Full test matrix runs only after coverage succeeds
   test:
+    needs: coverage
+    runs-on: ${{ matrix.os }}
     strategy:
+      fail-fast: false
       matrix:
-        os: [ ubuntu-latest, macOS-latest, windows-latest ]
-        python: ${{ github.event_name == 'pull_request' && fromJSON('["3.13"]') || fromJSON('["3.10", "3.11", "3.12", "3.13"]') }}
-    runs-on: ${{ matrix.os }}
+        os: [ubuntu-latest, windows-latest, macos-latest]
+        python: ["3.10", "3.11", "3.12", "3.13"]
+        exclude:
+          # Skip Python 3.13 on macOS for now due to compatibility issues
+          - os: macos-latest
+            python: "3.13"
     timeout-minutes: 30
     steps:
       - name: Checkout
@@ -146,52 +237,12 @@ jobs:
             pandoc --version
           shell: pwsh
-      - name: Clean Coverage Data
-        run: |
-          rm -f .coverage .coverage.* coverage.lcov htmlcov/* || true
-        shell: bash
-      - name: Run Tests with Coverage
-        run: |
-          uv run coverage erase
-          uv run pytest -s -vvv --cov=kreuzberg --cov-report=lcov:coverage.lcov --cov-report=term --cov-config=pyproject.toml
-      - name: Upload Coverage Artifacts
-        if: matrix.os == 'ubuntu-latest' && matrix.python == '3.13'
-        uses: actions/upload-artifact@v4
-        with:
-          name: coverage-report
-          path: coverage.lcov
-          retention-days: 1
-  upload-coverage:
-    needs: test
-    runs-on: ubuntu-latest
-    if: github.event_name == 'push' || github.event_name == 'pull_request'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
-      - name: Download Coverage Artifacts
-        uses: actions/download-artifact@v4
-        with:
-          name: coverage-report
-          path: .
-      - name: Install DeepSource CLI
+      - name: Run Tests (without coverage)
         uses: nick-fields/retry@v3
         with:
-          timeout_minutes: 3
+          timeout_minutes: 15
           max_attempts: 3
           retry_wait_seconds: 10
           command: |
-            curl -fsSL https://deepsource.io/cli | sh
+            uv run pytest -s -vvv --reruns 2 --reruns-delay 1
           shell: bash
-      - name: Upload Coverage to DeepSource
-        env:
-          DEEPSOURCE_DSN: ${{ secrets.DEEPSOURCE_DSN }}
-        run: |
-          ./bin/deepsource report --analyzer test-coverage --key python --value-file ./coverage.lcov

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/.pre-commit-config.yaml RENAMED Viewed

@@ -53,7 +53,7 @@ repos:
     hooks:
       - id: pyproject-fmt
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.5
+    rev: v0.12.7
     hooks:
       - id: ruff
         args: ["--fix", "--unsafe-fixes"]

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.9.1
+Version: 3.10.1
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
+Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
@@ -69,6 +70,8 @@ Provides-Extra: cli
 Requires-Dist: click>=8.2.1; extra == 'cli'
 Requires-Dist: rich>=14.1.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
+Provides-Extra: crypto
+Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
 Provides-Extra: entity-extraction

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/ai-rulez.yaml RENAMED Viewed

@@ -193,16 +193,18 @@ rules:
       api = ["litestar[standard,structlog,opentelemetry]>=2.1.6"]
       cli = ["click>=8.2.1", "rich>=14.0.0", "tomli>=2.0.0; python_version<'3.11'"]
       chunking = ["semantic-text-splitter>=0.27.0"]
+      crypto = ["playa-pdf[crypto]>=0.6.4"]
       easyocr = ["easyocr>=1.7.2"]
       gmft = ["gmft>=0.4.2"]
       langdetect = ["fast-langdetect>=0.2.0"]
       paddleocr = ["paddleocr>=3.1.0", "paddlepaddle>=3.1.0", "setuptools>=80.9.0"]
-      all = ["kreuzberg[api,chunking,cli,easyocr,gmft,langdetect,paddleocr]"]
+      all = ["kreuzberg[api,chunking,cli,crypto,easyocr,gmft,langdetect,paddleocr]"]
       ```
       ### Installation Patterns
       - Basic: `pip install kreuzberg`
       - With features: `pip install "kreuzberg[api,cli]"`
+      - With crypto support: `pip install "kreuzberg[crypto]"`
       - All features: `pip install "kreuzberg[all]"`
       - Development: `uv sync --all-extras`
@@ -211,6 +213,14 @@ rules:
       - **System**: tesseract-ocr, pandoc (via package manager)
       - **Development**: Uses dependency groups in pyproject.toml
+      ### Crypto Support
+      The `crypto` extra adds cryptographic support for PDF processing:
+      - **Purpose**: Enables AES encryption/decryption for password-protected PDFs
+      - **Dependencies**: Adds cryptography (~22MB), cffi, and pycparser
+      - **Usage**: Required for PDFs with AES encryption (RC4 is supported in base installation)
+      - **Password Support**: Supports single password or list of passwords to try in sequence
+      - **Size Impact**: Increases installation size by ~24MB due to cryptography package
 sections:
   - title: "Language Detection"
     content: |

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/benchmark_baseline.py RENAMED Viewed

@@ -108,7 +108,7 @@ async def run_baseline_benchmark() -> dict[str, object] | None:
     return results  # type: ignore[return-value]
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     baseline_results = asyncio.run(run_baseline_benchmark())
     baseline_file = Path("baseline_results.json")

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/end_to_end_benchmark.py RENAMED Viewed

@@ -195,7 +195,7 @@ async def run_end_to_end_benchmark(trials: int = 20) -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 REPRODUCIBLE CACHE BENCHMARK")
     print("Testing msgpack implementation with statistical rigor...")
     print()

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/src/kreuzberg_benchmarks/benchmarks.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Core benchmark implementations comparing sync vs async performance."""
+# mypy: disable-error-code=unused-ignore
 from __future__ import annotations

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/benchmarks/statistical_benchmark.py RENAMED Viewed

@@ -187,7 +187,7 @@ async def run_statistical_benchmark() -> dict[str, Any]:
     }
-if __name__ == "__main__":
+if __name__ == "__main__":  # pragma: no cover
     print("🧪 STATISTICAL CACHE BENCHMARK")
     print("Testing msgpack implementation with proper error analysis...")
     print()

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_api/main.py RENAMED Viewed

@@ -30,7 +30,7 @@ try:
         HTTP_422_UNPROCESSABLE_ENTITY,
         HTTP_500_INTERNAL_SERVER_ERROR,
     )
-except ImportError as e:
+except ImportError as e:  # pragma: no cover
     raise MissingDependencyError.create_for_package(
         dependency_group="litestar",
         functionality="Litestar API and docker container",

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_chunker.py RENAMED Viewed

@@ -43,7 +43,7 @@ def get_chunker(
                 from semantic_text_splitter import TextSplitter  # noqa: PLC0415
                 _chunkers[key] = TextSplitter(max_characters, overlap_characters)
-        except ImportError as e:
+        except ImportError as e:  # pragma: no cover
             raise MissingDependencyError.create_for_package(
                 dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
             ) from e

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_config.py RENAMED Viewed

@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
 if sys.version_info >= (3, 11):
     import tomllib
-else:
+else:  # pragma: no cover
     import tomli as tomllib  # type: ignore[import-not-found]
 from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
     # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
     if config_path.name == "kreuzberg.toml":
         return data  # type: ignore[no-any-return]
-    return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # For other files, check if they have [tool.kreuzberg] section
+    if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
+        return data.get("tool", {}).get("kreuzberg", {})  # type: ignore[no-any-return]
+    # Otherwise assume root-level configuration
+    return data  # type: ignore[no-any-return]
 def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
         "extract_keywords",
         "auto_detect_language",
         "enable_quality_processing",
+        "auto_detect_document_type",
+        "document_type_confidence_threshold",
+        "document_classification_mode",
+        "keyword_count",
     }
     extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
     # Handle OCR backend configuration
     ocr_backend = extraction_config.get("ocr_backend")
     if ocr_backend and ocr_backend != "none":
+        # Validate OCR backend
+        valid_backends = {"tesseract", "easyocr", "paddleocr"}
+        if ocr_backend not in valid_backends:
+            raise ValidationError(
+                f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
+                context={"provided": ocr_backend, "valid": sorted(valid_backends)},
+            )
         ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
         if ocr_config:
             extraction_config["ocr_config"] = ocr_config
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
     "extract_keywords",
     "auto_detect_language",
     "enable_quality_processing",
+    "auto_detect_document_type",
+    "document_type_confidence_threshold",
+    "document_classification_mode",
+    "keyword_count",
 ]

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_document_classification.py RENAMED Viewed

@@ -4,13 +4,12 @@ import re
 from typing import TYPE_CHECKING
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
-    from kreuzberg._types import ExtractionConfig, ExtractionResult
 DOCUMENT_CLASSIFIERS = {
     "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
     Raises:
         MissingDependencyError: If the deep-translator package is not installed
     """
+    # Combine content with metadata for classification
+    text_to_classify = result.content
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
     try:
         from deep_translator import GoogleTranslator  # noqa: PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError(
             "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
         ) from e
-    return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
+    try:
+        return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        return text_to_classify.lower()
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
+    if not config.auto_detect_document_type:
+        return None, None
     translated_text = _get_translated_text(result)
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
@@ -108,7 +121,8 @@ def classify_document_from_layout(
         A tuple containing the detected document type and the confidence score,
         or (None, None) if no type is detected with sufficient confidence.
     """
-    translated_text = _get_translated_text(result)
+    if not config.auto_detect_document_type:
+        return None, None
     if result.layout is None or result.layout.empty:
         return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
+    # Use layout text for classification, not the content
+    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    # Translate layout text directly for classification
+    text_to_classify = layout_text
+    if result.metadata:
+        # Add metadata values to the text for classification
+        metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
+        text_to_classify = f"{text_to_classify} {metadata_text}"
+    try:
+        from deep_translator import GoogleTranslator  # noqa: PLC0415
+        translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
+    except Exception:  # noqa: BLE001
+        # Fall back to original content in lowercase if translation fails
+        translated_text = text_to_classify.lower()
     layout_df["translated_text"] = translated_text
     page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
+    elif result.layout is not None and not result.layout.empty:
+        # Use layout-based classification if layout data is available
+        result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
     return result

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_entity_extraction.py RENAMED Viewed

@@ -139,7 +139,7 @@ def extract_entities(
     try:
         import spacy  # noqa: F401, PLC0415
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="spacy",
             dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
         return [(kw, float(score)) for kw, score in keywords]
     except (RuntimeError, OSError, ValueError):
         return []
-    except ImportError as e:
+    except ImportError as e:  # pragma: no cover
         raise MissingDependencyError.create_for_package(
             package_name="keybert",
             dependency_group="entity-extraction",

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_email.py RENAMED Viewed

@@ -19,12 +19,12 @@ if TYPE_CHECKING:
 # Import optional dependencies at module level with proper error handling
 try:
     import mailparse
-except ImportError:
+except ImportError:  # pragma: no cover
     mailparse = None
 try:
     import html2text  # type: ignore[import-not-found]
-except ImportError:
+except ImportError:  # pragma: no cover
     html2text = None
 # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
         to_info = parsed_email.get("to")
         if to_info:
+            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
+                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+                metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
-                to_email = to_info.get("email", "")
+                metadata["email_to"] = to_info.get("email", "")
             else:
-                to_email = str(to_info)
-            metadata["email_to"] = to_email
-            text_parts.append(f"To: {to_email}")
+                metadata["email_to"] = str(to_info)
+            # For display, format all recipients
+            to_formatted = self._format_email_field(to_info)
+            text_parts.append(f"To: {to_formatted}")
         date = parsed_email.get("date")
         if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
         cc = parsed_email.get("cc")
         if cc:
             metadata["email_cc"] = cc
-            text_parts.append(f"CC: {cc}")
+            cc_formatted = self._format_email_field(cc)
+            text_parts.append(f"CC: {cc_formatted}")
         bcc = parsed_email.get("bcc")
         if bcc:
             metadata["email_bcc"] = bcc
-            text_parts.append(f"BCC: {bcc}")
+            bcc_formatted = self._format_email_field(bcc)
+            text_parts.append(f"BCC: {bcc_formatted}")
+    def _format_email_field(self, field: Any) -> str:
+        """Format email field (to, cc, bcc) for display."""
+        if isinstance(field, list):
+            emails = []
+            for item in field:
+                if isinstance(item, dict):
+                    email = item.get("email", "")
+                    if email:
+                        emails.append(email)
+                else:
+                    emails.append(str(item))
+            return ", ".join(emails)
+        if isinstance(field, dict):
+            return str(field.get("email", ""))
+        return str(field)
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
         """Extract and process email body content."""

{kreuzberg-3.9.1 → kreuzberg-3.10.1}/kreuzberg/_extractors/_pdf.py RENAMED Viewed

@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import ExtractionResult, OcrBackendType
+from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
 if TYPE_CHECKING:  # pragma: no cover
     from PIL.Image import Image
+    from playa.document import Document
 class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
         file_path, unlink = await create_temp_file(".pdf")
         await AsyncPath(file_path).write_bytes(content)
         try:
-            metadata = await extract_pdf_metadata(content)
+            metadata = await self._extract_metadata_with_password_attempts(content)
             result = await self.extract_path_async(file_path)
             result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
         if not result:
             result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
-        result.metadata = await extract_pdf_metadata(content_bytes)
+        result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
         if self.config.extract_tables:
             # GMFT is optional dependency
@@ -81,7 +82,7 @@ class PDFExtractor(Extractor):
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
                 result.tables = await extract_tables(path, self.config.gmft_config)
-            except ImportError:
+            except ImportError:  # pragma: no cover
                 result.tables = []
             # Enhance metadata with table information
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
             result = self.extract_path_sync(Path(temp_path))
-            metadata = extract_pdf_metadata_sync(content)
+            metadata = self._extract_metadata_with_password_attempts_sync(content)
             result.metadata = metadata
             return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
         # Use list comprehension and join for efficient string building
         return "\n\n".join(result.content for result in results)
+    def _parse_with_password_attempts(self, content: bytes) -> Document:
+        """Parse PDF with password attempts."""
+        # Normalize password to list
+        if isinstance(self.config.pdf_password, str):
+            passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
+        else:
+            passwords = list(self.config.pdf_password)
+        # Try each password in sequence
+        last_exception = None
+        for password in passwords:
+            try:
+                return parse(content, max_workers=1, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, raise the last exception
+        if last_exception:
+            raise last_exception from None
+        # Fallback to no password
+        return parse(content, max_workers=1, password="")
+    def _get_passwords_to_try(self) -> list[str]:
+        """Get list of passwords to try in sequence."""
+        if isinstance(self.config.pdf_password, str):
+            return [self.config.pdf_password] if self.config.pdf_password else [""]
+        return list(self.config.pdf_password) if self.config.pdf_password else [""]
+    async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
+        """Extract PDF metadata with password attempts."""
+        passwords = self._get_passwords_to_try()
+        last_exception = None
+        for password in passwords:
+            try:
+                return await extract_pdf_metadata(content, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, try with empty password as fallback
+        try:
+            return await extract_pdf_metadata(content, password="")
+        except Exception:
+            if last_exception:
+                raise last_exception from None
+            raise
+    def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
+        """Extract PDF metadata with password attempts (sync version)."""
+        passwords = self._get_passwords_to_try()
+        last_exception = None
+        for password in passwords:
+            try:
+                return extract_pdf_metadata_sync(content, password=password)
+            except Exception as e:  # noqa: PERF203, BLE001
+                last_exception = e
+                continue
+        # If all passwords failed, try with empty password as fallback
+        try:
+            return extract_pdf_metadata_sync(content, password="")
+        except Exception:
+            if last_exception:
+                raise last_exception from None
+            raise
     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
         """Extract text using playa for better structure preservation."""
         with contextlib.suppress(Exception):
             content = path.read_bytes()
-            document = parse(content, max_workers=1)
+            document = self._parse_with_password_attempts(content)
             # Extract text while preserving structure
             pages_text = []

kreuzberg 3.9.1__tar.gz → 3.10.1__tar.gz

kreuzberg 3.9.1__tar.gz → 3.10.1__tar.gz