PyPI - kreuzberg - Versions diffs - 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl - Mend

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

kreuzberg/__init__.py +14 -13
kreuzberg/__main__.py +0 -2
kreuzberg/_api/main.py +119 -9
kreuzberg/_chunker.py +0 -15
kreuzberg/_config.py +212 -292
kreuzberg/_document_classification.py +20 -47
kreuzberg/_entity_extraction.py +1 -122
kreuzberg/_extractors/_base.py +4 -71
kreuzberg/_extractors/_email.py +1 -15
kreuzberg/_extractors/_html.py +9 -12
kreuzberg/_extractors/_image.py +1 -25
kreuzberg/_extractors/_pandoc.py +10 -147
kreuzberg/_extractors/_pdf.py +38 -94
kreuzberg/_extractors/_presentation.py +0 -99
kreuzberg/_extractors/_spread_sheet.py +13 -55
kreuzberg/_extractors/_structured.py +1 -4
kreuzberg/_gmft.py +14 -199
kreuzberg/_language_detection.py +1 -36
kreuzberg/_mcp/__init__.py +0 -2
kreuzberg/_mcp/server.py +3 -10
kreuzberg/_mime_types.py +1 -19
kreuzberg/_ocr/_base.py +4 -76
kreuzberg/_ocr/_easyocr.py +124 -186
kreuzberg/_ocr/_paddleocr.py +154 -224
kreuzberg/_ocr/_table_extractor.py +184 -0
kreuzberg/_ocr/_tesseract.py +797 -361
kreuzberg/_playa.py +5 -31
kreuzberg/_registry.py +0 -36
kreuzberg/_types.py +588 -93
kreuzberg/_utils/_cache.py +84 -138
kreuzberg/_utils/_device.py +0 -74
kreuzberg/_utils/_document_cache.py +0 -75
kreuzberg/_utils/_errors.py +0 -50
kreuzberg/_utils/_ocr_cache.py +136 -0
kreuzberg/_utils/_pdf_lock.py +0 -16
kreuzberg/_utils/_process_pool.py +17 -64
kreuzberg/_utils/_quality.py +0 -60
kreuzberg/_utils/_ref.py +32 -0
kreuzberg/_utils/_serialization.py +0 -30
kreuzberg/_utils/_string.py +9 -59
kreuzberg/_utils/_sync.py +0 -77
kreuzberg/_utils/_table.py +49 -101
kreuzberg/_utils/_tmp.py +0 -9
kreuzberg/cli.py +54 -74
kreuzberg/extraction.py +39 -32
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
kreuzberg-3.13.1.dist-info/RECORD +57 -0
kreuzberg-3.11.4.dist-info/RECORD +0 -54
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
{kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_extractors/_pandoc.py CHANGED Viewed

@@ -84,8 +84,6 @@ NodeType = Literal[
 class PandocExtractor(Extractor):
-    """Extractor for documents supported by Pandoc."""
     _checked_version: bool = False
     MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
     }
     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Extract text and metadata from bytes content using Pandoc.
-        Args:
-            content: The content bytes to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         input_file, unlink = await create_temp_file(f".{extension}")
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
             await unlink()
     async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Extract text and metadata from a file using Pandoc.
-        Args:
-            path: The path to the file to process.
-        Raises:
-            ParsingError: If the file data could not be extracted.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         await self._validate_pandoc_version()
         self._get_pandoc_type_from_mime_type(self.mime_type)
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes.
-        Args:
-            content: The content bytes to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        """
         extension = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path.
-        Args:
-            path: The path to the file to process.
-        Returns:
-            ExtractionResult with the extracted text and metadata.
-        Raises:
-            ParsingError: When file processing fails.
-        """
         self._validate_pandoc_version_sync()
         self._get_pandoc_type_from_mime_type(self.mime_type)
@@ -244,18 +204,13 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
     async def _validate_pandoc_version(self) -> None:
-        """Validate that the installed Pandoc version meets the minimum requirement.
-        Raises:
-            MissingDependencyError: If Pandoc is not installed or version is too low
-        """
         try:
             if self._checked_version:
                 return
             command = ["pandoc", "--version"]
             result = await run_process(command)
-            stdout = result.stdout.decode()
+            stdout = result.stdout.decode("utf-8")
             version_match = re.search(
                 r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
@@ -299,14 +254,6 @@ class PandocExtractor(Extractor):
     @staticmethod
     def _get_pandoc_key(key: str) -> str | None:
-        """Map Pandoc metadata keys to our standard metadata keys.
-        Args:
-            key: The key from Pandoc metadata
-        Returns:
-            The mapped key name for our system, or None if not mapped
-        """
         if key == "abstract":
             return "summary"
@@ -325,17 +272,6 @@ class PandocExtractor(Extractor):
         return key
     def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
-        """Get Pandoc format type from MIME type.
-        Args:
-            mime_type: The MIME type to look up
-        Returns:
-            The corresponding Pandoc type
-        Raises:
-            ValidationError: If mime_type is not supported
-        """
         if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
             return pandoc_type
@@ -349,17 +285,6 @@ class PandocExtractor(Extractor):
         raise ValidationError(f"Unsupported mime type: {mime_type}")
     async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
-        """Extract metadata from a file using Pandoc.
-        Args:
-            input_file: The file to extract metadata from
-        Returns:
-            The extracted metadata
-        Raises:
-            ParsingError: If metadata extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         metadata_file, unlink = await create_temp_file(".json")
         try:
@@ -389,17 +314,6 @@ class PandocExtractor(Extractor):
             await unlink()
     async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
-        """Extract text content from a file using Pandoc.
-        Args:
-            input_file: The file to extract content from
-        Returns:
-            The extracted text content
-        Raises:
-            ParsingError: If content extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         output_path, unlink = await create_temp_file(".md")
         try:
@@ -431,14 +345,6 @@ class PandocExtractor(Extractor):
             await unlink()
     def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
-        """Extract structured metadata from Pandoc JSON metadata.
-        Args:
-            raw_meta: The raw metadata from Pandoc
-        Returns:
-            Structured metadata
-        """
         meta: Metadata = {}
         if (
@@ -485,16 +391,6 @@ class PandocExtractor(Extractor):
         return meta
     def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
-        """Extract text from an inline node in a document structure.
-        Args:
-            node: The node to extract text from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         if node_type := node.get(type_field):
             if node_type == "Str":
                 return node.get(content_field)
@@ -505,29 +401,11 @@ class PandocExtractor(Extractor):
         return None
     def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
-        """Extract text from a list of inline nodes.
-        Args:
-            nodes: The list of nodes to extract text from
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         texts = [text for node in nodes if (text := self._extract_inline_text(node))]
         result = "".join(texts).strip()
         return result if result else None
     def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
-        """Extract a metadata value from a node.
-        Args:
-            node: The node to extract metadata from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-        Returns:
-            The extracted metadata value or None if no metadata could be extracted
-        """
         if not isinstance(node, dict) or type_field not in node:
             return None
@@ -577,12 +455,17 @@ class PandocExtractor(Extractor):
         return None
     def _validate_pandoc_version_sync(self) -> None:
-        """Synchronous version of _validate_pandoc_version."""
         try:
             if self._checked_version:
                 return
-            result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
+            result = subprocess.run(
+                ["pandoc", "--version"],  # noqa: S607
+                capture_output=True,
+                text=True,
+                check=False,
+                encoding="utf-8",
+            )
             if result.returncode != 0:
                 raise MissingDependencyError(
@@ -621,7 +504,6 @@ class PandocExtractor(Extractor):
             ) from e
     def _extract_metadata_sync(self, path: Path) -> Metadata:
-        """Synchronous version of _handle_extract_metadata."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -638,7 +520,7 @@ class PandocExtractor(Extractor):
                 str(metadata_file),
             ]
-            result = subprocess.run(command, capture_output=True, text=True, check=False)
+            result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
             if result.returncode != 0:
                 raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -655,7 +537,6 @@ class PandocExtractor(Extractor):
                 Path(metadata_file).unlink()
     def _extract_file_sync(self, path: Path) -> str:
-        """Synchronous version of _handle_extract_file."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
@@ -673,7 +554,7 @@ class PandocExtractor(Extractor):
                 str(output_path),
             ]
-            result = subprocess.run(command, capture_output=True, text=True, check=False)
+            result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
             if result.returncode != 0:
                 raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -691,8 +572,6 @@ class PandocExtractor(Extractor):
 class MarkdownExtractor(PandocExtractor):
-    """Extractor for Markdown-based document formats."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/x-markdown",
         "text/x-commonmark",
@@ -704,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
 class OfficeDocumentExtractor(PandocExtractor):
-    """Extractor for Office document formats (Word, ODT)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         "application/vnd.oasis.opendocument.text",
@@ -713,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
 class EbookExtractor(PandocExtractor):
-    """Extractor for e-book formats (EPUB, FB2)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/epub+zip",
         "application/x-fictionbook+xml",
@@ -722,8 +597,6 @@ class EbookExtractor(PandocExtractor):
 class StructuredTextExtractor(PandocExtractor):
-    """Extractor for structured text formats (RST, Org, etc.)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/x-rst",
         "text/x-org",
@@ -733,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
 class LaTeXExtractor(PandocExtractor):
-    """Extractor for LaTeX and Typst documents."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/x-latex",
         "application/x-typst",
@@ -742,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
 class BibliographyExtractor(PandocExtractor):
-    """Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/x-bibtex",
         "application/x-biblatex",
@@ -754,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
 class XMLBasedExtractor(PandocExtractor):
-    """Extractor for XML-based document formats (DocBook, JATS, OPML)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/docbook+xml",
         "application/x-jats+xml",
@@ -764,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
 class TabularDataExtractor(PandocExtractor):
-    """Extractor for tabular data formats (CSV, TSV)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "text/csv",
         "text/tab-separated-values",
@@ -773,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
 class MiscFormatExtractor(PandocExtractor):
-    """Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
         "application/rtf",
         "text/troff",

kreuzberg/_extractors/_pdf.py CHANGED Viewed

@@ -18,11 +18,8 @@ from playa import parse
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
-from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
+from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
 from kreuzberg._utils._errors import create_error_context, should_retry
 from kreuzberg._utils._pdf_lock import pypdfium_file_lock
 from kreuzberg._utils._string import normalize_spaces
@@ -65,7 +62,6 @@ class PDFExtractor(Extractor):
                 if self._validate_extracted_text(content):
                     result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
             except ParsingError:
-                # If searchable text extraction fails, continue to OCR or empty result
                 pass
         if not result and self.config.ocr_backend is not None:
@@ -77,7 +73,7 @@ class PDFExtractor(Extractor):
         result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
@@ -85,7 +81,6 @@ class PDFExtractor(Extractor):
             except ImportError:  # pragma: no cover
                 result.tables = []
-            # Enhance metadata with table information
             if result.tables:
                 table_summary = generate_table_summary(result.tables)
                 result.metadata = result.metadata | {
@@ -98,7 +93,6 @@ class PDFExtractor(Extractor):
         return self._apply_quality_processing(result)
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of PDF extraction from bytes."""
         fd, temp_path = tempfile.mkstemp(suffix=".pdf")
         try:
             with os.fdopen(fd, "wb") as f:
@@ -115,7 +109,6 @@ class PDFExtractor(Extractor):
                 Path(temp_path).unlink()
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of PDF extraction from path."""
         try:
             text = self._extract_pdf_searchable_text_sync(path)
         except ParsingError:
@@ -126,7 +119,7 @@ class PDFExtractor(Extractor):
         tables = []
         if self.config.extract_tables:
-            # GMFT is optional dependency
+            # GMFT is optional dependency ~keep
             try:
                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
@@ -134,7 +127,6 @@ class PDFExtractor(Extractor):
             except ImportError:
                 tables = []
-        # Use playa for better text structure preservation when not using OCR
         if not self.config.force_ocr and self._validate_extracted_text(text):
             text = self._extract_with_playa_sync(path, fallback_text=text)
@@ -148,7 +140,6 @@ class PDFExtractor(Extractor):
             chunks=[],
         )
-        # Enhance metadata with table information
         if tables:
             table_summary = generate_table_summary(tables)
             result.metadata = result.metadata | {
@@ -158,25 +149,9 @@ class PDFExtractor(Extractor):
                 f"{table_summary['total_rows']} total rows",
             }
-        # Apply quality processing
         return self._apply_quality_processing(result)
     def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
-        """Check if text extracted from PDF is valid or corrupted.
-        This checks for indicators of corrupted PDF text extraction:
-        1. Empty or whitespace-only text
-        2. High concentration of control characters and null bytes
-        3. High concentration of Unicode replacement characters
-        Args:
-            text: The extracted text to validate
-            corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
-                characters (default: 0.05 or 5%)
-        Returns:
-            True if the text appears valid, False if it seems corrupted
-        """
         if not text or not text.strip():
             return False
@@ -188,17 +163,6 @@ class PDFExtractor(Extractor):
         return (len(corruption_matches) / len(text)) < corruption_threshold
     async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
-        """Convert a PDF file to images.
-        Args:
-            input_file: The path to the PDF file.
-        Raises:
-            ParsingError: If the PDF file could not be converted to images.
-        Returns:
-            A list of Pillow Images.
-        """
         document: pypdfium2.PdfDocument | None = None
         last_error = None
@@ -206,7 +170,7 @@ class PDFExtractor(Extractor):
             try:
                 with pypdfium_file_lock(input_file):
                     document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-                    return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
+                    return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
             except pypdfium2.PdfiumError as e:  # noqa: PERF203
                 last_error = e
                 if not should_retry(e, attempt + 1):
@@ -238,39 +202,18 @@ class PDFExtractor(Extractor):
         ) from last_error
     async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
-        """Extract text from a scanned PDF file using OCR.
-        Args:
-            input_file: The path to the PDF file.
-            ocr_backend: The OCR backend to use.
-        Returns:
-            The extraction result with text content and metadata.
-        """
         images = await self._convert_pdf_to_images(input_file)
         backend = get_ocr_backend(ocr_backend)
         ocr_results = await run_taskgroup_batched(
             *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
             batch_size=cpu_count(),
         )
-        # Use list comprehension and join for efficient string building
         content = "\n".join(result.content for result in ocr_results)
         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
     @staticmethod
     async def _extract_pdf_searchable_text(input_file: Path) -> str:
-        """Extract text from a searchable PDF file using pypdfium2.
-        Args:
-            input_file: The path to the PDF file.
-        Raises:
-            ParsingError: If the text could not be extracted from the PDF file.
-        Returns:
-            The extracted text.
-        """
         document: pypdfium2.PdfDocument | None = None
         try:
             with pypdfium_file_lock(input_file):
@@ -318,7 +261,6 @@ class PDFExtractor(Extractor):
                     await run_sync(document.close)
     def _extract_pdf_searchable_text_sync(self, path: Path) -> str:
-        """Extract searchable text from PDF using pypdfium2 (sync version)."""
         pdf = None
         try:
             with pypdfium_file_lock(path):
@@ -339,7 +281,6 @@ class PDFExtractor(Extractor):
                     pdf.close()
     def _extract_pdf_with_ocr_sync(self, path: Path) -> str:
-        """Extract text from PDF using OCR (sync version)."""
         pdf = None
         try:
             images = []
@@ -352,23 +293,7 @@ class PDFExtractor(Extractor):
                     bitmap.close()
                     page.close()
-            image_paths = []
-            temp_files = []
-            try:
-                for i, img in enumerate(images):
-                    fd, temp_path = tempfile.mkstemp(suffix=f"_page_{i}.png")
-                    temp_files.append((fd, temp_path))
-                    img.save(temp_path, format="PNG")
-                    os.close(fd)
-                    image_paths.append(temp_path)
-                return self._process_pdf_images_with_ocr(image_paths)
-            finally:
-                for _, temp_path in temp_files:
-                    with contextlib.suppress(OSError):
-                        Path(temp_path).unlink()
+            return self._process_pdf_images_with_ocr_direct(images)
         except Exception as e:
             raise ParsingError(f"Failed to OCR PDF: {e}") from e
@@ -378,7 +303,6 @@ class PDFExtractor(Extractor):
                     pdf.close()
     def _process_pdf_images_with_ocr(self, image_paths: list[str]) -> str:
-        """Process PDF images with the configured OCR backend."""
         backend = get_ocr_backend(self.config.ocr_backend)
         paths = [Path(p) for p in image_paths]
@@ -401,18 +325,47 @@ class PDFExtractor(Extractor):
             case _:
                 raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
-        # Use list comprehension and join for efficient string building
+        return "\n\n".join(result.content for result in results)
+    def _process_pdf_images_with_ocr_direct(self, images: list[Image]) -> str:
+        backend = get_ocr_backend(self.config.ocr_backend)
+        match self.config.ocr_backend:
+            case "tesseract":
+                config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(config))
+                    results.append(result)
+            case "paddleocr":
+                paddle_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(paddle_config))
+                    results.append(result)
+            case "easyocr":
+                easy_config = (
+                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+                )
+                results = []
+                for image in images:
+                    result = backend.process_image_sync(image, **asdict(easy_config))
+                    results.append(result)
+            case _:
+                raise NotImplementedError(f"Direct image OCR not implemented for {self.config.ocr_backend}")
         return "\n\n".join(result.content for result in results)
     def _parse_with_password_attempts(self, content: bytes) -> Document:
-        """Parse PDF with password attempts."""
-        # Normalize password to list
         if isinstance(self.config.pdf_password, str):
             passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
         else:
             passwords = list(self.config.pdf_password)
-        # Try each password in sequence
         last_exception = None
         for password in passwords:
             try:
@@ -421,21 +374,17 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue
-        # If all passwords failed, raise the last exception
         if last_exception:
             raise last_exception from None
-        # Fallback to no password
         return parse(content, max_workers=1, password="")
     def _get_passwords_to_try(self) -> list[str]:
-        """Get list of passwords to try in sequence."""
         if isinstance(self.config.pdf_password, str):
             return [self.config.pdf_password] if self.config.pdf_password else [""]
         return list(self.config.pdf_password) if self.config.pdf_password else [""]
     async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts."""
         passwords = self._get_passwords_to_try()
         last_exception = None
@@ -446,7 +395,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue
-        # If all passwords failed, try with empty password as fallback
         try:
             return await extract_pdf_metadata(content, password="")
         except Exception:
@@ -455,7 +403,6 @@ class PDFExtractor(Extractor):
             raise
     def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
-        """Extract PDF metadata with password attempts (sync version)."""
         passwords = self._get_passwords_to_try()
         last_exception = None
@@ -466,7 +413,6 @@ class PDFExtractor(Extractor):
                 last_exception = e
                 continue
-        # If all passwords failed, try with empty password as fallback
         try:
             return extract_pdf_metadata_sync(content, password="")
         except Exception:
@@ -475,12 +421,10 @@ class PDFExtractor(Extractor):
             raise
     def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
-        """Extract text using playa for better structure preservation."""
         with contextlib.suppress(Exception):
             content = path.read_bytes()
             document = self._parse_with_password_attempts(content)
-            # Extract text while preserving structure
             pages_text = []
             for page in document.pages:
                 page_text = page.extract_text()

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl