PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.4-py3-none-any.whl → 0.1.5-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +125 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +907 -513
natural_pdf/core/pdf.py +385 -287
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +708 -508
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/search/haystack_search_service.py CHANGED Viewed

@@ -1,27 +1,32 @@
 """Implementation of the SearchServiceProtocol using Haystack components."""
+import copy
 import logging
 import os
-from typing import List, Dict, Any, Optional, Union, Iterable
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Union
 from PIL import Image
-import copy
 # --- Haystack Imports ---
 try:
     import haystack
     from haystack import Pipeline
-    from haystack.dataclasses import Document as HaystackDocument
-    from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
-    from haystack.document_stores.in_memory import InMemoryDocumentStore
     from haystack.components.embedders import (
+        SentenceTransformersDocumentEmbedder,
         SentenceTransformersTextEmbedder,
-        SentenceTransformersDocumentEmbedder
     )
     # Import necessary retrievers, rankers etc. as needed for search()
-    from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever # For InMem
-    from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever # Use embedding retriever
+    from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever  # For InMem
+    from haystack.dataclasses import Document as HaystackDocument
+    from haystack.document_stores.in_memory import InMemoryDocumentStore
+    from haystack.document_stores.types import DocumentStore, DuplicatePolicy
+    from haystack_integrations.components.retrievers.chroma import (  # Use embedding retriever
+        ChromaEmbeddingRetriever,
+    )
+    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
     # Need Ranker if used
     try:
         from haystack.components.rankers import CohereRanker
@@ -39,7 +44,7 @@ except ImportError:
     SentenceTransformersDocumentEmbedder = None
     SentenceTransformersTextEmbedder = None
     InMemoryEmbeddingRetriever = None
-    ChromaEmbeddingRetriever = None # Fallback definition
+    ChromaEmbeddingRetriever = None  # Fallback definition
     CohereRanker = None
     Pipeline = None
     DuplicatePolicy = None
@@ -47,16 +52,27 @@ except ImportError:
 # --- ChromaDB Client Import (for management) ---
 try:
     import chromadb
     CHROMADB_AVAILABLE = True
 except ImportError:
     chromadb = None
     CHROMADB_AVAILABLE = False
+from .haystack_utils import HAS_HAYSTACK_EXTRAS  # <-- This is the canonical import
+from .search_options import (
+    BaseSearchOptions,
+    MultiModalSearchOptions,
+    SearchOptions,
+    TextSearchOptions,
+)
 # --- Local Imports ---
-from .search_service_protocol import SearchServiceProtocol, IndexConfigurationError, IndexExistsError
-from .search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
-from .search_service_protocol import Indexable
-from .haystack_utils import HAS_HAYSTACK_EXTRAS # <-- This is the canonical import
+from .search_service_protocol import (
+    Indexable,
+    IndexConfigurationError,
+    IndexExistsError,
+    SearchServiceProtocol,
+)
 # --- Logging ---
 logger = logging.getLogger(__name__)
@@ -65,6 +81,7 @@ logger = logging.getLogger(__name__)
 DEFAULT_PERSIST_PATH = "./natural_pdf_index"
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 class HaystackSearchService(SearchServiceProtocol):
     """
     Haystack-based implementation of the search service protocol.
@@ -77,9 +94,9 @@ class HaystackSearchService(SearchServiceProtocol):
     def __init__(
         self,
         collection_name: str,
-        persist: bool = False, # Store type configuration
+        persist: bool = False,  # Store type configuration
         default_persist_path: str = DEFAULT_PERSIST_PATH,
-        embedding_model: str = DEFAULT_EMBEDDING_MODEL # Renamed for clarity
+        embedding_model: str = DEFAULT_EMBEDDING_MODEL,  # Renamed for clarity
     ):
         """
         Initialize the service for a specific collection.
@@ -92,18 +109,24 @@ class HaystackSearchService(SearchServiceProtocol):
             embedding_model: The embedding model this service instance will use.
         """
         if not HAS_HAYSTACK_EXTRAS:
-            raise ImportError("HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]")
+            raise ImportError(
+                "HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
+            )
-        self.collection_name = collection_name # Store the collection name
-        self._persist = persist # Store the persistence type for this instance
+        self.collection_name = collection_name  # Store the collection name
+        self._persist = persist  # Store the persistence type for this instance
         self._default_persist_path = default_persist_path
-        self._embedding_model = embedding_model # Store the configured model
+        self._embedding_model = embedding_model  # Store the configured model
         # Dictionary to hold InMemoryDocumentStore instances if not persisting
-        self._in_memory_store: Optional[InMemoryDocumentStore] = None if persist else InMemoryDocumentStore()
-        self._chroma_store: Optional[ChromaDocumentStore] = None # Lazy load
+        self._in_memory_store: Optional[InMemoryDocumentStore] = (
+            None if persist else InMemoryDocumentStore()
+        )
+        self._chroma_store: Optional[ChromaDocumentStore] = None  # Lazy load
-        logger.info(f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'")
+        logger.info(
+            f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'"
+        )
     # --- Internal Helper Methods --- #
@@ -114,27 +137,34 @@ class HaystackSearchService(SearchServiceProtocol):
         # Use the instance's configured persistence type and collection name
         if self._persist:
             if self._chroma_store is None:
-                 # Lazy load Chroma store
-                 logger.debug(f"Initializing ChromaDocumentStore for collection '{self.collection_name}'.")
-                 self._chroma_store = ChromaDocumentStore(
-                     persist_path=self._default_persist_path,
-                     collection_name=self.collection_name # Use instance name
-                 )
+                # Lazy load Chroma store
+                logger.debug(
+                    f"Initializing ChromaDocumentStore for collection '{self.collection_name}'."
+                )
+                self._chroma_store = ChromaDocumentStore(
+                    persist_path=self._default_persist_path,
+                    collection_name=self.collection_name,  # Use instance name
+                )
             return self._chroma_store
         else:
             # Return the instance's InMemory store
-            if self._in_memory_store is None: # Should have been created in __init__ if persist=False
-                 logger.warning(f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now.")
-                 self._in_memory_store = InMemoryDocumentStore()
+            if (
+                self._in_memory_store is None
+            ):  # Should have been created in __init__ if persist=False
+                logger.warning(
+                    f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
+                )
+                self._in_memory_store = InMemoryDocumentStore()
             return self._in_memory_store
     def _get_document_embedder(
-        self,
-        device: Optional[str] = None
+        self, device: Optional[str] = None
     ) -> SentenceTransformersDocumentEmbedder:
         """Creates the Haystack document embedder component."""
-        model_name = self._embedding_model # Use instance model
-        logger.debug(f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}")
+        model_name = self._embedding_model  # Use instance model
+        logger.debug(
+            f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
+        )
         if not SentenceTransformersDocumentEmbedder:
             raise ImportError("SentenceTransformersDocumentEmbedder is required but not available.")
         try:
@@ -143,56 +173,80 @@ class HaystackSearchService(SearchServiceProtocol):
                 device=device,
             )
             embedder.warm_up()
-            logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
+            logger.info(
+                f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
+            )
             return embedder
         except Exception as e:
-            logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
-            raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
+            logger.error(
+                f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
+            )
+            raise RuntimeError(
+                f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
+            ) from e
-    def _get_text_embedder(
-        self,
-        device: Optional[str] = None
-    ) -> SentenceTransformersTextEmbedder:
+    def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
         """Creates the Haystack text embedder component (for queries)."""
-        model_name = self._embedding_model # Use instance model
-        logger.debug(f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}")
+        model_name = self._embedding_model  # Use instance model
+        logger.debug(
+            f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
+        )
         if not SentenceTransformersTextEmbedder:
             raise ImportError("SentenceTransformersTextEmbedder is required but not available.")
         try:
             embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
-            embedder.warm_up()
-            logger.info(f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}")
+            embedder.warm_up()
+            logger.info(
+                f"Created SentenceTransformersTextEmbedder. Model: {model_name}, Device: {getattr(embedder, 'device', 'unknown')}"
+            )
             return embedder
         except Exception as e:
-            logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
-            raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
+            logger.error(
+                f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True
+            )
+            raise RuntimeError(
+                f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
+            ) from e
     def _delete_chroma_collection(self) -> bool:
         """Internal helper to delete the ChromaDB collection managed by this service."""
         if not CHROMADB_AVAILABLE:
-            logger.error("Cannot delete ChromaDB collection because 'chromadb' library is not installed.")
+            logger.error(
+                "Cannot delete ChromaDB collection because 'chromadb' library is not installed."
+            )
             raise ImportError("'chromadb' library required for collection deletion.")
         if not self._persist:
-             logger.warning("Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring.")
-             return False # Cannot delete if not persistent
+            logger.warning(
+                "Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring."
+            )
+            return False  # Cannot delete if not persistent
         try:
-            collection_name_to_delete = self.collection_name # Use instance collection name
-            logger.warning(f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'.")
+            collection_name_to_delete = self.collection_name  # Use instance collection name
+            logger.warning(
+                f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'."
+            )
             chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
             try:
                 chroma_client.delete_collection(name=collection_name_to_delete)
-                logger.info(f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'.")
-                self._chroma_store = None # Reset lazy-loaded store
+                logger.info(
+                    f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'."
+                )
+                self._chroma_store = None  # Reset lazy-loaded store
                 return True
             except chromadb.errors.InvalidCollectionException:
-                logger.info(f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed.")
-                return True # Deletion is effectively successful
+                logger.info(
+                    f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed."
+                )
+                return True  # Deletion is effectively successful
             finally:
-                pass # Cleanup if needed
+                pass  # Cleanup if needed
         except ImportError as ie:
             raise ie
         except Exception as e:
-            logger.error(f"Error during ChromaDB collection deletion '{self.collection_name}': {e}", exc_info=True)
+            logger.error(
+                f"Error during ChromaDB collection deletion '{self.collection_name}': {e}",
+                exc_info=True,
+            )
             # Don't raise here, let index() decide based on force_reindex
             return False
@@ -200,37 +254,43 @@ class HaystackSearchService(SearchServiceProtocol):
     def index(
         self,
-        documents: Iterable[Indexable], # Accept Indexable objects
+        documents: Iterable[Indexable],  # Accept Indexable objects
         embedder_device: Optional[str] = None,
         force_reindex: bool = False,
     ) -> None:
         # Need to consume the iterable to log count, or log differently
         # Let's convert to list for now, assuming size isn't prohibitive
         indexable_list = list(documents)
-        logger.info(f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}")
+        logger.info(
+            f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
+        )
         if not indexable_list:
-             logger.warning("No documents provided for indexing. Skipping.")
-             return
+            logger.warning("No documents provided for indexing. Skipping.")
+            return
         # --- 1. Handle Reindexing (Deletion before store/embedder init) ---
         if force_reindex:
-             logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
-             if self._persist:
-                 # Attempt deletion, raises ImportError if chromadb missing
-                 deleted = self._delete_chroma_collection() # Uses self.collection_name
-                 if not deleted:
-                      # If deletion failed for other reasons, log and continue cautiously
-                      logger.warning("Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere.")
-             else:
-                 # For InMemory, force_reindex means we want a fresh store instance.
-                 # Re-initialize the instance's in-memory store
-                 logger.info(f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'.")
-                 self._in_memory_store = InMemoryDocumentStore() # Create a new one
+            logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
+            if self._persist:
+                # Attempt deletion, raises ImportError if chromadb missing
+                deleted = self._delete_chroma_collection()  # Uses self.collection_name
+                if not deleted:
+                    # If deletion failed for other reasons, log and continue cautiously
+                    logger.warning(
+                        "Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
+                    )
+            else:
+                # For InMemory, force_reindex means we want a fresh store instance.
+                # Re-initialize the instance's in-memory store
+                logger.info(
+                    f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'."
+                )
+                self._in_memory_store = InMemoryDocumentStore()  # Create a new one
         # REMOVED try...except around store retrieval
         # Let store initialization errors propagate directly
-        store = self._get_store() # No argument needed
+        store = self._get_store()  # No argument needed
         # --- 3. Create Embedder ---
         # Errors during embedder creation will propagate from the helper
@@ -243,42 +303,55 @@ class HaystackSearchService(SearchServiceProtocol):
         for item in indexable_list:
             doc_id = item.get_id()
             metadata = item.get_metadata()
-            content_obj = item.get_content() # This might be Page, Region, etc.
+            content_obj = item.get_content()  # This might be Page, Region, etc.
             # Determine content based on embedder type and content object
             # For now, assume text content is needed and try to extract it
             content_text = ""
             if isinstance(content_obj, str):
-                 # If get_content() already returned text
-                 content_text = content_obj
-            elif hasattr(content_obj, 'extract_text') and callable(getattr(content_obj, 'extract_text')):
-                 # If content object has extract_text (like Page or Region)
-                 try:
-                     content_text = content_obj.extract_text()
-                     if not isinstance(content_text, str):
-                         logger.warning(f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str().")
-                         content_text = str(content_obj)
-                 except Exception as extraction_error:
-                     logger.error(f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().", exc_info=False)
-                     content_text = str(content_obj)
+                # If get_content() already returned text
+                content_text = content_obj
+            elif hasattr(content_obj, "extract_text") and callable(
+                getattr(content_obj, "extract_text")
+            ):
+                # If content object has extract_text (like Page or Region)
+                try:
+                    content_text = content_obj.extract_text()
+                    if not isinstance(content_text, str):
+                        logger.warning(
+                            f"extract_text() on {type(content_obj)} did not return a string for doc '{doc_id}'. Using str()."
+                        )
+                        content_text = str(content_obj)
+                except Exception as extraction_error:
+                    logger.error(
+                        f"Error calling extract_text() on {type(content_obj)} for doc '{doc_id}': {extraction_error}. Using str().",
+                        exc_info=False,
+                    )
+                    content_text = str(content_obj)
             else:
-                 # Attempt to convert to string as fallback if no obvious text method
-                 logger.warning(f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str().")
-                 content_text = str(content_obj)
+                # Attempt to convert to string as fallback if no obvious text method
+                logger.warning(
+                    f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
+                )
+                content_text = str(content_obj)
             # Construct HaystackDocument using data from Indexable protocol methods
             haystack_doc = HaystackDocument(
-                id=doc_id, # Use ID from get_id()
+                id=doc_id,  # Use ID from get_id()
                 content=content_text,
-                meta=metadata # Use metadata from get_metadata()
+                meta=metadata,  # Use metadata from get_metadata()
             )
             haystack_docs_to_embed.append(haystack_doc)
         if not haystack_docs_to_embed:
-             logger.warning("No Haystack documents were prepared. Check conversion logic and input data.")
-             return
+            logger.warning(
+                "No Haystack documents were prepared. Check conversion logic and input data."
+            )
+            return
-        logger.info(f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'...")
+        logger.info(
+            f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
+        )
         try:
             # Embed the documents
             embedding_results = embedder.run(documents=haystack_docs_to_embed)
@@ -291,33 +364,42 @@ class HaystackSearchService(SearchServiceProtocol):
             error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
             if self._persist:
                 error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
-                error_msg += "Try deleting the persistent storage directory or using force_reindex=True."
+                error_msg += (
+                    "Try deleting the persistent storage directory or using force_reindex=True."
+                )
             else:
-                 error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
+                error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
             logger.error(error_msg, exc_info=True)
             raise IndexConfigurationError(error_msg) from dim_error
         # REMOVED broad except Exception for embedding errors. Let them propagate.
         # --- 5. Write Embedded Documents to Store ---
-        logger.info(f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'...")
+        logger.info(
+            f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'..."
+        )
         # REMOVED try...except around store writing. Let errors propagate.
         write_result = store.write_documents(
-             documents=embedded_docs,
-             policy=DuplicatePolicy.OVERWRITE # Or configure as needed
+            documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE  # Or configure as needed
+        )
+        logger.info(
+            f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
         )
-        logger.info(f"Successfully wrote {write_result} documents to store '{self.collection_name}'.")
         # --- Add explicit count check after writing ---
-        logger.info(f"Store '{self.collection_name}' document count after write: {store.count_documents()}")
+        logger.info(
+            f"Store '{self.collection_name}' document count after write: {store.count_documents()}"
+        )
         # --- End count check ---
     def search(
         self,
-        query: Any, # Changed from Union[str, Path, Image.Image] to Any
+        query: Any,  # Changed from Union[str, Path, Image.Image] to Any
         options: BaseSearchOptions,
     ) -> List[Dict[str, Any]]:
-        logger.info(f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}")
+        logger.info(
+            f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}"
+        )
-        store = self._get_store() # Let errors propagate
+        store = self._get_store()  # Let errors propagate
         # --- 1. Handle Query Type and Embedding ---
         # This implementation currently only supports text query embedding.
@@ -325,44 +407,63 @@ class HaystackSearchService(SearchServiceProtocol):
         query_embedding = None
         query_text = ""
         if isinstance(query, (str, os.PathLike)):
-             if isinstance(query, os.PathLike):
-                 logger.warning("Image path query received, but multimodal search not fully implemented. Treating as text path string.")
-                 query_text = str(query)
-             else:
-                 query_text = query
-             text_embedder = self._get_text_embedder()
-             embedding_result = text_embedder.run(text=query_text)
-             query_embedding = embedding_result["embedding"]
-             if not query_embedding:
-                 raise ValueError("Text embedder did not return an embedding for the query.")
-             logger.debug(f"Successfully generated query text embedding (dim: {len(query_embedding)}).")
+            if isinstance(query, os.PathLike):
+                logger.warning(
+                    "Image path query received, but multimodal search not fully implemented. Treating as text path string."
+                )
+                query_text = str(query)
+            else:
+                query_text = query
+            text_embedder = self._get_text_embedder()
+            embedding_result = text_embedder.run(text=query_text)
+            query_embedding = embedding_result["embedding"]
+            if not query_embedding:
+                raise ValueError("Text embedder did not return an embedding for the query.")
+            logger.debug(
+                f"Successfully generated query text embedding (dim: {len(query_embedding)})."
+            )
         elif isinstance(query, Image.Image):
-             logger.error("Multimodal query (PIL Image) is not yet supported by this service implementation.")
-             raise NotImplementedError("Search with PIL Image queries is not implemented in HaystackSearchService.")
+            logger.error(
+                "Multimodal query (PIL Image) is not yet supported by this service implementation."
+            )
+            raise NotImplementedError(
+                "Search with PIL Image queries is not implemented in HaystackSearchService."
+            )
         # Check if query is Indexable and try extracting text?
-        elif hasattr(query, 'extract_text') and callable(getattr(query, 'extract_text')):
-             logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text for search.")
-             try:
-                 query_text = query.extract_text()
-                 if not query_text or not query_text.strip():
-                     logger.warning(f"Query object {type(query).__name__} provided empty text. Returning no results.")
-                     return []
-                 # Embed the extracted text
-                 text_embedder = self._get_text_embedder()
-                 embedding_result = text_embedder.run(text=query_text)
-                 query_embedding = embedding_result["embedding"]
-                 if not query_embedding:
-                     raise ValueError(f"Text embedder did not return an embedding for text extracted from {type(query).__name__}.")
-                 logger.debug(f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)}).")
-             except Exception as e:
-                 logger.error(f"Failed to extract or embed text from query object {type(query).__name__}: {e}", exc_info=True)
-                 raise RuntimeError("Query text extraction or embedding failed.") from e
+        elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+            logger.debug(
+                f"Query type {type(query).__name__} has extract_text. Extracting text for search."
+            )
+            try:
+                query_text = query.extract_text()
+                if not query_text or not query_text.strip():
+                    logger.warning(
+                        f"Query object {type(query).__name__} provided empty text. Returning no results."
+                    )
+                    return []
+                # Embed the extracted text
+                text_embedder = self._get_text_embedder()
+                embedding_result = text_embedder.run(text=query_text)
+                query_embedding = embedding_result["embedding"]
+                if not query_embedding:
+                    raise ValueError(
+                        f"Text embedder did not return an embedding for text extracted from {type(query).__name__}."
+                    )
+                logger.debug(
+                    f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)})."
+                )
+            except Exception as e:
+                logger.error(
+                    f"Failed to extract or embed text from query object {type(query).__name__}: {e}",
+                    exc_info=True,
+                )
+                raise RuntimeError("Query text extraction or embedding failed.") from e
         else:
-             # Raise specific error for unsupported types by this implementation
-             raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
+            # Raise specific error for unsupported types by this implementation
+            raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
         # --- 2. Select Retriever based on Store Type ---
         retriever = None
@@ -371,10 +472,10 @@ class HaystackSearchService(SearchServiceProtocol):
                 raise ImportError("ChromaEmbeddingRetriever is required but not available.")
             retriever = ChromaEmbeddingRetriever(document_store=store)
         elif isinstance(store, InMemoryDocumentStore):
-             retriever = InMemoryEmbeddingRetriever(document_store=store)
+            retriever = InMemoryEmbeddingRetriever(document_store=store)
         else:
-             # Raise specific error for unsupported store
-             raise TypeError(f"Cannot perform search with store type {type(store)}.")
+            # Raise specific error for unsupported store
+            raise TypeError(f"Cannot perform search with store type {type(store)}.")
         # --- 3. Build Retrieval Pipeline ---
         pipeline = Pipeline()
@@ -385,13 +486,10 @@ class HaystackSearchService(SearchServiceProtocol):
         # --- 4. Prepare Filters (remains the same) ---
         haystack_filters = options.filters
         if haystack_filters:
-             logger.debug(f"Applying filters: {haystack_filters}")
+            logger.debug(f"Applying filters: {haystack_filters}")
         # --- 5. Prepare Retriever Input Data (Dynamically) ---
-        retriever_input_data = {
-            "filters": haystack_filters,
-            "top_k": options.top_k
-        }
+        retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
         # Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
         retriever_input_data["query_embedding"] = query_embedding
         logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
@@ -415,23 +513,25 @@ class HaystackSearchService(SearchServiceProtocol):
                     meta_with_hash = doc.meta
                     # No need to explicitly add hash here if Haystack store preserves it
                     result_dict = {
-                         "content_snippet": doc.content[:200] if doc.content else "",
-                         "score": doc.score if doc.score is not None else 0.0,
-                         "page_number": meta_with_hash.get("page_number", None),
-                         "pdf_path": meta_with_hash.get("pdf_path", None),
-                         "metadata": meta_with_hash, # Pass full metadata
-                         # "_haystack_document": doc # Optionally include full object
+                        "content_snippet": doc.content[:200] if doc.content else "",
+                        "score": doc.score if doc.score is not None else 0.0,
+                        "page_number": meta_with_hash.get("page_number", None),
+                        "pdf_path": meta_with_hash.get("pdf_path", None),
+                        "metadata": meta_with_hash,  # Pass full metadata
+                        # "_haystack_document": doc # Optionally include full object
                     }
                     final_results.append(result_dict)
                 return final_results
             else:
-                 logger.warning("Pipeline result did not contain expected retriever output.")
-                 return []
+                logger.warning("Pipeline result did not contain expected retriever output.")
+                return []
         except FileNotFoundError:
-             # Keep specific catch for collection not found during retrieval
-             logger.error(f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'.")
-             raise # Re-raise the specific FileNotFoundError
+            # Keep specific catch for collection not found during retrieval
+            logger.error(
+                f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'."
+            )
+            raise  # Re-raise the specific FileNotFoundError
         # REMOVED broad except Exception for pipeline execution. Let errors propagate.
     def delete_index(
@@ -449,9 +549,11 @@ class HaystackSearchService(SearchServiceProtocol):
             return self._delete_chroma_collection()
         else:
             # For InMemory, "deleting" means re-initializing the store
-            logger.info(f"Re-initializing InMemory store for '{self.collection_name}' as deletion request.")
+            logger.info(
+                f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
+            )
             self._in_memory_store = InMemoryDocumentStore()
-            return True # Considered successful
+            return True  # Considered successful
     def index_exists(
         self,
@@ -462,59 +564,80 @@ class HaystackSearchService(SearchServiceProtocol):
         For InMemory, it checks if the internal store object exists and has documents.
         """
         logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
-        store = self._get_store() # Get the store instance
+        store = self._get_store()  # Get the store instance
         try:
             count = store.count_documents()
             exists = count > 0
-            logger.debug(f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}")
+            logger.debug(
+                f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}"
+            )
             return exists
         except Exception as e:
-             # Catch errors during count_documents (e.g., connection error for persistent stores)
-             logger.warning(f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}", exc_info=False)
-             # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
-             # Assume not exists if count fails
-             return False
+            # Catch errors during count_documents (e.g., connection error for persistent stores)
+            logger.warning(
+                f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}",
+                exc_info=False,
+            )
+            # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
+            # Assume not exists if count fails
+            return False
     # --- Sync Methods Implementation ---
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
         """Retrieves documents, required for sync.
-           NOTE: Haystack's filter_documents is the closest match.
-                 Fetches all docs if filters=None.
+        NOTE: Haystack's filter_documents is the closest match.
+              Fetches all docs if filters=None.
         """
-        logger.debug(f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})...")
+        logger.debug(
+            f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})..."
+        )
         store = self._get_store()
         try:
             # Use filter_documents with no filters to get all
             # This might be inefficient for very large stores.
-            haystack_docs = store.filter_documents(filters=kwargs.get('filters')) # Pass filters if provided via kwargs
+            haystack_docs = store.filter_documents(
+                filters=kwargs.get("filters")
+            )  # Pass filters if provided via kwargs
             logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
             # Convert to simple dicts
             results = []
             for doc in haystack_docs:
-                 doc_dict = {"id": doc.id} # ID is essential
-                 if include_metadata:
-                     # Ensure content_hash is included if it exists in meta
-                     doc_dict["meta"] = doc.meta
-                 # Optionally include content? Protocol doesn't require it.
-                 # doc_dict["content"] = doc.content
-                 results.append(doc_dict)
+                doc_dict = {"id": doc.id}  # ID is essential
+                if include_metadata:
+                    # Ensure content_hash is included if it exists in meta
+                    doc_dict["meta"] = doc.meta
+                # Optionally include content? Protocol doesn't require it.
+                # doc_dict["content"] = doc.content
+                results.append(doc_dict)
             return results
         except Exception as e:
-            logger.error(f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Failed to list documents from store '{self.collection_name}'.") from e
+            logger.error(
+                f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True
+            )
+            raise RuntimeError(
+                f"Failed to list documents from store '{self.collection_name}'."
+            ) from e
     def delete_documents(self, ids: List[str]) -> None:
         """Deletes documents by ID, required for sync."""
         if not ids:
             logger.debug("No document IDs provided for deletion. Skipping.")
             return
-        logger.warning(f"Request to delete {len(ids)} documents from collection '{self.collection_name}'.")
+        logger.warning(
+            f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
+        )
         store = self._get_store()
         try:
             store.delete_documents(ids=ids)
-            logger.info(f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}")
+            logger.info(
+                f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}"
+            )
         except Exception as e:
-            logger.error(f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Failed to delete documents from store '{self.collection_name}'.") from e
+            logger.error(
+                f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}",
+                exc_info=True,
+            )
+            raise RuntimeError(
+                f"Failed to delete documents from store '{self.collection_name}'."
+            ) from e

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.4py3-none-any.whl → 0.1.5py3-none-any.whl