PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134)

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/search/haystack_search_service.py CHANGED

@@ -1,13 +1,19 @@
 """Implementation of the SearchServiceProtocol using Haystack components."""
-import copy
 import logging
 import os
+import shutil
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
 from PIL import Image
+# Import sentence-transformers for dimension calculation
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
 # --- Haystack Imports ---
 try:
     import haystack
@@ -17,15 +23,23 @@ try:
         SentenceTransformersTextEmbedder,
     )
-    # Import necessary retrievers, rankers etc. as needed for search()
-    from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever  # For InMem
+    # Import InMemory Store & Retriever unconditionally
+    from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
     from haystack.dataclasses import Document as HaystackDocument
     from haystack.document_stores.in_memory import InMemoryDocumentStore
     from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-    from haystack_integrations.components.retrievers.chroma import (  # Use embedding retriever
-        ChromaEmbeddingRetriever,
-    )
-    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+    # Conditional LanceDB Imports
+    try:
+        from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
+        LANCEDB_HAYSTACK_AVAILABLE = True
+    except ImportError:
+        LanceDBDocumentStore = None
+        LanceDBEmbeddingRetriever = None
+        LANCEDB_HAYSTACK_AVAILABLE = False
+    # Removed Chroma Imports
     # Need Ranker if used
     try:
@@ -33,36 +47,35 @@ try:
     except ImportError:
         CohereRanker = None
-    # Don't define here, it's imported later
 except ImportError:
     # Set flags/placeholders if Haystack isn't installed
-    # Don't define here, it's imported later
     DocumentStore = object
     HaystackDocument = Dict
-    ChromaDocumentStore = None
     InMemoryDocumentStore = None
+    LanceDBDocumentStore = None
     SentenceTransformersDocumentEmbedder = None
     SentenceTransformersTextEmbedder = None
     InMemoryEmbeddingRetriever = None
-    ChromaEmbeddingRetriever = None  # Fallback definition
+    LanceDBEmbeddingRetriever = None
     CohereRanker = None
     Pipeline = None
     DuplicatePolicy = None
+    LANCEDB_HAYSTACK_AVAILABLE = False
-# --- ChromaDB Client Import (for management) ---
+# LanceDB Client Import (for management)
 try:
-    import chromadb
+    import lancedb
-    CHROMADB_AVAILABLE = True
+    LANCEDB_CLIENT_AVAILABLE = True
 except ImportError:
-    chromadb = None
-    CHROMADB_AVAILABLE = False
+    lancedb = None
+    LANCEDB_CLIENT_AVAILABLE = False
-from .haystack_utils import HAS_HAYSTACK_EXTRAS  # <-- This is the canonical import
+# Removed ChromaDB Client Import
+from .haystack_utils import HAS_HAYSTACK_EXTRAS
 from .search_options import (
     BaseSearchOptions,
-    MultiModalSearchOptions,
-    SearchOptions,
     TextSearchOptions,
 )
@@ -70,11 +83,9 @@ from .search_options import (
 from .search_service_protocol import (
     Indexable,
     IndexConfigurationError,
-    IndexExistsError,
     SearchServiceProtocol,
 )
-# --- Logging ---
 logger = logging.getLogger(__name__)
 # --- Default Configuration Values ---
@@ -86,74 +97,129 @@ class HaystackSearchService(SearchServiceProtocol):
     """
     Haystack-based implementation of the search service protocol.
-    Manages ChromaDB (persistent) or InMemory (non-persistent) DocumentStores
+    Manages LanceDB (persistent) or InMemory (non-persistent) DocumentStores
     and uses Haystack components for embedding and retrieval.
-    A single instance of this service is tied to a specific collection name.
+    A single instance of this service is tied to a specific table name (LanceDB)
+    or implicitly managed (InMemory).
     """
     def __init__(
         self,
-        collection_name: str,
-        persist: bool = False,  # Store type configuration
-        default_persist_path: str = DEFAULT_PERSIST_PATH,
-        embedding_model: str = DEFAULT_EMBEDDING_MODEL,  # Renamed for clarity
+        table_name: str,
+        persist: bool = False,
+        uri: str = DEFAULT_PERSIST_PATH,
+        embedding_model: str = DEFAULT_EMBEDDING_MODEL,
     ):
         """
-        Initialize the service for a specific collection.
+        Initialize the service for a specific LanceDB table or an InMemory store.
         Args:
-            collection_name: The name of the index/collection this service instance manages.
-            persist: If True, this service instance manages persistent ChromaDB stores.
-                    If False, it manages transient InMemory stores.
-            default_persist_path: Default path for persistent ChromaDB storage.
+            table_name: The name of the LanceDB table (if persist=True).
+            persist: If True, this service instance manages a persistent LanceDB store.
+                    If False, it manages a transient InMemory store.
+            uri: Path/URI for the LanceDB database directory (if persist=True).
             embedding_model: The embedding model this service instance will use.
+                               Required for LanceDB to know embedding dimensions.
         """
         if not HAS_HAYSTACK_EXTRAS:
             raise ImportError(
                 "HaystackSearchService requires Haystack extras. Install with: pip install natural-pdf[haystack]"
             )
-        self.collection_name = collection_name  # Store the collection name
-        self._persist = persist  # Store the persistence type for this instance
-        self._default_persist_path = default_persist_path
-        self._embedding_model = embedding_model  # Store the configured model
+        self.table_name = table_name
+        self._persist = persist
+        self._uri = uri
+        self._embedding_model = embedding_model
+        self._embedding_dims: Optional[int] = None
-        # Dictionary to hold InMemoryDocumentStore instances if not persisting
-        self._in_memory_store: Optional[InMemoryDocumentStore] = (
-            None if persist else InMemoryDocumentStore()
-        )
-        self._chroma_store: Optional[ChromaDocumentStore] = None  # Lazy load
+        # Store instances (lazy loaded)
+        self._in_memory_store: Optional[InMemoryDocumentStore] = None
+        self._lancedb_store: Optional[LanceDBDocumentStore] = None
-        logger.info(
-            f"HaystackSearchService initialized for collection='{self.collection_name}' (persist={self._persist}, model='{self._embedding_model}'). Default path: '{self._default_persist_path}'"
-        )
+        # Eagerly create InMemoryStore if not persisting
+        if not self._persist:
+            if not InMemoryDocumentStore:
+                raise ImportError(
+                    "InMemoryDocumentStore not available. Cannot create non-persistent service."
+                )
+            self._in_memory_store = InMemoryDocumentStore()
+            logger.info(
+                f"HaystackSearchService initialized for InMemory store (table_name '{self.table_name}' ignored). Model: '{self._embedding_model}'"
+            )
+        else:
+            # Check LanceDB availability if persisting
+            if not LANCEDB_HAYSTACK_AVAILABLE:
+                raise ImportError(
+                    "LanceDB persistent store requires lancedb-haystack. Install with: pip install lancedb-haystack"
+                )
+            if not SentenceTransformer:
+                raise ImportError(
+                    "LanceDB persistent store requires sentence-transformers to determine embedding dimensions. Install with: pip install sentence-transformers"
+                )
+            # Calculate embedding dimensions needed for LanceDB initialization
+            self._calculate_embedding_dims()
+            logger.info(
+                f"HaystackSearchService initialized for LanceDB table='{self.table_name}' at uri='{self._uri}'. Model: '{self._embedding_model}', Dims: {self._embedding_dims}"
+            )
-    # --- Internal Helper Methods --- #
+    # --- Internal Helper Methods ---
-    def _get_store(
-        self,
-    ) -> DocumentStore:
-        """Gets or creates the appropriate Haystack DocumentStore instance for this service's collection."""
-        # Use the instance's configured persistence type and collection name
+    def _calculate_embedding_dims(self) -> None:
+        """Calculates and stores embedding dimensions from the model name."""
+        if self._embedding_dims is None:
+            if not SentenceTransformer:
+                raise ImportError(
+                    "sentence-transformers library is required to determine embedding dimensions."
+                )
+            try:
+                model = SentenceTransformer(self._embedding_model)
+                dims = model.get_sentence_embedding_dimension()
+                if not dims:
+                    raise ValueError(
+                        f"Could not determine embedding dimension for model: {self._embedding_model}"
+                    )
+                self._embedding_dims = dims
+                logger.debug(
+                    f"Determined embedding dimension: {self._embedding_dims} for model '{self._embedding_model}'"
+                )
+            except Exception as e:
+                logger.error(
+                    f"Failed to load SentenceTransformer model '{self._embedding_model}' to get dimensions: {e}",
+                    exc_info=True,
+                )
+                raise RuntimeError(
+                    f"Failed to determine embedding dimension for model '{self._embedding_model}'."
+                ) from e
+    def _get_store(self) -> DocumentStore:
+        """Gets or creates the appropriate Haystack DocumentStore instance."""
         if self._persist:
-            if self._chroma_store is None:
-                # Lazy load Chroma store
+            if not LanceDBDocumentStore:
+                raise ImportError("LanceDBDocumentStore not available.")
+            if self._lancedb_store is None:
                 logger.debug(
-                    f"Initializing ChromaDocumentStore for collection '{self.collection_name}'."
+                    f"Initializing LanceDBDocumentStore for table '{self.table_name}' at uri '{self._uri}'."
                 )
-                self._chroma_store = ChromaDocumentStore(
-                    persist_path=self._default_persist_path,
-                    collection_name=self.collection_name,  # Use instance name
+                if self._embedding_dims is None:
+                    logger.warning(
+                        "Embedding dimensions not calculated before getting store. Calculating now."
+                    )
+                    self._calculate_embedding_dims()
+                self._lancedb_store = LanceDBDocumentStore(
+                    database=self._uri,
+                    table_name=self.table_name,
+                    embedding_dims=self._embedding_dims,
                 )
-            return self._chroma_store
-        else:
-            # Return the instance's InMemory store
-            if (
-                self._in_memory_store is None
-            ):  # Should have been created in __init__ if persist=False
-                logger.warning(
-                    f"In-memory store for collection '{self.collection_name}' was not initialized. Creating now."
+                logger.info(
+                    f"Initialized LanceDBDocumentStore for table '{self.table_name}' (Dims: {self._embedding_dims})"
                 )
+            return self._lancedb_store
+        else:
+            if self._in_memory_store is None:
+                logger.warning("In-memory store was not initialized. Creating now.")
+                if not InMemoryDocumentStore:
+                    raise ImportError("InMemoryDocumentStore not available.")
                 self._in_memory_store = InMemoryDocumentStore()
             return self._in_memory_store
@@ -161,7 +227,7 @@ class HaystackSearchService(SearchServiceProtocol):
         self, device: Optional[str] = None
     ) -> SentenceTransformersDocumentEmbedder:
         """Creates the Haystack document embedder component."""
-        model_name = self._embedding_model  # Use instance model
+        model_name = self._embedding_model
         logger.debug(
             f"Creating SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {device or 'auto'}"
         )
@@ -187,7 +253,7 @@ class HaystackSearchService(SearchServiceProtocol):
     def _get_text_embedder(self, device: Optional[str] = None) -> SentenceTransformersTextEmbedder:
         """Creates the Haystack text embedder component (for queries)."""
-        model_name = self._embedding_model  # Use instance model
+        model_name = self._embedding_model
         logger.debug(
             f"Creating SentenceTransformersTextEmbedder. Model: {model_name}, Device: {device or 'auto'}"
         )
@@ -208,113 +274,97 @@ class HaystackSearchService(SearchServiceProtocol):
                 f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
             ) from e
-    def _delete_chroma_collection(self) -> bool:
-        """Internal helper to delete the ChromaDB collection managed by this service."""
-        if not CHROMADB_AVAILABLE:
-            logger.error(
-                "Cannot delete ChromaDB collection because 'chromadb' library is not installed."
-            )
-            raise ImportError("'chromadb' library required for collection deletion.")
+    def _delete_lancedb_table(self) -> bool:
+        """Internal helper to delete the LanceDB table managed by this service."""
         if not self._persist:
             logger.warning(
-                "Attempted to delete ChromaDB collection for a non-persistent service instance. Ignoring."
+                "Attempted to delete LanceDB table for a non-persistent service instance. Ignoring."
             )
-            return False  # Cannot delete if not persistent
+            return False
+        if not LANCEDB_CLIENT_AVAILABLE:
+            logger.error("Cannot delete LanceDB table because 'lancedb' library is not installed.")
+            raise ImportError("'lancedb' library required for table deletion.")
+        table_name_to_delete = self.table_name
+        db_uri = self._uri
+        logger.warning(
+            f"Attempting to delete existing LanceDB table '{table_name_to_delete}' at uri '{db_uri}'."
+        )
         try:
-            collection_name_to_delete = self.collection_name  # Use instance collection name
-            logger.warning(
-                f"Attempting to delete existing ChromaDB collection '{collection_name_to_delete}' at path '{self._default_persist_path}'."
-            )
-            chroma_client = chromadb.PersistentClient(path=self._default_persist_path)
-            try:
-                chroma_client.delete_collection(name=collection_name_to_delete)
+            db = lancedb.connect(db_uri)
+            table_names = db.table_names()
+            if table_name_to_delete in table_names:
+                db.drop_table(table_name_to_delete)
                 logger.info(
-                    f"Successfully deleted existing ChromaDB collection '{collection_name_to_delete}'."
+                    f"Successfully deleted existing LanceDB table '{table_name_to_delete}'."
                 )
-                self._chroma_store = None  # Reset lazy-loaded store
-                return True
-            except chromadb.errors.InvalidCollectionException:
+            else:
                 logger.info(
-                    f"ChromaDB collection '{collection_name_to_delete}' did not exist. No deletion needed."
+                    f"LanceDB table '{table_name_to_delete}' did not exist. No deletion needed."
                 )
-                return True  # Deletion is effectively successful
-            finally:
-                pass  # Cleanup if needed
-        except ImportError as ie:
-            raise ie
+            self._lancedb_store = None
+            return True
         except Exception as e:
             logger.error(
-                f"Error during ChromaDB collection deletion '{self.collection_name}': {e}",
+                f"Error during LanceDB table deletion '{table_name_to_delete}' at '{db_uri}': {e}",
                 exc_info=True,
             )
-            # Don't raise here, let index() decide based on force_reindex
             return False
-    # --- Protocol Methods Implementation --- #
+    # --- Protocol Methods Implementation ---
     def index(
         self,
-        documents: Iterable[Indexable],  # Accept Indexable objects
+        documents: Iterable[Indexable],
         embedder_device: Optional[str] = None,
         force_reindex: bool = False,
     ) -> None:
-        # Need to consume the iterable to log count, or log differently
-        # Let's convert to list for now, assuming size isn't prohibitive
         indexable_list = list(documents)
         logger.info(
-            f"Index request for collection='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
+            f"Index request for table='{self.table_name}', docs={len(indexable_list)}, model='{self._embedding_model}', force={force_reindex}, persist={self._persist}"
         )
         if not indexable_list:
             logger.warning("No documents provided for indexing. Skipping.")
             return
-        # --- 1. Handle Reindexing (Deletion before store/embedder init) ---
+        # Handle Reindexing
         if force_reindex:
-            logger.info(f"Force reindex requested for collection '{self.collection_name}'.")
+            logger.info(f"Force reindex requested for table '{self.table_name}'.")
             if self._persist:
-                # Attempt deletion, raises ImportError if chromadb missing
-                deleted = self._delete_chroma_collection()  # Uses self.collection_name
+                deleted = self._delete_lancedb_table()
                 if not deleted:
-                    # If deletion failed for other reasons, log and continue cautiously
                     logger.warning(
-                        "Collection deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
+                        "LanceDB table deletion failed, but force_reindex=True. Proceeding with indexing, but existing data/config may interfere."
                     )
             else:
-                # For InMemory, force_reindex means we want a fresh store instance.
-                # Re-initialize the instance's in-memory store
-                logger.info(
-                    f"force_reindex=True: Re-initializing InMemory store for collection '{self.collection_name}'."
-                )
-                self._in_memory_store = InMemoryDocumentStore()  # Create a new one
+                # For InMemory, re-initialize the instance's store
+                logger.info(f"force_reindex=True: Re-initializing InMemory store.")
+                if not InMemoryDocumentStore:
+                    raise ImportError("InMemoryDocumentStore not available.")
+                self._in_memory_store = InMemoryDocumentStore()
-        # REMOVED try...except around store retrieval
-        # Let store initialization errors propagate directly
-        store = self._get_store()  # No argument needed
+        # Get Store
+        store = self._get_store()
-        # --- 3. Create Embedder ---
-        # Errors during embedder creation will propagate from the helper
+        # Create Embedder
         embedder = self._get_document_embedder(embedder_device)
-        # --- 4. Convert Indexable to Haystack Docs & Embed ---
+        # Convert Indexable to Haystack Docs & Embed
         haystack_docs_to_embed: List[HaystackDocument] = []
         logger.info(f"Preparing Haystack Documents from {len(indexable_list)} indexable items...")
-        # Consume Indexable items using the protocol methods
         for item in indexable_list:
             doc_id = item.get_id()
             metadata = item.get_metadata()
-            content_obj = item.get_content()  # This might be Page, Region, etc.
-            # Determine content based on embedder type and content object
-            # For now, assume text content is needed and try to extract it
+            content_obj = item.get_content()
             content_text = ""
             if isinstance(content_obj, str):
-                # If get_content() already returned text
                 content_text = content_obj
             elif hasattr(content_obj, "extract_text") and callable(
                 getattr(content_obj, "extract_text")
             ):
-                # If content object has extract_text (like Page or Region)
                 try:
                     content_text = content_obj.extract_text()
                     if not isinstance(content_text, str):
@@ -329,18 +379,12 @@ class HaystackSearchService(SearchServiceProtocol):
                     )
                     content_text = str(content_obj)
             else:
-                # Attempt to convert to string as fallback if no obvious text method
                 logger.warning(
                     f"Could not extract text from content type {type(content_obj)} obtained via get_content() for doc '{doc_id}'. Using str()."
                 )
                 content_text = str(content_obj)
-            # Construct HaystackDocument using data from Indexable protocol methods
-            haystack_doc = HaystackDocument(
-                id=doc_id,  # Use ID from get_id()
-                content=content_text,
-                meta=metadata,  # Use metadata from get_metadata()
-            )
+            haystack_doc = HaystackDocument(id=doc_id, content=content_text, meta=metadata)
             haystack_docs_to_embed.append(haystack_doc)
         if not haystack_docs_to_embed:
@@ -353,68 +397,57 @@ class HaystackSearchService(SearchServiceProtocol):
             f"Embedding {len(haystack_docs_to_embed)} documents using '{self._embedding_model}'..."
         )
         try:
-            # Embed the documents
             embedding_results = embedder.run(documents=haystack_docs_to_embed)
             embedded_docs = embedding_results["documents"]
             logger.info(f"Successfully embedded {len(embedded_docs)} documents.")
         except haystack.errors.dimensionality_mismatch.InvalidDimensionError as dim_error:
-            # Keep specific catch for dimension mismatch - provides useful context
-            error_msg = f"Indexing failed for collection '{self.collection_name}'. Dimension mismatch: {dim_error}. "
-            error_msg += f"Ensure the embedding model ('{self._embedding_model}') matches the expected dimension of the store. "
+            error_msg = (
+                f"Indexing failed for table '{self.table_name}'. Dimension mismatch: {dim_error}. "
+            )
+            error_msg += f"Ensure the embedding model ('{self._embedding_model}', Dim: {self._embedding_dims}) matches the expected dimension of the store. "
             if self._persist:
-                error_msg += f"If the collection already exists at '{self._default_persist_path}', it might have been created with a different model. "
-                error_msg += (
-                    "Try deleting the persistent storage directory or using force_reindex=True."
-                )
+                error_msg += f"If the table already exists at '{self._uri}', it might have been created with a different model/dimension. "
+                error_msg += f"Try deleting the LanceDB table directory ('{os.path.join(self._uri, self.table_name + '.lance')}') or using force_reindex=True."
             else:
                 error_msg += "This usually indicates an issue with the embedder setup or Haystack compatibility."
             logger.error(error_msg, exc_info=True)
             raise IndexConfigurationError(error_msg) from dim_error
-        # REMOVED broad except Exception for embedding errors. Let them propagate.
-        # --- 5. Write Embedded Documents to Store ---
+        # Write Embedded Documents to Store
         logger.info(
-            f"Writing {len(embedded_docs)} embedded documents to store '{self.collection_name}'..."
+            f"Writing {len(embedded_docs)} embedded documents to store (Table/Type: '{self.table_name if self._persist else 'InMemory'}')..."
         )
-        # REMOVED try...except around store writing. Let errors propagate.
         write_result = store.write_documents(
-            documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE  # Or configure as needed
-        )
-        logger.info(
-            f"Successfully wrote {write_result} documents to store '{self.collection_name}'."
+            documents=embedded_docs, policy=DuplicatePolicy.OVERWRITE
         )
-        # --- Add explicit count check after writing ---
-        logger.info(
-            f"Store '{self.collection_name}' document count after write: {store.count_documents()}"
-        )
-        # --- End count check ---
+        logger.info(f"Successfully wrote {write_result} documents to store.")
+        try:
+            count = store.count_documents()
+            logger.info(f"Store document count after write: {count}")
+        except Exception as count_error:
+            logger.warning(f"Could not get document count after write: {count_error}")
     def search(
         self,
-        query: Any,  # Changed from Union[str, Path, Image.Image] to Any
+        query: Any,
         options: BaseSearchOptions,
     ) -> List[Dict[str, Any]]:
         logger.info(
-            f"Search request for collection='{self.collection_name}', query_type={type(query).__name__}, options={options}"
+            f"Search request for table/store='{self.table_name if self._persist else 'InMemory'}', query_type={type(query).__name__}, options={options}"
         )
-        store = self._get_store()  # Let errors propagate
+        store = self._get_store()
-        # --- 1. Handle Query Type and Embedding ---
-        # This implementation currently only supports text query embedding.
-        # TODO: Refactor or extend for multimodal queries based on service capabilities/options.
+        # Handle Query Type and Embedding
         query_embedding = None
         query_text = ""
         if isinstance(query, (str, os.PathLike)):
             if isinstance(query, os.PathLike):
-                logger.warning(
-                    "Image path query received, but multimodal search not fully implemented. Treating as text path string."
-                )
+                logger.warning("Image path query received, treating as text path string.")
                 query_text = str(query)
             else:
                 query_text = query
             text_embedder = self._get_text_embedder()
             embedding_result = text_embedder.run(text=query_text)
             query_embedding = embedding_result["embedding"]
@@ -423,19 +456,11 @@ class HaystackSearchService(SearchServiceProtocol):
             logger.debug(
                 f"Successfully generated query text embedding (dim: {len(query_embedding)})."
             )
         elif isinstance(query, Image.Image):
-            logger.error(
-                "Multimodal query (PIL Image) is not yet supported by this service implementation."
-            )
-            raise NotImplementedError(
-                "Search with PIL Image queries is not implemented in HaystackSearchService."
-            )
-        # Check if query is Indexable and try extracting text?
+            logger.error("Multimodal query (PIL Image) is not yet supported.")
+            raise NotImplementedError("Search with PIL Image queries is not implemented.")
         elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
-            logger.debug(
-                f"Query type {type(query).__name__} has extract_text. Extracting text for search."
-            )
+            logger.debug(f"Query type {type(query).__name__} has extract_text. Extracting text.")
             try:
                 query_text = query.extract_text()
                 if not query_text or not query_text.strip():
@@ -443,82 +468,121 @@ class HaystackSearchService(SearchServiceProtocol):
                         f"Query object {type(query).__name__} provided empty text. Returning no results."
                     )
                     return []
-                # Embed the extracted text
                 text_embedder = self._get_text_embedder()
                 embedding_result = text_embedder.run(text=query_text)
                 query_embedding = embedding_result["embedding"]
                 if not query_embedding:
                     raise ValueError(
-                        f"Text embedder did not return an embedding for text extracted from {type(query).__name__}."
+                        f"Text embedder did not return embedding for text from {type(query).__name__}."
                     )
                 logger.debug(
-                    f"Successfully generated query embedding from extracted text (dim: {len(query_embedding)})."
+                    f"Generated query embedding from extracted text (dim: {len(query_embedding)})."
                 )
             except Exception as e:
                 logger.error(
-                    f"Failed to extract or embed text from query object {type(query).__name__}: {e}",
+                    f"Failed to extract/embed text from query object {type(query).__name__}: {e}",
                     exc_info=True,
                 )
                 raise RuntimeError("Query text extraction or embedding failed.") from e
         else:
-            # Raise specific error for unsupported types by this implementation
             raise TypeError(f"Unsupported query type for HaystackSearchService: {type(query)}")
-        # --- 2. Select Retriever based on Store Type ---
+        # Select Retriever based on Store Type
         retriever = None
-        if isinstance(store, ChromaDocumentStore):
-            if not ChromaEmbeddingRetriever:
-                raise ImportError("ChromaEmbeddingRetriever is required but not available.")
-            retriever = ChromaEmbeddingRetriever(document_store=store)
-        elif isinstance(store, InMemoryDocumentStore):
+        # Check if LanceDB is available *before* checking isinstance
+        if (
+            LANCEDB_HAYSTACK_AVAILABLE
+            and LanceDBDocumentStore
+            and isinstance(store, LanceDBDocumentStore)
+        ):
+            if not LanceDBEmbeddingRetriever:
+                raise ImportError("LanceDBEmbeddingRetriever is required but not available.")
+            retriever = LanceDBEmbeddingRetriever(document_store=store)
+        # Check if InMemory is available *before* checking isinstance
+        elif (
+            InMemoryDocumentStore
+            and InMemoryEmbeddingRetriever
+            and isinstance(store, InMemoryDocumentStore)
+        ):
+            # No separate HAS_INMEMORY flag, check if classes are not None
             retriever = InMemoryEmbeddingRetriever(document_store=store)
         else:
-            # Raise specific error for unsupported store
-            raise TypeError(f"Cannot perform search with store type {type(store)}.")
+            # Improved error message if store type is unexpected
+            store_type_name = type(store).__name__
+            available_integrations = []
+            if LANCEDB_HAYSTACK_AVAILABLE and LanceDBDocumentStore:
+                available_integrations.append("LanceDB")
+            if InMemoryDocumentStore:
+                available_integrations.append("InMemory")
+            if not available_integrations:
+                raise TypeError(
+                    f"Cannot perform search: No supported document store integrations (LanceDB, InMemory) seem to be available. "
+                    f"Check Haystack installation."
+                )
+            # Check if the store type matches one of the available integrations' expected types
+            elif (
+                LANCEDB_HAYSTACK_AVAILABLE
+                and LanceDBDocumentStore
+                and isinstance(store, LanceDBDocumentStore)
+            ) or (InMemoryDocumentStore and isinstance(store, InMemoryDocumentStore)):
+                # This case implies the retriever class (e.g., LanceDBEmbeddingRetriever) might be missing
+                missing_retriever = ""
+                if isinstance(store, LanceDBDocumentStore):
+                    missing_retriever = "LanceDBEmbeddingRetriever"
+                if isinstance(store, InMemoryDocumentStore):
+                    missing_retriever = "InMemoryEmbeddingRetriever"
+                raise ImportError(
+                    f"Store type '{store_type_name}' is supported, but its retriever component '{missing_retriever}' failed to import or is unavailable."
+                )
+            else:  # Store type doesn't match any known/available store type
+                raise TypeError(
+                    f"Cannot perform search with unexpected store type '{store_type_name}'. "
+                    f"Available integrations: {', '.join(available_integrations)}."
+                )
-        # --- 3. Build Retrieval Pipeline ---
+        # This check remains as a final safeguard, though the logic above should catch most issues
+        if not retriever:
+            raise RuntimeError(
+                f"Failed to select a suitable retriever for store type {type(store).__name__}. Please check dependencies and integration availability."
+            )
+        logger.debug(f"Selected retriever: {type(retriever).__name__}")
+        # Build Retrieval Pipeline
         pipeline = Pipeline()
         pipeline.add_component("retriever", retriever)
-        # Add Ranker logic (remains the same)
-        # ... (ranker setup if needed)
-        # --- 4. Prepare Filters (remains the same) ---
+        # Prepare Filters
         haystack_filters = options.filters
         if haystack_filters:
             logger.debug(f"Applying filters: {haystack_filters}")
-        # --- 5. Prepare Retriever Input Data (Dynamically) ---
+        # Prepare Retriever Input Data
         retriever_input_data = {"filters": haystack_filters, "top_k": options.top_k}
-        # Both InMemoryEmbeddingRetriever and ChromaEmbeddingRetriever expect 'query_embedding'
         retriever_input_data["query_embedding"] = query_embedding
         logger.debug(f"Providing 'query_embedding' to {type(retriever).__name__}.")
-        # --- 6. Run Retrieval ---
+        # Run Retrieval
         try:
-            logger.info(f"Running retrieval pipeline for collection '{self.collection_name}'...")
-            result = pipeline.run(
-                data={"retriever": retriever_input_data}
-                # ... (ranker data if needed)
+            logger.info(
+                f"Running retrieval pipeline for table/store '{self.table_name if self._persist else 'InMemory'}'..."
             )
+            result = pipeline.run(data={"retriever": retriever_input_data})
-            # --- 7. Format Results ---
+            # Format Results
             if "retriever" in result and "documents" in result["retriever"]:
                 retrieved_docs: List[HaystackDocument] = result["retriever"]["documents"]
                 logger.info(f"Retrieved {len(retrieved_docs)} documents.")
-                # Format results (remains the same)
                 final_results = []
                 for doc in retrieved_docs:
-                    # Include content_hash in returned metadata if present
                     meta_with_hash = doc.meta
-                    # No need to explicitly add hash here if Haystack store preserves it
                     result_dict = {
                         "content_snippet": doc.content[:200] if doc.content else "",
                         "score": doc.score if doc.score is not None else 0.0,
                         "page_number": meta_with_hash.get("page_number", None),
                         "pdf_path": meta_with_hash.get("pdf_path", None),
-                        "metadata": meta_with_hash,  # Pass full metadata
-                        # "_haystack_document": doc # Optionally include full object
+                        "metadata": meta_with_hash,
                     }
                     final_results.append(result_dict)
                 return final_results
@@ -527,117 +591,97 @@ class HaystackSearchService(SearchServiceProtocol):
                 return []
         except FileNotFoundError:
-            # Keep specific catch for collection not found during retrieval
             logger.error(
-                f"Search failed: Collection '{self.collection_name}' not found at path '{self._default_persist_path}'."
+                f"Search failed: Could not access path for table/store '{self.table_name if self._persist else 'InMemory'}' (URI: '{self._uri if self._persist else 'N/A'}')."
             )
-            raise  # Re-raise the specific FileNotFoundError
-        # REMOVED broad except Exception for pipeline execution. Let errors propagate.
+            raise
-    def delete_index(
-        self,
-    ) -> bool:
+    def delete_index(self) -> bool:
         """
-        Deletes the entire index/collection managed by this service instance.
+        Deletes the entire LanceDB table or resets the InMemory store.
         Returns:
-            True if deletion was successful or collection didn't exist, False otherwise.
+            True if deletion was successful or table/store didn't exist, False otherwise.
         """
-        logger.warning(f"Request to delete index for collection '{self.collection_name}'.")
         if self._persist:
-            # Delegate to internal ChromaDB deletion helper
-            return self._delete_chroma_collection()
-        else:
-            # For InMemory, "deleting" means re-initializing the store
-            logger.info(
-                f"Re-initializing InMemory store for '{self.collection_name}' as deletion request."
+            logger.warning(
+                f"Request to delete LanceDB table '{self.table_name}' at uri '{self._uri}'."
             )
+            return self._delete_lancedb_table()
+        else:
+            logger.info("Request to delete InMemory store (re-initializing).)")
+            if not InMemoryDocumentStore:
+                raise ImportError("InMemoryDocumentStore not available.")
             self._in_memory_store = InMemoryDocumentStore()
-            return True  # Considered successful
+            return True
-    def index_exists(
-        self,
-    ) -> bool:
+    def index_exists(self) -> bool:
         """
-        Checks if the index/collection managed by this service instance exists.
-        NOTE: For ChromaDB, this may involve trying to connect.
-        For InMemory, it checks if the internal store object exists and has documents.
+        Checks if the LanceDB table or InMemory store exists and has documents.
+        NOTE: For LanceDB, this tries to count documents, implicitly checking connection/table existence.
+              For InMemory, it checks if the internal store object exists and has documents.
         """
-        logger.debug(f"Checking existence of index for collection '{self.collection_name}'.")
-        store = self._get_store()  # Get the store instance
+        store_name = self.table_name if self._persist else "InMemory"
+        logger.debug(
+            f"Checking existence of index for '{store_name}'. URI: '{self._uri if self._persist else 'N/A'}'"
+        )
         try:
+            store = self._get_store()
             count = store.count_documents()
             exists = count > 0
             logger.debug(
-                f"Store type {type(store).__name__} for '{self.collection_name}' exists and has {count} documents: {exists}"
+                f"Store type {type(store).__name__} for '{store_name}' exists and has {count} documents: {exists}"
             )
             return exists
+        except ImportError as ie:
+            logger.error(f"Import error checking index existence for '{store_name}': {ie}")
+            return False
         except Exception as e:
-            # Catch errors during count_documents (e.g., connection error for persistent stores)
             logger.warning(
-                f"Could not count documents in store for collection '{self.collection_name}' to check existence: {e}",
+                f"Could not confirm existence or count documents in store for '{store_name}': {e}",
                 exc_info=False,
             )
-            # Special handling for ChromaDB trying to connect to non-existent path? Check Haystack behavior.
-            # Assume not exists if count fails
             return False
     # --- Sync Methods Implementation ---
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-        """Retrieves documents, required for sync.
-        NOTE: Haystack's filter_documents is the closest match.
-              Fetches all docs if filters=None.
-        """
+        """Retrieves documents, required for sync."""
+        store_name = self.table_name if self._persist else "InMemory"
         logger.debug(
-            f"Listing documents for collection '{self.collection_name}' (include_metadata={include_metadata})..."
+            f"Listing documents for '{store_name}' (include_metadata={include_metadata})..."
         )
         store = self._get_store()
         try:
-            # Use filter_documents with no filters to get all
-            # This might be inefficient for very large stores.
-            haystack_docs = store.filter_documents(
-                filters=kwargs.get("filters")
-            )  # Pass filters if provided via kwargs
-            logger.info(f"Retrieved {len(haystack_docs)} documents from store.")
-            # Convert to simple dicts
+            haystack_docs = store.filter_documents(filters=kwargs.get("filters"))
+            logger.info(f"Retrieved {len(haystack_docs)} documents from store '{store_name}'.")
             results = []
             for doc in haystack_docs:
-                doc_dict = {"id": doc.id}  # ID is essential
+                doc_dict = {"id": doc.id}
                 if include_metadata:
-                    # Ensure content_hash is included if it exists in meta
                     doc_dict["meta"] = doc.meta
-                # Optionally include content? Protocol doesn't require it.
-                # doc_dict["content"] = doc.content
                 results.append(doc_dict)
             return results
         except Exception as e:
-            logger.error(
-                f"Failed to list documents from store '{self.collection_name}': {e}", exc_info=True
-            )
-            raise RuntimeError(
-                f"Failed to list documents from store '{self.collection_name}'."
-            ) from e
+            logger.error(f"Failed to list documents from store '{store_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Failed to list documents from store '{store_name}'.") from e
     def delete_documents(self, ids: List[str]) -> None:
         """Deletes documents by ID, required for sync."""
+        store_name = self.table_name if self._persist else "InMemory"
         if not ids:
-            logger.debug("No document IDs provided for deletion. Skipping.")
+            logger.debug(f"No document IDs provided for deletion from '{store_name}'. Skipping.")
             return
-        logger.warning(
-            f"Request to delete {len(ids)} documents from collection '{self.collection_name}'."
-        )
+        logger.warning(f"Request to delete {len(ids)} documents from '{store_name}'.")
         store = self._get_store()
         try:
             store.delete_documents(ids=ids)
             logger.info(
-                f"Successfully deleted {len(ids)} documents (if they existed). Store count now: {store.count_documents()}"
+                f"Successfully requested deletion of {len(ids)} documents from '{store_name}'. Store count now: {store.count_documents()}"
             )
         except Exception as e:
             logger.error(
-                f"Failed to delete documents with IDs {ids} from store '{self.collection_name}': {e}",
+                f"Failed to delete documents with IDs {ids} from store '{store_name}': {e}",
                 exc_info=True,
             )
-            raise RuntimeError(
-                f"Failed to delete documents from store '{self.collection_name}'."
-            ) from e
+            raise RuntimeError(f"Failed to delete documents from store '{store_name}'.") from e

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl