natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/search/haystack_utils.py

@@ -17,10 +17,16 @@ from natural_pdf.search.search_options import (
 # Set up logger for this module
 logger = logging.getLogger(__name__)
 
+# Import sentence-transformers for dimension calculation
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+
 # --- Define flag BEFORE trying Haystack imports ---
 HAS_HAYSTACK_EXTRAS = False # Default to False
 
-# --- Conditional Haystack Imports (Restoring Error Catching with Traceback Logging) ---
+# Conditional Haystack Imports
 try:
     import haystack
     from haystack import Document as HaystackDocument
@@ -30,8 +36,18 @@ try:
         SentenceTransformersTextEmbedder,
     )
     from haystack.document_stores.types import DocumentStore, DuplicatePolicy
-    from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
-    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+
+    # --- REMOVED Chroma Imports ---
+    # from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
+    # from haystack_integrations.document_stores.chroma import ChromaDocumentStore
+    # --- ADDED LanceDB Imports ---
+    try:
+        from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever
+    except ImportError:
+        LanceDBDocumentStore = None
+        LanceDBEmbeddingRetriever = None
+
+    # Removed Chroma Imports
 
     # Keep try/except for optional Cohere
     try:
@@ -39,15 +55,7 @@ try:
     except ImportError:
         CohereRanker = None
 
-    # --- Add ChromaDB embedding function import ---
-    try:
-        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
-    except ImportError:
-        logger.warning(
-            "chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
-        )
-        SentenceTransformerEmbeddingFunction = None
-    # --- End ChromaDB import ---
+    # Removed ChromaDB embedding function import
 
     HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
     logger.debug("Successfully imported Haystack components.")
@@ -67,11 +75,15 @@ except ImportError as e:
     HaystackDocument = Dict # Represent as Dict if not available
     Pipeline = None
     SentenceTransformersTextEmbedder = None
-    ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+    # --- UPDATED Dummies ---
+    LanceDBEmbeddingRetriever = (
+        None # ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+    )
     CohereRanker = None
-    ChromaDocumentStore = None
+    LanceDBDocumentStore = None # ChromaDocumentStore = None
     DuplicatePolicy = None # Dummy for DuplicatePolicy
-    SentenceTransformerEmbeddingFunction = None # Dummy if kept
+    # --- REMOVED Dummies ---
+    # SentenceTransformerEmbeddingFunction = None # Dummy if kept
 
 
 # Helper function to check availability and raise error
@@ -90,34 +102,60 @@ def check_haystack_availability(feature_name: str = "Search"):
 
 
 def create_default_document_store(
-    persist_path: str = "./natural_pdf_index",
-    collection_name: str = "natural_pdf_default",
-    embedding_model: Optional[str] = None, # Allow specifying the model
+    # --- CHANGED persist_path to uri ---
+    uri: str = "./natural_pdf_index",
+    collection_name: str = "natural_pdf_default", # LanceDB calls this table_name
+    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", # Make mandatory for dim calculation
 ) -> DocumentStore:
-    """Creates a default ChromaDB DocumentStore."""
-    check_haystack_availability("create_default_document_store")
+    """Creates a default LanceDB DocumentStore."""
+    check_haystack_availability("create_default_document_store (LanceDB)")
     logger.debug(
-        f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'"
+        f"Creating default LanceDBDocumentStore at uri='{uri}' with table '{collection_name}'"
     )
-    if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
-        raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
+    if not LanceDBDocumentStore:
+        raise RuntimeError("LanceDBDocumentStore is not available despite Haystack extras check.")
+    if not SentenceTransformer:
+        raise ImportError(
+            "sentence-transformers library is required to determine embedding dimensions."
+        )
 
     try:
-        # Note: For Haystack's Chroma integration, the embedding model is typically handled
-        # by the Embedder component in the indexing/query pipeline, not set directly
-        # on the DocumentStore initialization.
-        # The `embedding_model` parameter passed here might be used later to configure that Embedder.
-        store = ChromaDocumentStore(
-            persist_path=persist_path,
-            collection_name=collection_name,
-            # embedding_function parameter removed as it caused issues with Haystack's util
+        # Calculate embedding dimension
+        try:
+            model = SentenceTransformer(embedding_model)
+            embedding_dims = model.get_sentence_embedding_dimension()
+            if not embedding_dims:
+                raise ValueError(
+                    f"Could not determine embedding dimension for model: {embedding_model}"
+                )
+            logger.debug(
+                f"Determined embedding dimension: {embedding_dims} for model '{embedding_model}'"
+            )
+        except Exception as e:
+            logger.error(
+                f"Failed to load SentenceTransformer model '{embedding_model}' to get dimensions: {e}",
+                exc_info=True,
+            )
+            raise RuntimeError(
+                f"Failed to determine embedding dimension for model '{embedding_model}'."
+            ) from e
+
+        # Create LanceDBDocumentStore
+        store = LanceDBDocumentStore(
+            database=uri, # Use uri for the database path
+            table_name=collection_name,
+            embedding_dims=embedding_dims,
+            # LanceDB might require a metadata schema, but let's try without it first for simplicity.
+            # Add `metadata_schema=...` if needed based on lancedb-haystack requirements.
+        )
+        logger.info(
+            f"Initialized LanceDBDocumentStore (Table: {collection_name}, Dims: {embedding_dims}) at uri '{uri}'"
         )
-        logger.info(f"Initialized ChromaDocumentStore (Collection: {collection_name})")
         return store
     except Exception as e:
-        logger.error(f"Failed to initialize ChromaDocumentStore: {e}", exc_info=True)
+        logger.error(f"Failed to initialize LanceDBDocumentStore: {e}", exc_info=True)
         raise RuntimeError(
-            f"Could not create ChromaDocumentStore for collection '{collection_name}'"
+            f"Could not create LanceDBDocumentStore for table '{collection_name}' at uri '{uri}'"
         ) from e
 
 
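The hunk above replaces the Chroma-backed default store with a LanceDB-backed one: `persist_path` becomes `uri`, the collection name maps onto a LanceDB table name, and `embedding_dims` is now derived from the embedding model via sentence-transformers. Below is a minimal usage sketch of the new signature, assuming natural-pdf 0.1.9 is installed with its Haystack/LanceDB search extras; the import path is taken from the file list above and the argument values are simply the defaults shown in the diff.

```python
# Sketch only: exercises the new create_default_document_store() signature shown above.
# Assumes haystack-ai, lancedb-haystack and sentence-transformers are installed.
from natural_pdf.search.haystack_utils import create_default_document_store

store = create_default_document_store(
    uri="./natural_pdf_index",  # LanceDB database path (formerly persist_path)
    collection_name="natural_pdf_default",  # used as the LanceDB table_name
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # also fixes embedding_dims
)
print(type(store).__name__)  # expected: LanceDBDocumentStore when the extras are available
```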
@@ -261,35 +299,32 @@ def create_default_document_embedder(
 def _perform_haystack_search(
     query: Union[str, Path, Image.Image],
     document_store: Any, # Use Any for simplicity now
-    collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
+    collection_name: str, # Passed for clarity, corresponds to table_name in LanceDB
     embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
     options: BaseSearchOptions,
 ) -> List[Dict[str, Any]]:
-    """Internal function to perform search using Haystack components (ChromaEmbeddingRetriever)."""
+    """Internal function to perform search using Haystack components (LanceDBEmbeddingRetriever)."""
     if not HAS_HAYSTACK_EXTRAS:
-        check_haystack_availability("_perform_haystack_search")
+        check_haystack_availability("_perform_haystack_search (LanceDB)")
         return [] # Should not be reached due to check
 
     logger.info(
-        f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})..."
+        f"Performing Haystack search in table '{collection_name}' (using store: {type(document_store).__name__})..."
     )
     logger.debug(f" Query type: {type(query).__name__}")
     logger.debug(f" Options: {options}")
 
-    # --- 1. Embed Query (using the provided text embedder) --- #
+    # Embed Query
     text_query: Optional[str] = None
     query_embedding: Optional[List[float]] = None
 
     if isinstance(query, str):
-        text_query = query # Keep text for potential reranker use
+        text_query = query
         if not embedder:
             logger.error(
                 "Text query provided, but no embedder instance was passed to _perform_haystack_search."
             )
             return []
-        # No need to check type if the type hint is enforced upstream
-        # if not isinstance(embedder, SentenceTransformersTextEmbedder):
-        #     logger.warning(f"Provided embedder is {type(embedder).__name__}, not SentenceTransformersTextEmbedder. Assuming it works like one for query embedding.")
         try:
             logger.debug(f"Running embedder {type(embedder).__name__} on query text...")
             embedding_result = embedder.run(text=text_query)
@@ -306,24 +341,21 @@
             logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
             return []
     elif isinstance(query, Path) or isinstance(query, Image.Image):
-        # Currently, this function doesn't support multi-modal query embedding directly
         logger.error(
             f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
         )
         return []
     else:
-        # Handle other unexpected types
         logger.error(f"Unsupported query type: {type(query).__name__}. Requires text.")
+        return []
 
-    # If we didn't get an embedding (e.g., non-text query), we can't proceed
     if query_embedding is None:
         logger.error("Could not obtain query embedding. Cannot perform search.")
         return []
 
-    # --- 2. Set up Retriever --- #
-    # Assumes the document_store is ChromaDocumentStore for this utility function context
-    if not ChromaEmbeddingRetriever:
-        logger.error("ChromaEmbeddingRetriever not available.")
+    # Set up Retriever
+    if not LanceDBEmbeddingRetriever:
+        logger.error("LanceDBEmbeddingRetriever not available.")
         return []
 
     # Ensure retriever_top_k is set (should be by __post_init__)
@@ -335,28 +367,26 @@
         retriever_top_k = options.top_k
 
     # Instantiate the EMBEDDING retriever
-    retriever = ChromaEmbeddingRetriever(
+    retriever = LanceDBEmbeddingRetriever(
         document_store=document_store,
         filters=options.filters or {}, # Pass filters here
         top_k=retriever_top_k,
     )
 
     logger.debug(
-        f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
+        f"Initialized LanceDBEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
     )
 
-    # --- 3. Set up Optional Reranker --- #
+    # Set up Optional Reranker
     reranker_instance = None
-    if options.use_reranker in [True, None]: # Check specifically for True or None
+    if options.use_reranker in [True, None]:
         logger.debug("Attempting to initialize reranker...")
-        # Currently only supports default text reranker (Cohere)
        reranker_instance = create_default_text_reranker(
            api_key=options.reranker_api_key,
            model_name=options.reranker_model or "rerank-english-v2.0",
        )
        if reranker_instance:
-            # Ensure reranker top_k matches final desired top_k
-            reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
+            reranker_instance.top_k = options.top_k
            logger.info(
                f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
            )
@@ -365,7 +395,7 @@
                "Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
            )
 
-    # --- 4. Build and Run Pipeline --- #
+    # Build and Run Pipeline
     if not Pipeline:
         logger.error("Haystack Pipeline class not available.")
         return []
@@ -380,28 +410,25 @@
     if reranker_instance:
         search_pipeline.add_component("reranker", reranker_instance)
         search_pipeline.connect("retriever.documents", "reranker.documents")
-        # Reranker also needs the query text and final top_k
         if text_query is None:
             logger.error(
                 "Reranker requires text query, but it was not available (query might not have been text)."
             )
-            # Handle this case - maybe skip reranker or raise error?
-            # For now, let's skip reranker if text is missing
             logger.warning("Skipping reranker because text query is missing.")
-            reranker_instance = None # Effectively remove it from the logic below
-            last_component_name = "retriever" # Reset last component
-            # Remove reranker component if added? Less clean. Let's just not add its input.
+            reranker_instance = None
+            last_component_name = "retriever"
         else:
             pipeline_input["reranker"] = {
                 "query": text_query,
                 "top_k": options.top_k,
-            } # Pass query and final top_k
+            }
             last_component_name = "reranker"
             logger.debug("Added reranker to pipeline and configured input.")
     else:
-        # No reranker was initialized or it was skipped
-        last_component_name = "reranker"
-        logger.debug("Added reranker to pipeline.")
+        # --- Fix: last_component_name should only be 'reranker' if it was added ---
+        # if reranker_instance was initialized and added, last_component_name is 'reranker'
+        # if not, it remains 'retriever'
+        pass # No change needed here if reranker wasn't added
 
     logger.info("Running Haystack search pipeline...")
     try:
@@ -412,9 +439,8 @@
         logger.error(f"Haystack search pipeline failed: {e}", exc_info=True)
         return []
 
-    # --- 5. Process Results --- #
+    # Process Results
     final_documents: List[HaystackDocument] = []
-    # Check output based on last component in the pipeline
     if last_component_name in result and result[last_component_name].get("documents"):
         final_documents = result[last_component_name]["documents"]
         logger.debug(
@@ -428,8 +454,7 @@
 
     # Convert Haystack Documents to the desired output format
     output_results = []
-    for doc in final_documents: # Correctly loop over final_documents
-        # Check if doc is actually a Haystack Document object or potentially a dict
+    for doc in final_documents:
         doc_id = getattr(doc, "id", None)
         doc_score = getattr(doc, "score", 0.0)
         doc_content = getattr(doc, "content", None)
@@ -439,10 +464,9 @@
         output = {
             "pdf_path": meta.get("pdf_path", "Unknown"),
             "page_number": meta.get("page_number", -1),
-            "score": doc_score if doc_score is not None else 0.0, # Handle potential None score
-            "content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
+            "score": doc_score if doc_score is not None else 0.0,
+            "content_snippet": doc_content[:200] + "..." if doc_content else "",
             "metadata": meta,
-            # "haystack_document": doc # Optionally include the full Haystack doc
         }
         output_results.append(output)
 
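Across the hunks above, `_perform_haystack_search` keeps the same embed → retrieve → optionally rerank flow but swaps `ChromaEmbeddingRetriever` for `LanceDBEmbeddingRetriever` and returns `pdf_path`/`page_number`/`score`/`content_snippet` dictionaries. The sketch below mirrors that query path outside natural-pdf. Only the constructor arguments visible in the diff are taken as given; the Haystack 2.x embedder calls (`warm_up()`, `run(text=...)["embedding"]`) and the retriever's `query_embedding` input name follow the standard Haystack embedding-retriever interface and are assumptions, not something this diff confirms.

```python
# Sketch of the LanceDB-backed query path implemented in _perform_haystack_search above.
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersTextEmbedder
from lancedb_haystack import LanceDBDocumentStore, LanceDBEmbeddingRetriever

store = LanceDBDocumentStore(
    database="./natural_pdf_index",
    table_name="natural_pdf_default",
    embedding_dims=384,  # all-MiniLM-L6-v2 embeddings are 384-dimensional
    # the diff notes metadata_schema=... may also be required by lancedb-haystack
)

# Embed the query text, mirroring embedder.run(text=text_query) in the diff.
embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
embedder.warm_up()
query_embedding = embedder.run(text="total revenue by year")["embedding"]

# Retrieve by embedding; filters/top_k match the retriever construction in the diff.
pipeline = Pipeline()
pipeline.add_component(
    "retriever", LanceDBEmbeddingRetriever(document_store=store, filters={}, top_k=10)
)
result = pipeline.run({"retriever": {"query_embedding": query_embedding}})

for doc in result["retriever"]["documents"]:
    # pdf_path / page_number metadata keys come from the result conversion in the diff
    print(doc.score, doc.meta.get("pdf_path"), doc.meta.get("page_number"))
```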
natural_pdf/search/search_service_protocol.py

@@ -68,12 +68,11 @@ class SearchServiceProtocol(Protocol):
     Protocol defining the interface for indexing and searching documents.
 
     Implementations of this protocol handle the specifics of interacting
-    with a chosen search backend (e.g., Haystack with ChromaDB, Haystack In-Memory).
-    An instance of a service implementing this protocol is tied to a specific collection name.
+    with a chosen search backend (e.g., Haystack with LanceDB, Haystack In-Memory).
+    An instance of a service implementing this protocol is tied to a specific index name (e.g., table name).
     """
 
     collection_name: str
-    # Removed internal state hints (_persist, _embedding_model) - implementation detail
 
     def index(
         self,
@@ -82,7 +81,7 @@ class SearchServiceProtocol(Protocol):
         force_reindex: bool = False,
     ) -> None:
         """
-        Indexes the provided documents into the collection managed by this service instance.
+        Indexes the provided documents into the index/table managed by this service instance.
 
         Handles store and embedder creation/retrieval, configuration checks,
         re-indexing logic (including potential deletion), embedding, and writing.
@@ -91,12 +90,12 @@ class SearchServiceProtocol(Protocol):
            documents: An iterable of objects conforming to the Indexable protocol.
            embedder_device: The device ('cpu', 'cuda', etc.) for the embedder.
                Defaults defined by the implementation.
-            force_reindex: If True, delete the entire existing collection
+            force_reindex: If True, delete the entire existing index/table
                (if configuration permits) before indexing.
 
        Raises:
            IndexConfigurationError: If `force_reindex` is False and the existing
-                collection has incompatible settings.
+                index/table has incompatible settings.
            ImportError: If required backend libraries are missing.
            RuntimeError: For other failures during indexing.
        """
@@ -104,11 +103,11 @@ class SearchServiceProtocol(Protocol):
 
     def search(
         self,
-        query: Any, # Allow any query type, service implementation handles it
+        query: Any,
        options: BaseSearchOptions,
    ) -> List[Dict[str, Any]]:
        """
-        Performs a search within the collection managed by this service instance.
+        Performs a search within the index/table managed by this service instance.
 
        Args:
            query: The search query (type depends on service capabilities).
@@ -119,7 +118,7 @@ class SearchServiceProtocol(Protocol):
                metadata, and relevance scores.
 
        Raises:
-            FileNotFoundError: If the collection managed by this service does not exist.
+            FileNotFoundError: If the index/table managed by this service does not exist or path is invalid.
            RuntimeError: For other failures during search.
            TypeError: If the query type is incompatible with the backend/options.
        """
@@ -129,10 +128,10 @@ class SearchServiceProtocol(Protocol):
        self,
    ) -> bool:
        """
-        Deletes the entire index/collection managed by this service instance.
+        Deletes the entire index/table managed by this service instance.
 
        Returns:
-            True if deletion was successful or collection didn't exist,
+            True if deletion was successful or index/table didn't exist,
            False if deletion failed.
 
        Raises:
@@ -145,7 +144,7 @@ class SearchServiceProtocol(Protocol):
        self,
    ) -> bool:
        """
-        Checks if the index/collection managed by this service instance exists.
+        Checks if the index/table managed by this service instance exists.
 
        Returns:
            True if the index exists, False otherwise.
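The protocol edits above are docstring-level renames (collection → index/table); the method signatures are unchanged. As a hedged illustration of how a caller might drive the `search()` method defined by the protocol, here is a small helper. The concrete service object, the result-dictionary keys (taken from the haystack_utils conversion shown earlier), and the `BaseSearchOptions` import path are assumptions rather than part of this diff.

```python
# Sketch: consuming a SearchServiceProtocol implementation's search() method.
from typing import Any, Dict, List

from natural_pdf.search.search_options import BaseSearchOptions  # path per the import seen above


def run_query(service: Any, query: Any, options: BaseSearchOptions) -> List[Dict[str, Any]]:
    """Run a query against an already-indexed service, tolerating a missing index."""
    try:
        hits = service.search(query, options)
    except FileNotFoundError:
        # Per the protocol docstring: raised when the index/table does not exist yet.
        return []
    for hit in hits:
        # Keys mirror the result dictionaries built in haystack_utils above.
        print(hit.get("score"), hit.get("pdf_path"), hit.get("page_number"))
    return hits
```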