PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl - Mend

natural-pdf 0.1.4-py3-none-any.whl → 0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141)

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +209 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1710 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +288 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +413 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +512 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +604 -0
docs/tutorials/12-ocr-integration.md +175 -0
docs/tutorials/13-semantic-search.ipynb +1328 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +50 -33
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/gemini.py +264 -0
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +125 -58
natural_pdf/analyzers/layout/layout_options.py +43 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +89 -45
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +146 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +419 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +1044 -521
natural_pdf/core/pdf.py +516 -313
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +307 -225
natural_pdf/elements/collections.py +805 -543
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +889 -879
natural_pdf/elements/text.py +127 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +57 -35
natural_pdf/ocr/engine.py +150 -46
natural_pdf/ocr/engine_easyocr.py +146 -150
natural_pdf/ocr/engine_paddle.py +118 -175
natural_pdf/ocr/engine_surya.py +78 -141
natural_pdf/ocr/ocr_factory.py +114 -0
natural_pdf/ocr/ocr_manager.py +122 -124
natural_pdf/ocr/ocr_options.py +16 -20
natural_pdf/ocr/utils.py +98 -0
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/templates/spa/css/style.css +334 -0
natural_pdf/templates/spa/index.html +31 -0
natural_pdf/templates/spa/js/app.js +472 -0
natural_pdf/templates/spa/words.txt +235976 -0
natural_pdf/utils/debug.py +32 -0
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/identifiers.py +29 -0
natural_pdf/utils/packaging.py +418 -0
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/METADATA +53 -17
natural_pdf-0.1.6.dist-info/RECORD +141 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/WHEEL +1 -1
natural_pdf-0.1.6.dist-info/top_level.txt +4 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
natural_pdf/templates/ocr_debug.html +0 -517
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.6.dist-info}/licenses/LICENSE +0 -0

natural_pdf/search/haystack_utils.py CHANGED Viewed

@@ -1,30 +1,38 @@
 # natural_pdf/search/haystack_utils.py
 import logging
 import os
-from typing import Optional, Dict, Any, List, Union, Tuple, Type
-from pathlib import Path
-from PIL import Image # Ensure Image is imported unconditionally
 import warnings
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from PIL import Image  # Ensure Image is imported unconditionally
-from natural_pdf.search.search_options import SearchOptions, TextSearchOptions, MultiModalSearchOptions, BaseSearchOptions
+from natural_pdf.search.search_options import (
+    BaseSearchOptions,
+    MultiModalSearchOptions,
+    SearchOptions,
+    TextSearchOptions,
+)
 # Set up logger for this module
 logger = logging.getLogger(__name__)
 # --- Define flag BEFORE trying Haystack imports ---
-HAS_HAYSTACK_EXTRAS = False # Default to False
+HAS_HAYSTACK_EXTRAS = False  # Default to False
 # --- Conditional Haystack Imports (Restoring Error Catching with Traceback Logging) ---
 try:
     import haystack
-    from haystack import Document as HaystackDocument, Pipeline
-    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
-    from haystack.document_stores.types import DuplicatePolicy, DocumentStore
+    from haystack import Document as HaystackDocument
+    from haystack import Pipeline
     from haystack.components.embedders import (
+        SentenceTransformersDocumentEmbedder,
         SentenceTransformersTextEmbedder,
-        SentenceTransformersDocumentEmbedder
     )
+    from haystack.document_stores.types import DocumentStore, DuplicatePolicy
     from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
+    from haystack_integrations.document_stores.chroma import ChromaDocumentStore
     # Keep try/except for optional Cohere
     try:
         from haystack.components.rankers import CohereRanker
@@ -33,33 +41,37 @@ try:
     # --- Add ChromaDB embedding function import ---
     try:
-         from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
+        from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
     except ImportError:
-         logger.warning("chromadb library not found. Custom embedding models for ChromaDocumentStore may not work.")
-         SentenceTransformerEmbeddingFunction = None
+        logger.warning(
+            "chromadb library not found. Custom embedding models for ChromaDocumentStore may not work."
+        )
+        SentenceTransformerEmbeddingFunction = None
     # --- End ChromaDB import ---
-    HAS_HAYSTACK_EXTRAS = True # Set to True if imports succeed
+    HAS_HAYSTACK_EXTRAS = True  # Set to True if imports succeed
     logger.debug("Successfully imported Haystack components.")
 except ImportError as e:
     # HAS_HAYSTACK_EXTRAS remains False
     # Log the full error and traceback for debugging
-    logger.error(f"Failed to import Haystack components. Search functionality disabled. Error: {e}", exc_info=True)
+    logger.warning(
+        f"Failed to import Haystack components. Semantic search functionality disabled.",
+    )
     # Define dummy types/classes for type hinting and basic checks when extras aren't installed
     BaseDocumentStore = object
-    DocumentStore = object # Dummy for protocol
-    BaseEmbedder = object # Define dummy BaseEmbedder
+    DocumentStore = object  # Dummy for protocol
+    BaseEmbedder = object  # Define dummy BaseEmbedder
     BaseTextEmbedder = object
-    HaystackDocument = Dict # Represent as Dict if not available
+    HaystackDocument = Dict  # Represent as Dict if not available
     Pipeline = None
     SentenceTransformersTextEmbedder = None
-    ChromaEmbeddingRetriever = None # Dummy for Embedding Retriever
+    ChromaEmbeddingRetriever = None  # Dummy for Embedding Retriever
     CohereRanker = None
     ChromaDocumentStore = None
-    DuplicatePolicy = None # Dummy for DuplicatePolicy
-    SentenceTransformerEmbeddingFunction = None # Dummy if kept
+    DuplicatePolicy = None  # Dummy for DuplicatePolicy
+    SentenceTransformerEmbeddingFunction = None  # Dummy if kept
 # Helper function to check availability and raise error
@@ -76,16 +88,19 @@ def check_haystack_availability(feature_name: str = "Search"):
 # Default Component Creators
 # ===========================
 def create_default_document_store(
     persist_path: str = "./natural_pdf_index",
     collection_name: str = "natural_pdf_default",
-    embedding_model: Optional[str] = None # Allow specifying the model
+    embedding_model: Optional[str] = None,  # Allow specifying the model
 ) -> DocumentStore:
     """Creates a default ChromaDB DocumentStore."""
     check_haystack_availability("create_default_document_store")
-    logger.debug(f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'")
-    if not ChromaDocumentStore: # Should be caught by check_haystack_availability, but double-check
-         raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
+    logger.debug(
+        f"Creating default ChromaDocumentStore at '{persist_path}' with collection '{collection_name}'"
+    )
+    if not ChromaDocumentStore:  # Should be caught by check_haystack_availability, but double-check
+        raise RuntimeError("ChromaDocumentStore is not available despite Haystack extras check.")
     try:
         # Note: For Haystack's Chroma integration, the embedding model is typically handled
@@ -101,11 +116,14 @@ def create_default_document_store(
         return store
     except Exception as e:
         logger.error(f"Failed to initialize ChromaDocumentStore: {e}", exc_info=True)
-        raise RuntimeError(f"Could not create ChromaDocumentStore for collection '{collection_name}'") from e
+        raise RuntimeError(
+            f"Could not create ChromaDocumentStore for collection '{collection_name}'"
+        ) from e
 def create_default_text_embedder(
     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
-    device: Optional[str] = None # Add device parameter
+    device: Optional[str] = None,  # Add device parameter
 ) -> SentenceTransformersTextEmbedder:
     """Creates a default SentenceTransformer text embedder."""
     check_haystack_availability("create_default_text_embedder")
@@ -115,11 +133,16 @@ def create_default_text_embedder(
     try:
         # Use Haystack component which handles device logic
         embedder = SentenceTransformersTextEmbedder(model=model_name, device=device)
-        logger.info(f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})")
+        logger.info(
+            f"Initialized SentenceTransformersTextEmbedder (Model: {model_name}, Device: {embedder.device})"
+        )
         return embedder
     except Exception as e:
         logger.error(f"Failed to initialize SentenceTransformersTextEmbedder: {e}", exc_info=True)
-        raise RuntimeError(f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'") from e
+        raise RuntimeError(
+            f"Could not create SentenceTransformersTextEmbedder with model '{model_name}'"
+        ) from e
 def create_default_multimodal_embedder(*args, **kwargs) -> Any:
     """Stub for creating a default multimodal embedder (Not Implemented)."""
@@ -129,47 +152,52 @@ def create_default_multimodal_embedder(*args, **kwargs) -> Any:
         " See: https://docs.haystack.deepset.ai/docs/custom-components"
     )
 def create_default_text_reranker(
-        api_key: Optional[str] = None,
-        model_name: str = "rerank-english-v2.0" # Default Cohere model
-    ) -> Optional[Any]: # Returns CohereRanker instance or None
-     """
-     Creates a default Cohere Reranker if available and API key provided.
-     Requires COHERE_API_KEY environment variable or api_key argument.
-     Requires haystack-cohere integration: pip install haystack-cohere
-     """
-     check_haystack_availability("create_default_text_reranker (optional)")
-     if not CohereRanker:
-          logger.debug("CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation.")
-          return None
-     # Check for API key (prefer argument over environment variable)
-     cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
-     if not cohere_api_key:
-         logger.warning("COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker.")
-         return None
-     logger.debug(f"Creating CohereRanker with model '{model_name}'")
-     try:
-         # Pass API key via authenticator for better practice if supported, or directly
-         # As of haystack 2.0b5, CohereRanker takes api_key directly
-         reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
-         logger.info(f"Initialized CohereRanker (Model: {model_name})")
-         return reranker
-     except Exception as e:
-         logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
-         # Don't raise, just return None as reranker is optional
-         return None
+    api_key: Optional[str] = None, model_name: str = "rerank-english-v2.0"  # Default Cohere model
+) -> Optional[Any]:  # Returns CohereRanker instance or None
+    """
+    Creates a default Cohere Reranker if available and API key provided.
+    Requires COHERE_API_KEY environment variable or api_key argument.
+    Requires haystack-cohere integration: pip install haystack-cohere
+    """
+    check_haystack_availability("create_default_text_reranker (optional)")
+    if not CohereRanker:
+        logger.debug(
+            "CohereRanker component not available (haystack-cohere likely not installed). Skipping reranker creation."
+        )
+        return None
+    # Check for API key (prefer argument over environment variable)
+    cohere_api_key = api_key or os.environ.get("COHERE_API_KEY")
+    if not cohere_api_key:
+        logger.warning(
+            "COHERE_API_KEY not found in arguments or environment variables. Cannot create Cohere Reranker."
+        )
+        return None
+    logger.debug(f"Creating CohereRanker with model '{model_name}'")
+    try:
+        # Pass API key via authenticator for better practice if supported, or directly
+        # As of haystack 2.0b5, CohereRanker takes api_key directly
+        reranker = CohereRanker(api_key=cohere_api_key, model=model_name)
+        logger.info(f"Initialized CohereRanker (Model: {model_name})")
+        return reranker
+    except Exception as e:
+        logger.error(f"Failed to initialize CohereRanker: {e}", exc_info=True)
+        # Don't raise, just return None as reranker is optional
+        return None
 # --- Default Document Embedder Creator ---
 def create_default_document_embedder(
     model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
     device: Optional[str] = None,
     progress_bar: bool = True,
-    normalize_embeddings: bool = False # Changed default based on ST documentation
-) -> Any: # Return Any as actual type depends on availability
+    normalize_embeddings: bool = False,  # Changed default based on ST documentation
+) -> Any:  # Return Any as actual type depends on availability
     """Creates a default SentenceTransformersDocumentEmbedder instance.
     Args:
@@ -192,7 +220,9 @@ def create_default_document_embedder(
     # Use the provided device parameter directly.
     # If None, Haystack component will likely pick a default (e.g., 'cpu' or 'cuda' if available)
     resolved_device = device
-    logger.debug(f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}")
+    logger.debug(
+        f"Attempting to create SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {resolved_device or 'auto'}"
+    )
     try:
         embedder = SentenceTransformersDocumentEmbedder(
@@ -204,10 +234,16 @@ def create_default_document_embedder(
             # If embedding meta fields is needed, it should be passed as a parameter
         )
         embedder.warm_up()
-        logger.info(f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}") # Use embedder.device after init
+        logger.info(
+            f"Created SentenceTransformersDocumentEmbedder. Model: {model_name}, Device: {embedder.device}"
+        )  # Use embedder.device after init
     except Exception as e:
-        logger.error(f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True)
-        raise RuntimeError(f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'.") from e
+        logger.error(
+            f"Failed to initialize SentenceTransformersDocumentEmbedder: {e}", exc_info=True
+        )
+        raise RuntimeError(
+            f"Failed to initialize SentenceTransformersDocumentEmbedder with model '{model_name}'."
+        ) from e
     return embedder
@@ -221,19 +257,22 @@ def create_default_document_embedder(
 # Central Search Logic
 # ===========================
 def _perform_haystack_search(
     query: Union[str, Path, Image.Image],
-    document_store: Any, # Use Any for simplicity now
-    collection_name: str, # Passed for clarity, but Chroma store instance is collection-specific
-    embedder: SentenceTransformersTextEmbedder, # Explicitly expect a text embedder for queries
-    options: BaseSearchOptions
+    document_store: Any,  # Use Any for simplicity now
+    collection_name: str,  # Passed for clarity, but Chroma store instance is collection-specific
+    embedder: SentenceTransformersTextEmbedder,  # Explicitly expect a text embedder for queries
+    options: BaseSearchOptions,
 ) -> List[Dict[str, Any]]:
     """Internal function to perform search using Haystack components (ChromaEmbeddingRetriever)."""
     if not HAS_HAYSTACK_EXTRAS:
         check_haystack_availability("_perform_haystack_search")
-        return [] # Should not be reached due to check
+        return []  # Should not be reached due to check
-    logger.info(f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})...")
+    logger.info(
+        f"Performing Haystack search in collection '{collection_name}' (using store: {type(document_store).__name__})..."
+    )
     logger.debug(f"  Query type: {type(query).__name__}")
     logger.debug(f"  Options: {options}")
@@ -242,9 +281,11 @@ def _perform_haystack_search(
     query_embedding: Optional[List[float]] = None
     if isinstance(query, str):
-        text_query = query # Keep text for potential reranker use
+        text_query = query  # Keep text for potential reranker use
         if not embedder:
-            logger.error("Text query provided, but no embedder instance was passed to _perform_haystack_search.")
+            logger.error(
+                "Text query provided, but no embedder instance was passed to _perform_haystack_search."
+            )
             return []
         # No need to check type if the type hint is enforced upstream
         # if not isinstance(embedder, SentenceTransformersTextEmbedder):
@@ -254,15 +295,21 @@ def _perform_haystack_search(
             embedding_result = embedder.run(text=text_query)
             query_embedding = embedding_result.get("embedding")
             if not query_embedding:
-                 logger.error(f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'")
-                 return []
-            logger.debug(f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking.")
+                logger.error(
+                    f"Embedder {type(embedder).__name__} failed to return an embedding for the query: '{text_query[:100]}...'"
+                )
+                return []
+            logger.debug(
+                f"Generated query embedding (Dim: {len(query_embedding)}). Text kept for potential reranking."
+            )
         except Exception as e:
             logger.error(f"Failed to run text embedder on query text: {e}", exc_info=True)
             return []
     elif isinstance(query, Path) or isinstance(query, Image.Image):
         # Currently, this function doesn't support multi-modal query embedding directly
-        logger.error(f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text.")
+        logger.error(
+            f"Unsupported query type ({type(query).__name__}) for embedding in _perform_haystack_search. Requires text."
+        )
         return []
     else:
         # Handle other unexpected types
@@ -276,42 +323,52 @@ def _perform_haystack_search(
     # --- 2. Set up Retriever --- #
     # Assumes the document_store is ChromaDocumentStore for this utility function context
     if not ChromaEmbeddingRetriever:
-         logger.error("ChromaEmbeddingRetriever not available.")
-         return []
+        logger.error("ChromaEmbeddingRetriever not available.")
+        return []
     # Ensure retriever_top_k is set (should be by __post_init__)
     retriever_top_k = options.retriever_top_k
     if retriever_top_k is None:
-         logger.warning("options.retriever_top_k was None, defaulting to options.top_k for retriever.")
-         retriever_top_k = options.top_k
+        logger.warning(
+            "options.retriever_top_k was None, defaulting to options.top_k for retriever."
+        )
+        retriever_top_k = options.top_k
     # Instantiate the EMBEDDING retriever
-    retriever = ChromaEmbeddingRetriever(document_store=document_store,
-                                         filters=options.filters or {}, # Pass filters here
-                                         top_k=retriever_top_k)
-    logger.debug(f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})")
+    retriever = ChromaEmbeddingRetriever(
+        document_store=document_store,
+        filters=options.filters or {},  # Pass filters here
+        top_k=retriever_top_k,
+    )
+    logger.debug(
+        f"Initialized ChromaEmbeddingRetriever (Top K: {retriever.top_k}, Filters: {retriever.filters})"
+    )
     # --- 3. Set up Optional Reranker --- #
     reranker_instance = None
-    if options.use_reranker in [True, None]: # Check specifically for True or None
+    if options.use_reranker in [True, None]:  # Check specifically for True or None
         logger.debug("Attempting to initialize reranker...")
         # Currently only supports default text reranker (Cohere)
-        reranker_instance = create_default_text_reranker(api_key=options.reranker_api_key,
-                                                         model_name=options.reranker_model or "rerank-english-v2.0")
+        reranker_instance = create_default_text_reranker(
+            api_key=options.reranker_api_key,
+            model_name=options.reranker_model or "rerank-english-v2.0",
+        )
         if reranker_instance:
-             # Ensure reranker top_k matches final desired top_k
-             reranker_instance.top_k = options.top_k # Set the final top_k for the reranker
-             logger.info(f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})")
+            # Ensure reranker top_k matches final desired top_k
+            reranker_instance.top_k = options.top_k  # Set the final top_k for the reranker
+            logger.info(
+                f"Using reranker: {type(reranker_instance).__name__} (Final Top K: {options.top_k})"
+            )
         else:
-            logger.warning("Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking.")
+            logger.warning(
+                "Reranker requested (use_reranker=True/None) but could not be initialized (check API key/installation). Proceeding without reranking."
+            )
     # --- 4. Build and Run Pipeline --- #
     if not Pipeline:
-         logger.error("Haystack Pipeline class not available.")
-         return []
+        logger.error("Haystack Pipeline class not available.")
+        return []
     search_pipeline = Pipeline()
     search_pipeline.add_component("retriever", retriever)
@@ -325,15 +382,20 @@ def _perform_haystack_search(
         search_pipeline.connect("retriever.documents", "reranker.documents")
         # Reranker also needs the query text and final top_k
         if text_query is None:
-            logger.error("Reranker requires text query, but it was not available (query might not have been text).")
+            logger.error(
+                "Reranker requires text query, but it was not available (query might not have been text)."
+            )
             # Handle this case - maybe skip reranker or raise error?
             # For now, let's skip reranker if text is missing
             logger.warning("Skipping reranker because text query is missing.")
-            reranker_instance = None # Effectively remove it from the logic below
-            last_component_name = "retriever" # Reset last component
+            reranker_instance = None  # Effectively remove it from the logic below
+            last_component_name = "retriever"  # Reset last component
             # Remove reranker component if added? Less clean. Let's just not add its input.
         else:
-            pipeline_input["reranker"] = {"query": text_query, "top_k": options.top_k} # Pass query and final top_k
+            pipeline_input["reranker"] = {
+                "query": text_query,
+                "top_k": options.top_k,
+            }  # Pass query and final top_k
             last_component_name = "reranker"
             logger.debug("Added reranker to pipeline and configured input.")
     else:
@@ -341,7 +403,6 @@ def _perform_haystack_search(
         last_component_name = "reranker"
         logger.debug("Added reranker to pipeline.")
     logger.info("Running Haystack search pipeline...")
     try:
         result = search_pipeline.run(pipeline_input)
@@ -356,31 +417,34 @@ def _perform_haystack_search(
     # Check output based on last component in the pipeline
     if last_component_name in result and result[last_component_name].get("documents"):
         final_documents = result[last_component_name]["documents"]
-        logger.debug(f"Processed results from '{last_component_name}' ({len(final_documents)} documents).")
+        logger.debug(
+            f"Processed results from '{last_component_name}' ({len(final_documents)} documents)."
+        )
     else:
-        logger.warning(f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}")
+        logger.warning(
+            f"Search pipeline component '{last_component_name}' returned no documents or unexpected output format. Result keys: {result.keys()}"
+        )
         return []
     # Convert Haystack Documents to the desired output format
     output_results = []
-    for doc in final_documents: # Correctly loop over final_documents
-         # Check if doc is actually a Haystack Document object or potentially a dict
-         doc_id = getattr(doc, 'id', None)
-         doc_score = getattr(doc, 'score', 0.0)
-         doc_content = getattr(doc, 'content', None)
-         doc_meta = getattr(doc, 'meta', {})
-         meta = doc_meta or {}
-         output = {
-             "pdf_path": meta.get("pdf_path", "Unknown"),
-             "page_number": meta.get("page_number", -1),
-             "score": doc_score if doc_score is not None else 0.0, # Handle potential None score
-             "content_snippet": doc_content[:200] + "..." if doc_content else "", # Add snippet
-             "metadata": meta,
-             # "haystack_document": doc # Optionally include the full Haystack doc
-         }
-         output_results.append(output)
+    for doc in final_documents:  # Correctly loop over final_documents
+        # Check if doc is actually a Haystack Document object or potentially a dict
+        doc_id = getattr(doc, "id", None)
+        doc_score = getattr(doc, "score", 0.0)
+        doc_content = getattr(doc, "content", None)
+        doc_meta = getattr(doc, "meta", {})
+        meta = doc_meta or {}
+        output = {
+            "pdf_path": meta.get("pdf_path", "Unknown"),
+            "page_number": meta.get("page_number", -1),
+            "score": doc_score if doc_score is not None else 0.0,  # Handle potential None score
+            "content_snippet": doc_content[:200] + "..." if doc_content else "",  # Add snippet
+            "metadata": meta,
+            # "haystack_document": doc # Optionally include the full Haystack doc
+        }
+        output_results.append(output)
     logger.info(f"Returning {len(output_results)} relevant results.")
     return output_results

natural_pdf/search/search_options.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
 from dataclasses import dataclass, field
-from typing import List, Optional, Dict, Any, Tuple, Union, Literal
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
 # Use object placeholders for external types to avoid direct dependency
 BaseRanker = object
@@ -8,10 +8,12 @@ BaseEmbedder = object
 logger = logging.getLogger(__name__)
 # --- Base Search Options ---
 @dataclass
 class BaseSearchOptions:
     """Base options for search operations."""
     # How many results to return finally (after retrieval and optional reranking)
     top_k: int = 10
     # How many candidates the retriever should fetch initially (relevant if reranking)
@@ -22,12 +24,12 @@ class BaseSearchOptions:
     # --- Reranking Configuration ---
     # Option 1: Simple boolean/None
-    use_reranker: Optional[bool] = True # True=use default Cohere, False/None=disable
+    use_reranker: Optional[bool] = True  # True=use default Cohere, False/None=disable
     # Option 2: Provide a specific instance (takes precedence over use_reranker boolean)
     reranker_instance: Optional[BaseRanker] = None
     # Parameters for default Cohere reranker (if use_reranker=True)
-    reranker_model: Optional[str] = None # Defaults to "rerank-english-v2.0" in util
-    reranker_api_key: Optional[str] = None # Defaults to COHERE_API_KEY env var
+    reranker_model: Optional[str] = None  # Defaults to "rerank-english-v2.0" in util
+    reranker_api_key: Optional[str] = None  # Defaults to COHERE_API_KEY env var
     # --- Embedder Configuration (Less common to override per-query, usually set at indexing) ---
     # embedder_instance: Optional[BaseEmbedder] = None # Might be useful for advanced cases
@@ -35,38 +37,47 @@ class BaseSearchOptions:
     def __post_init__(self):
         # Validate that top_k values make sense
         if self.retriever_top_k is None:
-             # If retriever_top_k isn't set, default it based on reranking needs
-             if self.use_reranker:
-                 self.retriever_top_k = max(self.top_k * 2, 20) # Fetch more if reranking
-             else:
-                 self.retriever_top_k = self.top_k
+            # If retriever_top_k isn't set, default it based on reranking needs
+            if self.use_reranker:
+                self.retriever_top_k = max(self.top_k * 2, 20)  # Fetch more if reranking
+            else:
+                self.retriever_top_k = self.top_k
         elif self.retriever_top_k < self.top_k:
-            logger.warning(f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results.")
+            logger.warning(
+                f"retriever_top_k ({self.retriever_top_k}) is less than top_k ({self.top_k}). Retriever should fetch at least as many candidates as the final desired results."
+            )
 # --- Text Search Specific Options ---
 @dataclass
 class TextSearchOptions(BaseSearchOptions):
     """Options specific to text-based semantic search."""
     # Add any text-specific overrides or parameters here if needed in the future
     # e.g., specifying default text reranker model name if different defaults emerge
     # default_text_reranker_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
-    pass # Currently inherits all base options
+    pass  # Currently inherits all base options
 # --- MultiModal Search Specific Options ---
 @dataclass
 class MultiModalSearchOptions(BaseSearchOptions):
     """Options specific to multimodal semantic search."""
     # Flag to potentially use a default multimodal reranker if available
     # (overrides base use_reranker=True if reranker_instance is None)
-    use_multimodal_reranker: bool = True # Attempt multimodal rerank if use_reranker=True/None and no instance given
+    use_multimodal_reranker: bool = (
+        True  # Attempt multimodal rerank if use_reranker=True/None and no instance given
+    )
     # e.g., specifying default multimodal embedder/reranker models
     # default_multimodal_embedder_model: str = "sentence-transformers/clip-ViT-B-32-multilingual-v1"
     # default_multimodal_reranker_model: str = "jinaai/jina-reranker-m0" # Example
 # --- Union Type ---
 # Defines the types allowed for search configuration.
 SearchOptions = Union[
     TextSearchOptions,
     MultiModalSearchOptions,
-    BaseSearchOptions # Include base for typing flexibility
-]
+    BaseSearchOptions,  # Include base for typing flexibility
+]

natural_pdf/search/search_service_protocol.py CHANGED Viewed

@@ -1,15 +1,18 @@
 """Defines the protocol for a search service."""
-from typing import Protocol, List, Dict, Any, Optional, Union, Iterable
 from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional, Protocol, Union
 from PIL import Image
+# Forward declare SearchOptions to avoid circular import if needed,
+# or import if structure allows (assuming it's safe here)
+from natural_pdf.search.search_options import BaseSearchOptions, SearchOptions
 # Use typing_extensions for Python < 3.8 compatibility if needed,
 # otherwise, typing.Protocol is fine for >= 3.8
 # from typing_extensions import Protocol
-# Forward declare SearchOptions to avoid circular import if needed,
-# or import if structure allows (assuming it's safe here)
-from natural_pdf.search.search_options import SearchOptions, BaseSearchOptions
 # Use Dict as placeholder for external Haystack Document type
 HaystackDocument = Dict[str, Any]
@@ -17,12 +20,14 @@ HaystackDocument = Dict[str, Any]
 class IndexConfigurationError(RuntimeError):
     """Custom exception for configuration mismatches during indexing."""
     pass
 # Add new exception for sync/init safety
 class IndexExistsError(RuntimeError):
     """Raised when attempting to index implicitly to an existing persistent index without force_reindex=True."""
     pass
@@ -66,6 +71,7 @@ class SearchServiceProtocol(Protocol):
     with a chosen search backend (e.g., Haystack with ChromaDB, Haystack In-Memory).
     An instance of a service implementing this protocol is tied to a specific collection name.
     """
     collection_name: str
     # Removed internal state hints (_persist, _embedding_model) - implementation detail
@@ -98,7 +104,7 @@ class SearchServiceProtocol(Protocol):
     def search(
         self,
-        query: Any, # Allow any query type, service implementation handles it
+        query: Any,  # Allow any query type, service implementation handles it
         options: BaseSearchOptions,
     ) -> List[Dict[str, Any]]:
         """
@@ -186,4 +192,4 @@ class SearchServiceProtocol(Protocol):
         ...
     # Optional: Add methods for getting index stats, etc.
-    # def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...
+    # def get_index_stats(self, collection_name: str) -> Dict[str, Any]: ...

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl