natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/api/index.md +386 -0
- docs/assets/favicon.png +3 -0
- docs/assets/favicon.svg +3 -0
- docs/assets/javascripts/custom.js +17 -0
- docs/assets/logo.svg +3 -0
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +17 -0
- docs/assets/social-preview.svg +17 -0
- docs/assets/stylesheets/custom.css +65 -0
- docs/document-qa/index.ipynb +435 -0
- docs/document-qa/index.md +79 -0
- docs/element-selection/index.ipynb +915 -0
- docs/element-selection/index.md +229 -0
- docs/index.md +170 -0
- docs/installation/index.md +69 -0
- docs/interactive-widget/index.ipynb +962 -0
- docs/interactive-widget/index.md +12 -0
- docs/layout-analysis/index.ipynb +818 -0
- docs/layout-analysis/index.md +185 -0
- docs/ocr/index.md +222 -0
- docs/pdf-navigation/index.ipynb +314 -0
- docs/pdf-navigation/index.md +97 -0
- docs/regions/index.ipynb +816 -0
- docs/regions/index.md +294 -0
- docs/tables/index.ipynb +658 -0
- docs/tables/index.md +144 -0
- docs/text-analysis/index.ipynb +370 -0
- docs/text-analysis/index.md +105 -0
- docs/text-extraction/index.ipynb +1478 -0
- docs/text-extraction/index.md +292 -0
- docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
- docs/tutorials/01-loading-and-extraction.md +95 -0
- docs/tutorials/02-finding-elements.ipynb +340 -0
- docs/tutorials/02-finding-elements.md +149 -0
- docs/tutorials/03-extracting-blocks.ipynb +147 -0
- docs/tutorials/03-extracting-blocks.md +48 -0
- docs/tutorials/04-table-extraction.ipynb +114 -0
- docs/tutorials/04-table-extraction.md +50 -0
- docs/tutorials/05-excluding-content.ipynb +270 -0
- docs/tutorials/05-excluding-content.md +109 -0
- docs/tutorials/06-document-qa.ipynb +332 -0
- docs/tutorials/06-document-qa.md +91 -0
- docs/tutorials/07-layout-analysis.ipynb +260 -0
- docs/tutorials/07-layout-analysis.md +66 -0
- docs/tutorials/07-working-with-regions.ipynb +409 -0
- docs/tutorials/07-working-with-regions.md +151 -0
- docs/tutorials/08-spatial-navigation.ipynb +508 -0
- docs/tutorials/08-spatial-navigation.md +190 -0
- docs/tutorials/09-section-extraction.ipynb +2434 -0
- docs/tutorials/09-section-extraction.md +256 -0
- docs/tutorials/10-form-field-extraction.ipynb +484 -0
- docs/tutorials/10-form-field-extraction.md +201 -0
- docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
- docs/tutorials/11-enhanced-table-processing.md +9 -0
- docs/tutorials/12-ocr-integration.ipynb +586 -0
- docs/tutorials/12-ocr-integration.md +188 -0
- docs/tutorials/13-semantic-search.ipynb +1888 -0
- docs/tutorials/13-semantic-search.md +77 -0
- docs/visual-debugging/index.ipynb +2970 -0
- docs/visual-debugging/index.md +157 -0
- docs/visual-debugging/region.png +0 -0
- natural_pdf/__init__.py +39 -20
- natural_pdf/analyzers/__init__.py +2 -1
- natural_pdf/analyzers/layout/base.py +32 -24
- natural_pdf/analyzers/layout/docling.py +131 -72
- natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
- natural_pdf/analyzers/layout/layout_manager.py +98 -58
- natural_pdf/analyzers/layout/layout_options.py +32 -17
- natural_pdf/analyzers/layout/paddle.py +152 -95
- natural_pdf/analyzers/layout/surya.py +164 -92
- natural_pdf/analyzers/layout/tatr.py +149 -84
- natural_pdf/analyzers/layout/yolo.py +84 -44
- natural_pdf/analyzers/text_options.py +22 -15
- natural_pdf/analyzers/text_structure.py +131 -85
- natural_pdf/analyzers/utils.py +30 -23
- natural_pdf/collections/pdf_collection.py +125 -97
- natural_pdf/core/__init__.py +1 -1
- natural_pdf/core/element_manager.py +416 -337
- natural_pdf/core/highlighting_service.py +268 -196
- natural_pdf/core/page.py +907 -513
- natural_pdf/core/pdf.py +385 -287
- natural_pdf/elements/__init__.py +1 -1
- natural_pdf/elements/base.py +302 -214
- natural_pdf/elements/collections.py +708 -508
- natural_pdf/elements/line.py +39 -36
- natural_pdf/elements/rect.py +32 -30
- natural_pdf/elements/region.py +854 -883
- natural_pdf/elements/text.py +122 -99
- natural_pdf/exporters/__init__.py +0 -1
- natural_pdf/exporters/searchable_pdf.py +261 -102
- natural_pdf/ocr/__init__.py +23 -14
- natural_pdf/ocr/engine.py +17 -8
- natural_pdf/ocr/engine_easyocr.py +63 -47
- natural_pdf/ocr/engine_paddle.py +97 -68
- natural_pdf/ocr/engine_surya.py +54 -44
- natural_pdf/ocr/ocr_manager.py +88 -62
- natural_pdf/ocr/ocr_options.py +16 -10
- natural_pdf/qa/__init__.py +1 -1
- natural_pdf/qa/document_qa.py +119 -111
- natural_pdf/search/__init__.py +37 -31
- natural_pdf/search/haystack_search_service.py +312 -189
- natural_pdf/search/haystack_utils.py +186 -122
- natural_pdf/search/search_options.py +25 -14
- natural_pdf/search/search_service_protocol.py +12 -6
- natural_pdf/search/searchable_mixin.py +261 -176
- natural_pdf/selectors/__init__.py +2 -1
- natural_pdf/selectors/parser.py +159 -316
- natural_pdf/templates/__init__.py +1 -1
- natural_pdf/utils/highlighting.py +8 -2
- natural_pdf/utils/reading_order.py +65 -63
- natural_pdf/utils/text_extraction.py +195 -0
- natural_pdf/utils/visualization.py +70 -61
- natural_pdf/widgets/__init__.py +2 -3
- natural_pdf/widgets/viewer.py +749 -718
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
- natural_pdf-0.1.5.dist-info/RECORD +134 -0
- natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
- notebooks/Examples.ipynb +1293 -0
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +543 -0
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- tests/test_loading.py +50 -0
- tests/test_optional_deps.py +298 -0
- natural_pdf-0.1.4.dist-info/RECORD +0 -61
- natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
natural_pdf/search/searchable_mixin.py (+261 -176)

@@ -1,34 +1,48 @@
+import hashlib  # For hashing content
 import logging
-from typing import Optional, List, Dict, Any, Iterable, TYPE_CHECKING, Union, Type, Generator
 from abc import ABC, abstractmethod
-import hashlib  # For hashing content
+from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
 
 # Now import the flag from the canonical source - this import should always work
 from .haystack_utils import HAS_HAYSTACK_EXTRAS
+
 DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
 
 # Avoid runtime import errors if extras not installed
 try:
     # Import protocols and options first
+    from . import get_search_service
+    from .search_options import SearchOptions, TextSearchOptions
     from .search_service_protocol import (
-        SearchServiceProtocol, Indexable,
-        IndexConfigurationError, IndexExistsError
+        Indexable,
+        IndexConfigurationError,
+        IndexExistsError,
+        SearchServiceProtocol,
     )
-
-
-
-    if TYPE_CHECKING:  # Keep type hints working
-        from natural_pdf.elements.region import Region  # Example indexable type
+
+    if TYPE_CHECKING:  # Keep type hints working
+        from natural_pdf.elements.region import Region  # Example indexable type
 except ImportError:
     # Define dummies if extras missing
-    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = object, object, RuntimeError, RuntimeError
+    SearchServiceProtocol, Indexable, IndexConfigurationError, IndexExistsError = (
+        object,
+        object,
+        RuntimeError,
+        RuntimeError,
+    )
     SearchOptions, TextSearchOptions = object, object
     DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
-
-
+
+    def get_search_service(**kwargs):
+        raise ImportError("Search dependencies missing.")
+
+    class Region:
+        pass  # Dummy for type hint
+
 
 logger = logging.getLogger(__name__)
 
+
 class SearchableMixin(ABC):
     """
     Mixin class providing search functionality (initialization, indexing, searching, syncing).
@@ -36,6 +50,7 @@ class SearchableMixin(ABC):
     Requires the inheriting class to implement `get_indexable_items`.
     Assumes the inheriting class has a `_search_service` attribute initialized to None.
     """
+
     # Ensure inheriting class initializes this
     _search_service: Optional[SearchServiceProtocol] = None
 
@@ -53,12 +68,12 @@ class SearchableMixin(ABC):
         *,
         persist: Optional[bool] = None,
         collection_name: Optional[str] = None,
-        embedding_model: Optional[str] = None,
-        index: bool = False,
+        embedding_model: Optional[str] = None,  # Allow overriding embedding model
+        index: bool = False,  # Changed from index_now
         force_reindex: bool = False,
         embedder_device: Optional[str] = None,
-        **kwargs
-    ) -> "SearchableMixin":
+        **kwargs,  # Pass other args to get_search_service
+    ) -> "SearchableMixin":  # Return self for chaining
         """
         Initializes and configures the search service for this instance.
 
@@ -90,7 +105,9 @@ class SearchableMixin(ABC):
         """
         if service:
             # Attach provided service
-            logger.info(f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}').")
+            logger.info(
+                f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
+            )
             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
             self._search_service = service
         else:
@@ -101,24 +118,45 @@ class SearchableMixin(ABC):
                 raise ValueError("A collection_name must be provided when persist=True.")
             elif not effective_persist and not effective_collection_name:
                 effective_collection_name = DEFAULT_SEARCH_COLLECTION_NAME
-                logger.info(f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service.")
+                logger.info(
+                    f"Using default collection name '{DEFAULT_SEARCH_COLLECTION_NAME}' for in-memory service."
+                )
 
-            logger.info(f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}")
+            logger.info(
+                f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
+            )
             try:
-                service_args = {"collection_name": effective_collection_name, "persist": effective_persist, **kwargs}
-                if embedding_model: service_args["embedding_model"] = embedding_model
+                service_args = {
+                    "collection_name": effective_collection_name,
+                    "persist": effective_persist,
+                    **kwargs,
+                }
+                if embedding_model:
+                    service_args["embedding_model"] = embedding_model
                 self._search_service = get_search_service(**service_args)
+            except ImportError as ie:  # Catch the specific ImportError first
+                logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
+                raise ie  # Re-raise the original ImportError
             except Exception as e:
-                logger.error(f"Failed to create SearchService due to unexpected error: {e}", exc_info=True)
-                raise RuntimeError(
-                    "Could not create SearchService instance due to an unexpected error."
-                ) from e
+                logger.error(
+                    f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
+                )
+                # Keep the RuntimeError for other unexpected creation errors
+                raise RuntimeError(
+                    "Could not create SearchService instance due to an unexpected error."
+                ) from e
+
+        # --- Optional Immediate Indexing (with safety check for persistent) ---
         if index:
-            if not self._search_service:
-                raise RuntimeError("Cannot index: Search service not available after initialization attempt.")
+            if not self._search_service:  # Should not happen if logic above is correct
+                raise RuntimeError(
+                    "Cannot index: Search service not available after initialization attempt."
+                )
 
-            is_persistent = getattr(self._search_service, "_persist", False)  # Check if service is persistent
-            collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+            is_persistent = getattr(
+                self._search_service, "_persist", False
+            )  # Check if service is persistent
+            collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
 
             if is_persistent and not force_reindex:
                 # Check existence only if persistent and not forcing reindex
@@ -131,14 +169,20 @@ class SearchableMixin(ABC):
                     )
                 else:
                     # Index doesn't exist, safe to proceed
-                    logger.info(f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing.")
+                    logger.info(
+                        f"Persistent index '{collection_name}' does not exist. Proceeding with initial indexing."
+                    )
             elif is_persistent and force_reindex:
-                logger.warning(f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted.")
+                logger.warning(
+                    f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
+                )
             # else: # Not persistent, safe to proceed without existence check
             #     logger.debug("Proceeding with index=True for non-persistent index.")
 
             # Proceed with indexing if checks passed or not applicable
-            logger.info(f"index=True: Proceeding to index collection immediately after search initialization.")
+            logger.info(
+                f"index=True: Proceeding to index collection immediately after search initialization."
+            )
             self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
 
         return self
@@ -148,42 +192,53 @@ class SearchableMixin(ABC):
         if not self._search_service:
             raise RuntimeError("Search service not initialized. Call init_search first.")
 
-        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
-        logger.info(f"Starting internal indexing process into SearchService collection '{collection_name}'...")
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting internal indexing process into SearchService collection '{collection_name}'..."
+        )
 
         # Use the abstract method to get items
         try:
-            indexable_items = list(self.get_indexable_items())
+            indexable_items = list(self.get_indexable_items())  # Consume iterator
         except Exception as e:
-            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
 
         if not indexable_items:
-            logger.warning("No indexable items provided by get_indexable_items(). Skipping index call.")
+            logger.warning(
+                "No indexable items provided by get_indexable_items(). Skipping index call."
+            )
             return
 
         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
         try:
-            logger.debug(f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex}).")
-            self._search_service.index(
-                documents=indexable_items,
-                embedder_device=embedder_device,
-                force_reindex=force_reindex,
-            )
-            logger.info(f"Successfully completed indexing into SearchService collection '{collection_name}'.")
+            logger.debug(
+                f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+            )
+            self._search_service.index(
+                documents=indexable_items,
+                embedder_device=embedder_device,
+                force_reindex=force_reindex,
+            )
+            logger.info(
+                f"Successfully completed indexing into SearchService collection '{collection_name}'."
+            )
         except IndexConfigurationError as ice:
-            logger.error(f"Indexing failed due to configuration error in collection '{collection_name}': {ice}", exc_info=True)
-            raise  # Re-raise specific error
-        except Exception as e:
-            # Catch other indexing errors from the service
-            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+            logger.error(
+                f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
+                exc_info=True,
+            )
+            raise  # Re-raise specific error
+        except Exception as e:  # Catch other indexing errors from the service
+            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
+            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
 
     def index_for_search(
-        self,
+        self,
+        *,  # Make args keyword-only
         embedder_device: Optional[str] = None,
         force_reindex: bool = False,
-    ) -> "SearchableMixin":
+    ) -> "SearchableMixin":
         """
         Ensures the search service is initialized (using default if needed)
         and indexes the items provided by `get_indexable_items`.
@@ -201,8 +256,10 @@ class SearchableMixin(ABC):
         """
         # --- Ensure Service is Initialized (Use Default if Needed) ---
         if not self._search_service:
-            logger.info("Search service not initialized prior to index_for_search. Initializing default in-memory service.")
-            self.init_search()  # Call init with defaults
+            logger.info(
+                "Search service not initialized prior to index_for_search. Initializing default in-memory service."
+            )
+            self.init_search()  # Call init with defaults
 
         # --- Perform Indexing ---
         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
@@ -210,10 +267,10 @@ class SearchableMixin(ABC):
 
     def find_relevant(
         self,
-        query: Any,
-        *,
+        query: Any,  # Query type depends on service capabilities
+        *,  # Make options/service keyword-only
         options: Optional[SearchOptions] = None,
-        search_service: Optional[SearchServiceProtocol] = None
+        search_service: Optional[SearchServiceProtocol] = None,  # Allow override
     ) -> List[Dict[str, Any]]:
         """
         Finds relevant items using the configured or provided search service.
@@ -240,8 +297,10 @@ class SearchableMixin(ABC):
                 "or provide an explicit 'search_service' instance to find_relevant()."
             )
 
-        collection_name = getattr(effective_service, "collection_name", "<Unknown>")
-        logger.info(f"Searching collection '{collection_name}' via {type(effective_service).__name__}...")
+        collection_name = getattr(effective_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
+        )
 
         # --- Prepare Query and Options ---
         query_input = query
@@ -263,11 +322,15 @@ class SearchableMixin(ABC):
                 query=query_input,
                 options=effective_options,
             )
-            logger.info(f"SearchService returned {len(results)} results from collection '{collection_name}'.")
+            logger.info(
+                f"SearchService returned {len(results)} results from collection '{collection_name}'."
+            )
             return results
         except FileNotFoundError as fnf:
-            logger.error(f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}")
-            raise  # Re-raise specific error
+            logger.error(
+                f"Search failed: Collection '{collection_name}' not found by service. Error: {fnf}"
+            )
+            raise  # Re-raise specific error
         except Exception as e:
             logger.error(f"Search failed for collection '{collection_name}': {e}", exc_info=True)
             # Consider wrapping in a SearchError?
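
Note the keyword-only `search_service` parameter above, which lets one service instance be supplied per call instead of using the instance's configured one. A sketch, assuming only the `get_search_service` factory kwargs that `init_search` itself forwards (`collection_name`, `persist`):

```python
from natural_pdf.search import get_search_service

# One shared service, overriding whatever the collection was configured with:
shared = get_search_service(collection_name="my_reports", persist=True)
hits = docs.find_relevant("minimum wage violations", search_service=shared)
for hit in hits:
    # Each hit is a plain Dict[str, Any]; the exact keys depend on the
    # backing service implementation, not on the mixin.
    print(hit)
```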
@@ -276,11 +339,11 @@ class SearchableMixin(ABC):
     # --- NEW Sync Method ---
     def sync_index(
         self,
-        strategy: str = 'full',  # 'full' (add/update/delete) or 'upsert_only'
+        strategy: str = "full",  # 'full' (add/update/delete) or 'upsert_only'
         dry_run: bool = False,
-        batch_size: int = 100,
-        embedder_device: Optional[str] = None,
-        **kwargs: Any
+        batch_size: int = 100,  # For batching deletes/updates if needed
+        embedder_device: Optional[str] = None,  # Pass embedder device if needed for updates
+        **kwargs: Any,  # Allow passing extra args to get_search_service
     ) -> Dict[str, int]:
         """
         Synchronizes the search index with the current state of indexable items.
@@ -307,91 +370,107 @@ class SearchableMixin(ABC):
             RuntimeError: For backend errors during synchronization.
         """
         if not self._search_service:
-            raise RuntimeError("Search service not configured. Call init_search first.")
+            raise RuntimeError("Search service not configured. Call init_search first.")
 
-        collection_name = getattr(self._search_service, 'collection_name', '<Unknown>')
-        logger.info(f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})...")
-        summary = {'added': 0, 'updated': 0, 'deleted': 0, 'skipped': 0}
+        collection_name = getattr(self._search_service, "collection_name", "<Unknown>")
+        logger.info(
+            f"Starting index synchronization for collection '{collection_name}' (Strategy: {strategy}, Dry run: {dry_run})..."
+        )
+        summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
 
         # --- Check Service Capabilities for 'full' sync ---
-        if strategy == 'full':
-            required_methods = ['list_documents', 'delete_documents']
-            missing_methods = [
-                m for m in required_methods if not hasattr(self._search_service, m)
-            ]
-            if missing_methods:
-                raise NotImplementedError(f"The configured search service ({type(self._search_service).__name__}) "
-                                          f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}")
+        if strategy == "full":
+            required_methods = ["list_documents", "delete_documents"]
+            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
+            if missing_methods:
+                raise NotImplementedError(
+                    f"The configured search service ({type(self._search_service).__name__}) "
+                    f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
                )
 
         # --- 1. Get Desired State (from current collection) ---
-        desired_state: Dict[str, Indexable] = {}
-        desired_hashes: Dict[str, Optional[str]] = {}
+        desired_state: Dict[str, Indexable] = {}  # {id: item}
+        desired_hashes: Dict[str, Optional[str]] = {}  # {id: hash or None}
         try:
-            for item in self.get_indexable_items():
-                item_id = item.get_id()
-                if not item_id:
-                    logger.warning(f"Skipping item with no ID: {item}")
-                    summary['skipped'] += 1
-                    continue
-                if item_id in desired_state:
-                    logger.warning(f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item.")
-                    summary['skipped'] += 1
-                    continue
-                desired_state[item_id] = item
-                # Try to get hash, store None if unavailable or fails
-                try:
-                    desired_hashes[item_id] = item.get_content_hash()
-                except (AttributeError, NotImplementedError):
-                    logger.debug(f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based.")
-                    desired_hashes[item_id] = None
-                except Exception as e:
-                    logger.warning(f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.", exc_info=False)
-                    desired_hashes[item_id] = None
+            for item in self.get_indexable_items():
+                item_id = item.get_id()
+                if not item_id:
+                    logger.warning(f"Skipping item with no ID: {item}")
+                    summary["skipped"] += 1
+                    continue
+                if item_id in desired_state:
+                    logger.warning(
+                        f"Duplicate ID '{item_id}' found in get_indexable_items(). Skipping subsequent item."
+                    )
+                    summary["skipped"] += 1
+                    continue
+                desired_state[item_id] = item
+                # Try to get hash, store None if unavailable or fails
+                try:
+                    desired_hashes[item_id] = item.get_content_hash()
+                except (AttributeError, NotImplementedError):
+                    logger.debug(
+                        f"get_content_hash not available for item ID '{item_id}' ({type(item).__name__}). Sync update check will be ID-based."
+                    )
+                    desired_hashes[item_id] = None
+                except Exception as e:
+                    logger.warning(
+                        f"Error getting content hash for item ID '{item_id}': {e}. Sync update check will be ID-based.",
+                        exc_info=False,
+                    )
+                    desired_hashes[item_id] = None
 
         except Exception as e:
-            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to get current indexable items.") from e
+            logger.error(f"Error iterating through get_indexable_items: {e}", exc_info=True)
+            raise RuntimeError("Failed to get current indexable items.") from e
 
         logger.info(f"Desired state contains {len(desired_state)} indexable items.")
 
         # --- 2. Handle Different Strategies ---
-        if strategy == 'upsert_only':
+        if strategy == "upsert_only":
             # Simple case: just index everything, let the service handle upserts
             items_to_index = list(desired_state.values())
-            summary['added'] = len(items_to_index)  # Approximate count
-            logger.info(f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting.")
+            summary["added"] = len(items_to_index)  # Approximate count
+            logger.info(
+                f"Strategy 'upsert_only': Prepared {len(items_to_index)} items for indexing/upserting."
+            )
             if not dry_run and items_to_index:
-                logger.debug("Calling service.index for upsert...")
-                # Call index directly, force_reindex=False implies upsert
-                self._search_service.index(
-                    documents=items_to_index,
-                    force_reindex=False,
-                    embedder_device=embedder_device
-                )
+                logger.debug("Calling service.index for upsert...")
+                # Call index directly, force_reindex=False implies upsert
+                self._search_service.index(
+                    documents=items_to_index, force_reindex=False, embedder_device=embedder_device
+                )
             elif dry_run:
                 logger.info("[Dry Run] Would index/upsert %d items.", len(items_to_index))
 
-        elif strategy == 'full':
+        elif strategy == "full":
             # Complex case: Add/Update/Delete
             # 2a. Get Current Index State
             try:
-                logger.debug("Listing documents currently in the index...")
-                # Assumes list_documents takes filters and include_metadata
-                current_docs = self._search_service.list_documents(include_metadata=True)  # Fetch all documents with metadata
-                current_state: Dict[str, Dict] = {}
-                duplicates = 0
-                for doc in current_docs:
-                    doc_id = doc.get('id')
-                    if not doc_id:
-                        continue  # Skip docs without ID from service
-                    if doc_id in current_state:
-                        duplicates += 1
-                    current_state[doc_id] = doc
-                logger.info(f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs).")
+                logger.debug("Listing documents currently in the index...")
+                # Assumes list_documents takes filters and include_metadata
+                # Fetch all documents with metadata
+                current_docs = self._search_service.list_documents(include_metadata=True)
+                current_state: Dict[str, Dict] = {}  # {id: {'meta': {...}, ...}}
+                duplicates = 0
+                for doc in current_docs:
+                    doc_id = doc.get("id")
+                    if not doc_id:
+                        continue  # Skip docs without ID from service
+                    if doc_id in current_state:
+                        duplicates += 1
+                    current_state[doc_id] = doc
+                logger.info(
+                    f"Found {len(current_state)} documents currently in the index (encountered {duplicates} duplicate IDs)."
+                )
+                if duplicates > 0:
+                    logger.warning(
+                        f"Found {duplicates} duplicate IDs in the index. Using the last encountered version for comparison."
+                    )
 
             except Exception as e:
-                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
-                raise RuntimeError("Could not retrieve current index state for sync.") from e
+                logger.error(f"Failed to list documents from search service: {e}", exc_info=True)
+                raise RuntimeError("Could not retrieve current index state for sync.") from e
 
             # 2b. Compare States and Plan Actions
             ids_in_desired = set(desired_state.keys())
@@ -403,62 +482,68 @@ class SearchableMixin(ABC):
 
             items_to_update = []
             for item_id in ids_to_check_update:
-                desired_hash = desired_hashes.get(item_id)
-                current_meta = current_state[item_id].get('meta', {})
-                current_hash = current_meta.get('content_hash')  # Assuming hash stored in meta
-
-                # Check if hash exists and differs, or if hash is missing (force update)
-                if desired_hash is None or current_hash is None or desired_hash != current_hash:
-                    if desired_hash != current_hash:
-                        logger.debug(f"Content hash changed for ID {item_id}. Scheduling for update.")
-                    else:
-                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
-                    items_to_update.append(desired_state[item_id])
-                # Else: hashes match, no update needed
+                desired_hash = desired_hashes.get(item_id)
+                current_meta = current_state[item_id].get("meta", {})
+                current_hash = current_meta.get("content_hash")  # Assuming hash stored in meta
+
+                # Check if hash exists and differs, or if hash is missing (force update)
+                if desired_hash is None or current_hash is None or desired_hash != current_hash:
+                    if desired_hash != current_hash:
+                        logger.debug(
+                            f"Content hash changed for ID {item_id}. Scheduling for update."
+                        )
+                    else:
+                        logger.debug(f"Hash missing for ID {item_id}. Scheduling for update.")
+                    items_to_update.append(desired_state[item_id])
+                # Else: hashes match, no update needed
 
             items_to_add = [desired_state[id_] for id_ in ids_to_add]
-            items_to_index = items_to_add + items_to_update  # Combine adds and updates for single index call
+            items_to_index = (
+                items_to_add + items_to_update
+            )  # Combine adds and updates for single index call
 
-            summary['added'] = len(items_to_add)
-            summary['updated'] = len(items_to_update)
-            summary['deleted'] = len(ids_to_delete)
+            summary["added"] = len(items_to_add)
+            summary["updated"] = len(items_to_update)
+            summary["deleted"] = len(ids_to_delete)
 
-            logger.info(f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}")
+            logger.info(
+                f"Sync Plan: Add={summary['added']}, Update={summary['updated']}, Delete={summary['deleted']}"
+            )
 
             # 2c. Execute Actions (if not dry_run)
             if not dry_run:
-                # Execute Deletes
-                if ids_to_delete:
-                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
-                    try:
-                        # Assuming delete_documents takes list of IDs
-                        # Implement batching if needed
-                        self._search_service.delete_documents(ids=list(ids_to_delete))
-                        logger.info("Deletion successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
-                        # Decide whether to continue or raise
-                        raise RuntimeError("Failed during deletion phase of sync.") from e
-
-                # Execute Adds/Updates
-                if items_to_index:
-                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
-                    try:
-                        # Upsert logic handled by service's index method with force_reindex=False
-                        self._search_service.index(
-                            documents=items_to_index,
-                            force_reindex=False,
-                            embedder_device=embedder_device
-                        )
-                        logger.info("Add/Update successful.")
-                    except Exception as e:
-                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
-                        raise RuntimeError("Failed during add/update phase of sync.") from e
-                logger.info("Sync actions completed.")
+                # Execute Deletes
+                if ids_to_delete:
+                    logger.info(f"Deleting {len(ids_to_delete)} items from index...")
+                    try:
+                        # Assuming delete_documents takes list of IDs
+                        # Implement batching if needed
+                        self._search_service.delete_documents(ids=list(ids_to_delete))
+                        logger.info("Deletion successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to delete documents: {e}", exc_info=True)
+                        # Decide whether to continue or raise
+                        raise RuntimeError("Failed during deletion phase of sync.") from e
+
+                # Execute Adds/Updates
+                if items_to_index:
+                    logger.info(f"Indexing/Updating {len(items_to_index)} items...")
+                    try:
+                        # Upsert logic handled by service's index method with force_reindex=False
+                        self._search_service.index(
+                            documents=items_to_index,
+                            force_reindex=False,
+                            embedder_device=embedder_device,
+                        )
+                        logger.info("Add/Update successful.")
+                    except Exception as e:
+                        logger.error(f"Failed to index/update documents: {e}", exc_info=True)
+                        raise RuntimeError("Failed during add/update phase of sync.") from e
+                logger.info("Sync actions completed.")
             else:
-                logger.info("[Dry Run] No changes applied to the index.")
+                logger.info("[Dry Run] No changes applied to the index.")
 
         else:
             raise ValueError(f"Unknown sync strategy: '{strategy}'. Use 'full' or 'upsert_only'.")
 
-        return summary
+        return summary
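
`sync_index` derives its add/update/delete plan from `get_id()` and `get_content_hash()` and returns the plan as counts, so a dry-run-first workflow falls out naturally. A final sketch against the hypothetical collection from the earlier examples:

```python
# With dry_run=True nothing is written; the returned dict is the plan.
plan = docs.sync_index(strategy="full", dry_run=True)
print(
    f"would add {plan['added']}, update {plan['updated']}, "
    f"delete {plan['deleted']} (skipped {plan['skipped']})"
)

# Apply for real. 'full' requires the service to expose list_documents and
# delete_documents, otherwise NotImplementedError is raised (see above).
summary = docs.sync_index(strategy="full")

# 'upsert_only' never deletes; it re-indexes the desired state with
# force_reindex=False and reports everything under 'added'.
docs.sync_index(strategy="upsert_only")
```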