natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/search/numpy_search_service.py

@@ -0,0 +1,255 @@
+ import logging
+ import numpy as np
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Union
+
+ from sentence_transformers import SentenceTransformer
+
+ from .search_options import BaseSearchOptions
+ from .search_service_protocol import (
+     Indexable,
+     IndexConfigurationError,
+     SearchServiceProtocol,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ class NumpySearchService(SearchServiceProtocol):
+     """Basic in-memory vector search implementation using NumPy."""
+
+     collection_name: str
+
+     def __init__(
+         self,
+         collection_name: str,
+         persist: bool = False,
+         uri: Optional[str] = None,
+         embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+     ):
+         if persist:
+             raise RuntimeError(
+                 "Persistence requested but LanceDB is not installed. "
+                 "For persistent vector search, install LanceDB: pip install lancedb"
+             )
+
+         self.collection_name = collection_name
+         self._embedding_model_name = embedding_model_name
+         self.embedding_model = SentenceTransformer(self._embedding_model_name)
+         self._embedding_dims = len(self.embedding_model.encode("test"))
+
+         # Simple in-memory storage
+         self._vectors = []
+         self._documents = []
+         self._metadata = []
+         self._ids = []
+
+         logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+     def index(
+         self,
+         documents: Iterable[Indexable],
+         embedder_device: Optional[str] = None,
+         force_reindex: bool = False,
+     ) -> None:
+         if force_reindex:
+             logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+             self._vectors = []
+             self._documents = []
+             self._metadata = []
+             self._ids = []
+
+         items = list(documents)
+         logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
+
+         if not items:
+             logger.warning("No documents provided for indexing. Skipping.")
+             return
+
+         texts_to_embed = []
+         items_info = []
+
+         for item in items:
+             doc_id = item.get_id()
+             metadata = item.get_metadata().copy()
+             content_obj = item.get_content()
+             content_text = ""
+
+             if isinstance(content_obj, str):
+                 content_text = content_obj
+             elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                 content_text = content_obj.extract_text()
+                 if not isinstance(content_text, str):
+                     content_text = str(content_obj)
+             else:
+                 content_text = str(content_obj)
+
+             # Try to add content hash to metadata
+             try:
+                 content_hash = item.get_content_hash()
+                 if content_hash:
+                     metadata["content_hash"] = content_hash
+             except (AttributeError, NotImplementedError):
+                 pass
+             except Exception as e:
+                 logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+             texts_to_embed.append(content_text)
+             items_info.append({
+                 "id": doc_id,
+                 "metadata": metadata,
+                 "text": content_text
+             })
+
+         if not texts_to_embed:
+             logger.warning("No text content to embed. Skipping.")
+             return
+
+         logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+         generated_embeddings = self.embedding_model.encode(
+             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+         )
+
+         for i, item_info in enumerate(items_info):
+             self._vectors.append(generated_embeddings[i])
+             self._documents.append(item_info["text"])
+             self._metadata.append(item_info["metadata"])
+             self._ids.append(item_info["id"])
+
+         logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+     def search(
+         self,
+         query: Any,
+         options: BaseSearchOptions,
+     ) -> List[Dict[str, Any]]:
+         if not self._vectors:
+             logger.debug("No vectors in index. Returning empty results.")
+             return []
+
+         # Process query to text
+         query_text = ""
+         if isinstance(query, (str, Path)):
+             query_text = str(query)
+         elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+             query_text = query.extract_text()
+             if not query_text or not query_text.strip():
+                 return []
+         else:
+             raise TypeError(f"Unsupported query type: {type(query)}")
+
+         logger.info(f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
+
+         # Encode query and perform similarity search
+         query_vector = self.embedding_model.encode(query_text)
+
+         # Convert list to numpy array for batch operations
+         vectors_array = np.array(self._vectors)
+
+         # Normalize vectors for cosine similarity
+         query_norm = np.linalg.norm(query_vector)
+         if query_norm > 0:
+             query_vector = query_vector / query_norm
+
+         # Normalize all vectors (avoid division by zero)
+         vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
+         valid_indices = vector_norms.flatten() > 0
+         vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
+
+         # Calculate cosine similarities
+         similarities = np.dot(vectors_array, query_vector)
+
+         # Apply filters if present
+         filtered_indices = np.arange(len(similarities))
+         if options.filters:
+             # Simple filtering for metadata fields
+             # This is a basic implementation and doesn't support complex filters like LanceDB
+             if isinstance(options.filters, dict):
+                 for field, value in options.filters.items():
+                     new_filtered = []
+                     for i in filtered_indices:
+                         metadata = self._metadata[i]
+                         if field in metadata and metadata[field] == value:
+                             new_filtered.append(i)
+                     filtered_indices = np.array(new_filtered)
+             else:
+                 logger.warning(f"Complex filter expressions not supported in NumPy backend: {options.filters}")
+
+         # Apply filtering and sort by similarity
+         if len(filtered_indices) > 0:
+             filtered_similarities = similarities[filtered_indices]
+             top_k = min(options.top_k, len(filtered_similarities))
+             if top_k == 0:
+                 return []
+
+             top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
+             top_indices = filtered_indices[top_indices_within_filtered]
+         else:
+             top_k = min(options.top_k, len(similarities))
+             if top_k == 0:
+                 return []
+
+             top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+         # Format results
+         results = []
+         for idx in top_indices:
+             metadata = self._metadata[idx]
+             results.append({
+                 "id": self._ids[idx],
+                 "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                 "score": float(similarities[idx]),
+                 "page_number": metadata.get("page_number"),
+                 "pdf_path": metadata.get("pdf_path"),
+                 "metadata": metadata,
+             })
+
+         logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+         return results
+
+     def index_exists(self) -> bool:
+         return len(self._vectors) > 0
+
+     def delete_index(self) -> bool:
+         logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
+         self._vectors = []
+         self._documents = []
+         self._metadata = []
+         self._ids = []
+         return True
+
+     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+         logger.debug(f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
+
+         results = []
+         for i, doc_id in enumerate(self._ids):
+             doc_info = {"id": doc_id}
+             if include_metadata:
+                 doc_info["meta"] = self._metadata[i]
+             results.append(doc_info)
+
+         logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+         return results
+
+     def delete_documents(self, ids: List[str]) -> None:
+         if not ids:
+             logger.debug("No document IDs provided for deletion. Skipping.")
+             return
+
+         logger.warning(f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
+
+         # Find indices to remove
+         keep_indices = []
+         for i, doc_id in enumerate(self._ids):
+             if doc_id not in ids:
+                 keep_indices.append(i)
+
+         # Create new filtered lists
+         self._ids = [self._ids[i] for i in keep_indices]
+         self._vectors = [self._vectors[i] for i in keep_indices]
+         self._documents = [self._documents[i] for i in keep_indices]
+         self._metadata = [self._metadata[i] for i in keep_indices]
+
+         logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
natural_pdf/search/searchable_mixin.py

@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union

  # Now import the flag from the canonical source - this import should always work
- from .haystack_utils import HAS_HAYSTACK_EXTRAS

  DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"

@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
              logger.info(
                  f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
              )
-             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
              self._search_service = service
          else:
              # Create new service
@@ -125,28 +123,17 @@
              logger.info(
                  f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
              )
-             try:
-                 service_args = {
-                     "collection_name": effective_collection_name,
-                     "persist": effective_persist,
-                     **kwargs,
-                 }
-                 if embedding_model:
-                     service_args["embedding_model"] = embedding_model
-                 self._search_service = get_search_service(**service_args)
-             except ImportError as ie: # Catch the specific ImportError first
-                 logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
-                 raise ie # Re-raise the original ImportError
-             except Exception as e:
-                 logger.error(
-                     f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
-                 )
-                 # Keep the RuntimeError for other unexpected creation errors
-                 raise RuntimeError(
-                     "Could not create SearchService instance due to an unexpected error."
-                 ) from e
+
+             # Direct creation without try/except
+             service_args = {
+                 "collection_name": effective_collection_name,
+                 "persist": effective_persist,
+                 **kwargs,
+             }
+             if embedding_model:
+                 service_args["embedding_model"] = embedding_model
+             self._search_service = get_search_service(**service_args)

-         # --- Optional Immediate Indexing (with safety check for persistent) ---
          if index:
              if not self._search_service: # Should not happen if logic above is correct
                  raise RuntimeError(
@@ -176,8 +163,6 @@
                  logger.warning(
                      f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
                  )
-             # else: # Not persistent, safe to proceed without existence check
-             #     logger.debug("Proceeding with index=True for non-persistent index.")

              # Proceed with indexing if checks passed or not applicable
              logger.info(
@@ -197,12 +182,8 @@
              f"Starting internal indexing process into SearchService collection '{collection_name}'..."
          )

-         # Use the abstract method to get items
-         try:
-             indexable_items = list(self.get_indexable_items()) # Consume iterator
-         except Exception as e:
-             logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-             raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+         # Get indexable items without try/except
+         indexable_items = list(self.get_indexable_items()) # Consume iterator

          if not indexable_items:
              logger.warning(
@@ -211,27 +192,19 @@
              return

          logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
-         try:
-             logger.debug(
-                 f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
-             )
-             self._search_service.index(
-                 documents=indexable_items,
-                 embedder_device=embedder_device,
-                 force_reindex=force_reindex,
-             )
-             logger.info(
-                 f"Successfully completed indexing into SearchService collection '{collection_name}'."
-             )
-         except IndexConfigurationError as ice:
-             logger.error(
-                 f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
-                 exc_info=True,
-             )
-             raise # Re-raise specific error
-         except Exception as e: # Catch other indexing errors from the service
-             logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-             raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+         logger.debug(
+             f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+         )
+
+         # Call index without try/except
+         self._search_service.index(
+             documents=indexable_items,
+             embedder_device=embedder_device,
+             force_reindex=force_reindex,
+         )
+         logger.info(
+             f"Successfully completed indexing into SearchService collection '{collection_name}'."
+         )

      def index_for_search(
          self,
@@ -254,14 +227,12 @@
          Returns:
              Self for method chaining.
          """
-         # --- Ensure Service is Initialized (Use Default if Needed) ---
          if not self._search_service:
              logger.info(
                  "Search service not initialized prior to index_for_search. Initializing default in-memory service."
              )
              self.init_search() # Call init with defaults

-         # --- Perform Indexing ---
          self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
          return self

@@ -289,7 +260,6 @@
              RuntimeError: If no search service is configured or provided, or if search fails.
              FileNotFoundError: If the collection managed by the service does not exist.
          """
-         # --- Determine which Search Service to use ---
          effective_service = search_service or self._search_service
          if not effective_service:
              raise RuntimeError(
@@ -302,21 +272,9 @@
              f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
          )

-         # --- Prepare Query and Options ---
          query_input = query
-         # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
-         # If we keep it here, it makes the mixin less generic.
-         # Let's assume the SearchService handles the query type appropriately for now.
-         # if isinstance(query, Region):
-         #     logger.debug("Query is a Region object. Extracting text.")
-         #     query_input = query.extract_text()
-         #     if not query_input or query_input.isspace():
-         #         logger.warning("Region provided for query has no extractable text.")
-         #         return []
-
          effective_options = options if options is not None else TextSearchOptions()

-         # --- Call SearchService Search Method ---
          try:
              results = effective_service.search(
                  query=query_input,
@@ -336,7 +294,6 @@
              # Consider wrapping in a SearchError?
              raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e

-     # --- NEW Sync Method ---
      def sync_index(
          self,
          strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
@@ -378,7 +335,6 @@
          )
          summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}

-         # --- Check Service Capabilities for 'full' sync ---
          if strategy == "full":
              required_methods = ["list_documents", "delete_documents"]
              missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
@@ -388,7 +344,6 @@
                  f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
              )

-         # --- 1. Get Desired State (from current collection) ---
          desired_state: Dict[str, Indexable] = {} # {id: item}
          desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
          try:
@@ -426,7 +381,6 @@

          logger.info(f"Desired state contains {len(desired_state)} indexable items.")

-         # --- 2. Handle Different Strategies ---
          if strategy == "upsert_only":
              # Simple case: just index everything, let the service handle upserts
              items_to_index = list(desired_state.values())
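
One behavioral consequence of the removed try/except blocks, sketched below (not part of the diff): errors from service creation and indexing now propagate to the caller unwrapped. The import path is an assumption; get_search_service appears in the hunks above, but its home module is not shown here.

    from natural_pdf.search import get_search_service  # assumed export location

    # With persist=True and LanceDB not installed, the NumPy backend's
    # RuntimeError (raised in NumpySearchService.__init__ above) now reaches
    # the caller directly instead of being wrapped by the mixin.
    try:
        service = get_search_service(collection_name="reports", persist=True)
    except (ImportError, RuntimeError) as err:
        print(f"Could not create a persistent search service: {err}")
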
natural_pdf/widgets/viewer.py

@@ -31,20 +31,6 @@ try:
      from PIL import Image
      from traitlets import Dict, List, Unicode, observe

-     # --- Read JS code from file (only needed if widgets are defined) --- #
-     _MODULE_DIR = os.path.dirname(__file__)
-     _FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
-     try:
-         with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
-             _FRONTEND_JS_CODE = f.read()
-         logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
-     except FileNotFoundError:
-         logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
-         _FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
-     except Exception as e:
-         logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
-         _FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
-
      # --- Define Widget Classes ONLY if ipywidgets is available ---
      class SimpleInteractiveViewerWidget(widgets.DOMWidget):
          def __init__(self, pdf_data=None, **kwargs):
@@ -631,7 +617,7 @@

              # Filter out 'char' elements
              filtered_page_elements = [
-                 el for el in page_elements if getattr(el, "type", "").lower() != "char"
+                 el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
              ]
              logger.debug(
                  f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
@@ -659,19 +645,21 @@

              for i, element in enumerate(filtered_page_elements):
                  # Get original coordinates and calculated width/height (always present via base class)
+                 # Assuming 'element' is always an object with these attributes now
                  original_x0 = element.x0
                  original_y0 = element.top
                  original_x1 = element.x1
                  original_y1 = element.bottom
                  width = element.width
                  height = element.height
+                 current_element_type = element.type # Direct attribute access
                  scale = 1.0

                  # Base element dict with required info
                  elem_dict = {
                      "id": i,
                      # Use the standardized .type property
-                     "type": element.type,
+                     "type": current_element_type,
                      # Scaled coordinates for positioning in HTML/SVG
                      "x0": original_x0 * scale,
                      "y0": original_y0 * scale,
@@ -684,21 +672,24 @@
                  # --- Get Default Attributes --- #
                  attributes_found = set()
                  for attr_name in default_attributes_to_get:
+                     # Assuming 'element' is always an object
                      if hasattr(element, attr_name):
                          try:
-                             value = getattr(element, attr_name)
+                             value_to_process = getattr(element, attr_name)
                              # Convert non-JSON serializable types to string
-                             processed_value = value
+                             processed_value = value_to_process
                              if (
-                                 not isinstance(value, (str, int, float, bool, list, dict, tuple))
-                                 and value is not None
+                                 not isinstance(
+                                     value_to_process, (str, int, float, bool, list, dict, tuple)
+                                 )
+                                 and value_to_process is not None
                              ):
-                                 processed_value = str(value)
+                                 processed_value = str(value_to_process)
                              elem_dict[attr_name] = processed_value
                              attributes_found.add(attr_name)
                          except Exception as e:
                              logger.warning(
-                                 f"Could not get or process default attribute '{attr_name}' for element {i} ({element.type}): {e}"
+                                 f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                              )

                  # --- Get User-Requested Attributes (if any) --- #
@@ -707,23 +698,23 @@
                      # Only process if not already added and exists
                      if attr_name not in attributes_found and hasattr(element, attr_name):
                          try:
-                             value = getattr(element, attr_name)
-                             processed_value = value
+                             value_to_process = getattr(element, attr_name)
+                             processed_value = value_to_process
                              if (
                                  not isinstance(
-                                     value, (str, int, float, bool, list, dict, tuple)
+                                     value_to_process, (str, int, float, bool, list, dict, tuple)
                                  )
-                                 and value is not None
+                                 and value_to_process is not None
                              ):
-                                 processed_value = str(value)
+                                 processed_value = str(value_to_process)
                              elem_dict[attr_name] = processed_value
                          except Exception as e:
                              logger.warning(
-                                 f"Could not get or process requested attribute '{attr_name}' for element {i} ({element.type}): {e}"
+                                 f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                              )
-                 for attr_name in elem_dict:
-                     if isinstance(elem_dict[attr_name], float):
-                         elem_dict[attr_name] = round(elem_dict[attr_name], 2)
+                 for attr_name_val in elem_dict: # Renamed to avoid conflict
+                     if isinstance(elem_dict[attr_name_val], float):
+                         elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
                  elements.append(elem_dict)

                  logger.debug(
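
The viewer hunks above repeatedly apply one guard: element attribute values are coerced to JSON-native types before being handed to the frontend, and floats are rounded to two decimals afterwards. A standalone sketch of that pattern (function and variable names are illustrative, not from the diff):

    import json

    JSON_NATIVE = (str, int, float, bool, list, dict, tuple)

    def to_jsonable(value):
        # Mirror the widget's guard: stringify anything non-JSON-native,
        # then round floats to two decimal places.
        if not isinstance(value, JSON_NATIVE) and value is not None:
            value = str(value)
        if isinstance(value, float):
            value = round(value, 2)
        return value

    elem = {"x0": 12.3456, "font": object(), "text": "Total"}
    print(json.dumps({k: to_jsonable(v) for k, v in elem.items()}))
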