natural-pdf 0.1.15-py3-none-any.whl → 0.1.16-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/search/numpy_search_service.py

@@ -1,9 +1,9 @@
-import logging
-import numpy as np
 import json
+import logging
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
 
+import numpy as np
 from sentence_transformers import SentenceTransformer
 
 from .search_options import BaseSearchOptions
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
 
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
+
 class NumpySearchService(SearchServiceProtocol):
     """Basic in-memory vector search implementation using NumPy."""
 
@@ -34,19 +35,21 @@
                 "Persistence requested but LanceDB is not installed. "
                 "For persistent vector search, install LanceDB: pip install lancedb"
             )
-
+
         self.collection_name = collection_name
         self._embedding_model_name = embedding_model_name
         self.embedding_model = SentenceTransformer(self._embedding_model_name)
         self._embedding_dims = len(self.embedding_model.encode("test"))
-
+
         # Simple in-memory storage
         self._vectors = []
         self._documents = []
         self._metadata = []
         self._ids = []
-
-        logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+        logger.info(
+            f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'"
+        )
 
     def index(
         self,
@@ -55,70 +58,74 @@
         force_reindex: bool = False,
     ) -> None:
         if force_reindex:
-            logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+            logger.info(
+                f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors."
+            )
             self._vectors = []
             self._documents = []
             self._metadata = []
             self._ids = []
-
+
         items = list(documents)
         logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
-
+
         if not items:
             logger.warning("No documents provided for indexing. Skipping.")
             return
-
+
         texts_to_embed = []
         items_info = []
-
+
         for item in items:
             doc_id = item.get_id()
             metadata = item.get_metadata().copy()
             content_obj = item.get_content()
             content_text = ""
-
+
             if isinstance(content_obj, str):
                 content_text = content_obj
-            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+            elif hasattr(content_obj, "extract_text") and callable(
+                getattr(content_obj, "extract_text")
+            ):
                 content_text = content_obj.extract_text()
-                if not isinstance(content_text, str):
+                if not isinstance(content_text, str):
                     content_text = str(content_obj)
             else:
                 content_text = str(content_obj)
-
+
             # Try to add content hash to metadata
             try:
                 content_hash = item.get_content_hash()
-                if content_hash:
+                if content_hash:
                     metadata["content_hash"] = content_hash
             except (AttributeError, NotImplementedError):
                 pass
             except Exception as e:
                 logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
-
+
             texts_to_embed.append(content_text)
-            items_info.append({
-                "id": doc_id,
-                "metadata": metadata,
-                "text": content_text
-            })
-
+            items_info.append({"id": doc_id, "metadata": metadata, "text": content_text})
+
         if not texts_to_embed:
             logger.warning("No text content to embed. Skipping.")
             return
-
-        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+
+        logger.info(
+            f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+        )
         generated_embeddings = self.embedding_model.encode(
             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
         )
-
+
         for i, item_info in enumerate(items_info):
             self._vectors.append(generated_embeddings[i])
             self._documents.append(item_info["text"])
             self._metadata.append(item_info["metadata"])
             self._ids.append(item_info["id"])
-
-        logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+        logger.info(
+            f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}"
+        )
 
     def search(
         self,
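For context on the hunk above: index() embeds each document's extracted text and appends the vector, text, metadata, and id to four parallel in-memory lists, so position i in every list describes the same document. A minimal sketch of that storage pattern, using a hypothetical embed() stub in place of the SentenceTransformer call; this is an illustration, not the package's API:

```python
import numpy as np

def embed(texts):
    # Hypothetical stand-in for SentenceTransformer.encode(): one 384-dim vector per text.
    rng = np.random.default_rng(0)
    return rng.normal(size=(len(texts), 384))

class TinyIndex:
    """Toy illustration of the parallel-list storage model, not the package's class."""

    def __init__(self):
        # Position i in each list describes document i.
        self._vectors = []
        self._documents = []
        self._metadata = []
        self._ids = []

    def index(self, items):
        # items: list of (doc_id, text, metadata) tuples.
        texts = [text for _, text, _ in items]
        embeddings = embed(texts)
        for (doc_id, text, metadata), vector in zip(items, embeddings):
            self._vectors.append(vector)
            self._documents.append(text)
            self._metadata.append(metadata)
            self._ids.append(doc_id)

idx = TinyIndex()
idx.index([
    ("p1", "first page text", {"page_number": 1}),
    ("p2", "second page text", {"page_number": 2}),
])
print(len(idx._vectors), idx._ids)  # 2 ['p1', 'p2']
```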
@@ -128,7 +135,7 @@
         if not self._vectors:
             logger.debug("No vectors in index. Returning empty results.")
             return []
-
+
         # Process query to text
         query_text = ""
         if isinstance(query, (str, Path)):
@@ -139,28 +146,30 @@
                 return []
         else:
             raise TypeError(f"Unsupported query type: {type(query)}")
-
-        logger.info(
-            f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
+
+        logger.info(
+            f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}"
+        )
+
         # Encode query and perform similarity search
         query_vector = self.embedding_model.encode(query_text)
-
+
         # Convert list to numpy array for batch operations
         vectors_array = np.array(self._vectors)
-
+
         # Normalize vectors for cosine similarity
         query_norm = np.linalg.norm(query_vector)
         if query_norm > 0:
             query_vector = query_vector / query_norm
-
+
         # Normalize all vectors (avoid division by zero)
         vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
         valid_indices = vector_norms.flatten() > 0
         vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
-
+
         # Calculate cosine similarities
         similarities = np.dot(vectors_array, query_vector)
-
+
         # Apply filters if present
         filtered_indices = np.arange(len(similarities))
         if options.filters:
@@ -175,43 +184,49 @@
                         new_filtered.append(i)
                 filtered_indices = np.array(new_filtered)
             else:
-                logger.warning(
-                    f"Complex filter expressions not supported in NumPy backend: {options.filters}")
+                logger.warning(
+                    f"Complex filter expressions not supported in NumPy backend: {options.filters}"
+                )
+
         # Apply filtering and sort by similarity
         if len(filtered_indices) > 0:
             filtered_similarities = similarities[filtered_indices]
             top_k = min(options.top_k, len(filtered_similarities))
             if top_k == 0:
                 return []
-
+
             top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
             top_indices = filtered_indices[top_indices_within_filtered]
         else:
             top_k = min(options.top_k, len(similarities))
             if top_k == 0:
                 return []
-
+
             top_indices = np.argsort(similarities)[-top_k:][::-1]
-
+
         # Format results
         results = []
         for idx in top_indices:
             metadata = self._metadata[idx]
-            results.append({
-                "id": self._ids[idx],
-                "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
-                "score": float(similarities[idx]),
-                "page_number": metadata.get("page_number"),
-                "pdf_path": metadata.get("pdf_path"),
-                "metadata": metadata,
-            })
-
-        logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+            results.append(
+                {
+                    "id": self._ids[idx],
+                    "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                    "score": float(similarities[idx]),
+                    "page_number": metadata.get("page_number"),
+                    "pdf_path": metadata.get("pdf_path"),
+                    "metadata": metadata,
+                }
+            )
+
+        logger.info(
+            f"Search returned {len(results)} results from collection '{self.collection_name}'"
+        )
         return results
 
     def index_exists(self) -> bool:
         return len(self._vectors) > 0
-
+
     def delete_index(self) -> bool:
         logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
         self._vectors = []
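The search path in the two hunks above is plain cosine similarity: normalize the query vector and the stored vectors, take a dot product, then argsort for the top-k hits. A self-contained sketch of that computation with illustrative names, not the package's API:

```python
import numpy as np

def top_k_cosine(query_vector, stored_vectors, k=3):
    """Return (index, similarity) pairs for the k most similar stored vectors."""
    vectors = np.array(stored_vectors, dtype=float)
    query = np.array(query_vector, dtype=float)

    # Normalize the query, guarding against a zero vector.
    q_norm = np.linalg.norm(query)
    if q_norm > 0:
        query = query / q_norm

    # Normalize stored vectors row-wise, skipping all-zero rows to avoid division by zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    valid = norms.flatten() > 0
    vectors[valid] = vectors[valid] / norms[valid]

    # With unit-length vectors, cosine similarity reduces to a dot product.
    similarities = vectors @ query

    # argsort is ascending, so take the tail and reverse it to get best matches first.
    k = min(k, len(similarities))
    top = np.argsort(similarities)[-k:][::-1]
    return [(int(i), float(similarities[i])) for i in top]

rng = np.random.default_rng(1)
stored = rng.normal(size=(5, 8))
query = stored[2] + 0.01 * rng.normal(size=8)
print(top_k_cosine(query, stored, k=2))  # index 2 should rank first
```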
@@ -219,37 +234,43 @@
         self._metadata = []
         self._ids = []
         return True
-
+
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-        logger.debug(
-            f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
+        logger.debug(
+            f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})..."
+        )
+
         results = []
         for i, doc_id in enumerate(self._ids):
             doc_info = {"id": doc_id}
             if include_metadata:
                 doc_info["meta"] = self._metadata[i]
             results.append(doc_info)
-
-        logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+
+        logger.info(
+            f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'"
+        )
         return results
-
+
     def delete_documents(self, ids: List[str]) -> None:
         if not ids:
             logger.debug("No document IDs provided for deletion. Skipping.")
             return
-
-        logger.warning(
-            f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
+
+        logger.warning(
+            f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'"
+        )
+
         # Find indices to remove
         keep_indices = []
         for i, doc_id in enumerate(self._ids):
             if doc_id not in ids:
                 keep_indices.append(i)
-
+
         # Create new filtered lists
         self._ids = [self._ids[i] for i in keep_indices]
         self._vectors = [self._vectors[i] for i in keep_indices]
         self._documents = [self._documents[i] for i in keep_indices]
         self._metadata = [self._metadata[i] for i in keep_indices]
-
-        logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
+
+        logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
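delete_documents() in the hunk above removes entries by rebuilding every parallel list from the indices that survive, rather than deleting in place, which keeps the lists aligned. A small illustration of that pattern with hypothetical data, not the package's API:

```python
ids = ["a", "b", "c", "d"]
vectors = [[1.0], [2.0], [3.0], [4.0]]
documents = ["doc a", "doc b", "doc c", "doc d"]

to_delete = {"b", "d"}

# Collect the positions of every document we want to keep...
keep = [i for i, doc_id in enumerate(ids) if doc_id not in to_delete]

# ...then rebuild each parallel list in one pass so they stay aligned.
ids = [ids[i] for i in keep]
vectors = [vectors[i] for i in keep]
documents = [documents[i] for i in keep]

print(ids)        # ['a', 'c']
print(documents)  # ['doc a', 'doc c']
```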
natural_pdf/search/searchable_mixin.py

@@ -123,7 +123,7 @@ class SearchableMixin(ABC):
         logger.info(
             f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
         )
-
+
         # Direct creation without try/except
         service_args = {
             "collection_name": effective_collection_name,
@@ -195,7 +195,7 @@ class SearchableMixin(ABC):
         logger.debug(
             f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
         )
-
+
         # Call index without try/except
         self._search_service.index(
             documents=indexable_items,