natural_pdf-0.1.15-py3-none-any.whl → natural_pdf-0.1.17-py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -65,97 +65,62 @@ class EasyOCROptions(BaseOCROptions):
 # --- PaddleOCR Specific Options ---
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
-    """
-    [51 lines of the old docstring and option fields were not captured in this diff view]
-    rec: bool = True
-    rec_algorithm: str = "CRNN"
-    rec_model_dir: Optional[str] = None
-    rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
-    rec_batch_num: int = 30 # Default from Paddle documentation
-    rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
-    rec_batch_num: int = 30 # Default from Paddle documentation
-    max_text_length: int = 25
-    rec_char_dict_path: Optional[str] = None # Path to char dictionary file
-    rec_char_dict_path: Optional[str] = None # Path to char dictionary file
-    use_space_char: bool = True
-    drop_score: float = 0.5
-
-    # Classification
-    cls: Optional[bool] = None # Often inferred from use_angle_cls
-    use_angle_cls: bool = False # Default from Paddle documentation
-    cls_model_dir: Optional[str] = None
-    cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
-    label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
-    cls_batch_num: int = 30
-
-    # Classification
-    cls: Optional[bool] = None # Often inferred from use_angle_cls
-    use_angle_cls: bool = False # Default from Paddle documentation
-    cls_model_dir: Optional[str] = None
-    cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
-    label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
-    cls_batch_num: int = 30
+    """
+    Specific options for the PaddleOCR engine, reflecting the paddleocr>=3.0.0 API.
+    See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html
+    """
+
+    # --- Constructor Parameters ---
+
+    # Model paths and names
+    doc_orientation_classify_model_name: Optional[str] = None
+    doc_orientation_classify_model_dir: Optional[str] = None
+    doc_unwarping_model_name: Optional[str] = None
+    doc_unwarping_model_dir: Optional[str] = None
+    text_detection_model_name: Optional[str] = None
+    text_detection_model_dir: Optional[str] = None
+    textline_orientation_model_name: Optional[str] = None
+    textline_orientation_model_dir: Optional[str] = None
+    text_recognition_model_name: Optional[str] = None
+    text_recognition_model_dir: Optional[str] = None
+
+    # Module usage flags (can be overridden at predict time)
+    use_doc_orientation_classify: Optional[bool] = False
+    use_doc_unwarping: Optional[bool] = False
+    use_textline_orientation: Optional[bool] = False
+
+    # Batch sizes
+    textline_orientation_batch_size: Optional[int] = None
+    text_recognition_batch_size: Optional[int] = None
+
+    # Detection parameters (can be overridden at predict time)
+    # https://github.com/PaddlePaddle/PaddleOCR/issues/15424
+    text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
+    text_det_limit_type: Optional[str] = 'max' # WAITING FOR FIX
+    text_det_thresh: Optional[float] = None
+    text_det_box_thresh: Optional[float] = None
+    text_det_unclip_ratio: Optional[float] = None
+    text_det_input_shape: Optional[Tuple[int, int]] = None
+
+    # Recognition parameters (can be overridden at predict time)
+    text_rec_score_thresh: Optional[float] = None
+    text_rec_input_shape: Optional[Tuple[int, int, int]] = None
+
+    # General parameters
+    lang: Optional[str] = None
+    ocr_version: Optional[str] = None
+    device: Optional[str] = None
+    enable_hpi: Optional[bool] = None
+    use_tensorrt: Optional[bool] = None
+    precision: Optional[str] = None
+    enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
+    # mkldnn_cache_capacity: Optional[int] = None
+    cpu_threads: Optional[int] = None
+    paddlex_config: Optional[str] = None
 
     def __post_init__(self):
         pass
 
-        # if self.use_gpu is None:
-        #     if self.device and "cuda" in self.device.lower():
-        #         self.use_gpu = True
-        #     else:
-        #         self.use_gpu = False
-        # # logger.debug(f"Initialized PaddleOCROptions: {self}")
-
 
 # --- Surya Specific Options ---
 @dataclass
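The upshot of this rewrite is that PaddleOCROptions now mirrors the paddleocr>=3.0.0 pipeline constructor rather than the legacy rec_*/cls_* flags. A minimal usage sketch follows: the option fields come from the diff above, while the PDF and apply_ocr entry points are assumptions about natural-pdf's public API, not something this diff confirms.

    # Sketch only: option fields are from the diff above; PDF and apply_ocr
    # are assumed natural-pdf entry points.
    from natural_pdf import PDF
    from natural_pdf.ocr.ocr_options import PaddleOCROptions

    options = PaddleOCROptions(
        lang="en",
        device="cpu",
        use_doc_orientation_classify=False,  # module flags, overridable at predict time
        use_doc_unwarping=False,
        use_textline_orientation=False,
        text_recognition_batch_size=16,
    )

    pdf = PDF("document.pdf")
    pdf.pages[0].apply_ocr(engine="paddle", options=options)
    print(pdf.pages[0].extract_text()[:200])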
natural_pdf/search/__init__.py
CHANGED
@@ -4,8 +4,12 @@ import logging
 from typing import Optional
 
 # Import constants
-from .search_options import
-
+from .search_options import (
+    BaseSearchOptions,
+    MultiModalSearchOptions,
+    SearchOptions,
+    TextSearchOptions,
+)
 from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
 # Check search extras availability
@@ -13,21 +17,27 @@ LANCEDB_AVAILABLE = False
 SEARCH_DEPENDENCIES_AVAILABLE = False
 
 try:
-    import sentence_transformers
     import numpy as np
+    import sentence_transformers
+
     # Basic search dependencies are available
     SEARCH_DEPENDENCIES_AVAILABLE = True
-
+
     # Check if LanceDB is available
     try:
         import lancedb
         import pyarrow
+
         LANCEDB_AVAILABLE = True
-        from .lancedb_search_service import
+        from .lancedb_search_service import (
+            DEFAULT_EMBEDDING_MODEL,
+            DEFAULT_LANCEDB_PERSIST_PATH,
+            LanceDBSearchService,
+        )
     except ImportError:
         # LanceDB not available, we'll use NumPy fallback
         LANCEDB_AVAILABLE = False
-        from .numpy_search_service import
+        from .numpy_search_service import DEFAULT_EMBEDDING_MODEL, NumpySearchService
 except ImportError:
     # Basic dependencies missing
     SEARCH_DEPENDENCIES_AVAILABLE = False
@@ -35,6 +45,7 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
+
 def check_search_availability():
     """Check if required search dependencies are available."""
     if not SEARCH_DEPENDENCIES_AVAILABLE:
@@ -43,6 +54,7 @@ def check_search_availability():
             "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
         )
 
+
 def get_search_service(
     collection_name: str,
     persist: bool = False,
@@ -51,7 +63,7 @@
 ) -> SearchServiceProtocol:
     """
     Factory function to get an instance of the configured search service.
-
+
     Automatically selects the best available implementation:
     - LanceDB if installed (recommended for both in-memory and persistent)
     - Numpy fallback for in-memory only
@@ -84,16 +96,17 @@
     # If persistence is requested, LanceDB is required
     if persist and not LANCEDB_AVAILABLE:
         raise RuntimeError(
-            "Persistent vector search requires LanceDB. "
-            "Please install: pip install lancedb"
+            "Persistent vector search requires LanceDB. " "Please install: pip install lancedb"
         )
-
+
     # Select the appropriate implementation
    if LANCEDB_AVAILABLE:
         logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
         service_instance = LanceDBSearchService(**service_args)
     else:
-        logger.info(
+        logger.info(
+            f"Using NumPy fallback for in-memory vector search (collection: {collection_name})"
+        )
         service_instance = NumpySearchService(**service_args)
-
+
     return service_instance
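Taken together, the factory degrades gracefully: LanceDB is preferred whenever it is installed, persistence hard-requires it, and the NumPy service covers in-memory use. A short sketch of calling it; only collection_name and persist appear in the signature above, so treat everything else here as illustrative.

    # Sketch based on get_search_service as shown in this diff.
    from natural_pdf.search import check_search_availability, get_search_service

    check_search_availability()  # raises if sentence-transformers/numpy are missing

    # In-memory index: LanceDB if available, otherwise the NumPy fallback.
    service = get_search_service(collection_name="reports", persist=False)

    # Persistent index: raises RuntimeError when LanceDB is not installed.
    try:
        service = get_search_service(collection_name="reports", persist=True)
    except RuntimeError as err:
        print(f"pip install lancedb to enable persistence: {err}")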
natural_pdf/search/lancedb_search_service.py
CHANGED
@@ -63,20 +63,22 @@ class LanceDBSearchService(SearchServiceProtocol):
     def _get_schema(self) -> pa.Schema:
         if self._embedding_dims is None:
             raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
-
-        return pa.schema(
-        [5 lines of the old schema body were not captured in this diff view]
+
+        return pa.schema(
+            [
+                pa.field("id", pa.string(), nullable=False),
+                pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+                pa.field("text", pa.string()),
+                pa.field("metadata_json", pa.string()),
+            ]
+        )
 
     def _open_or_create_table(self):
         if self._db is None:
             raise RuntimeError("LanceDB connection not established.")
-
+
         table_names = self._db.table_names()
-
+
         if self.collection_name in table_names:
             logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
             self._table = self._db.open_table(self.collection_name)
@@ -86,7 +88,7 @@ class LanceDBSearchService(SearchServiceProtocol):
             self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
 
     def __del__(self):
-        if not self._persist and hasattr(self,
+        if not self._persist and hasattr(self, "_temp_dir_obj") and logger:
             logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
             self._temp_dir_obj.cleanup()
 
@@ -130,17 +132,23 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         if isinstance(content_obj, str):
             content_text = content_obj
-        elif hasattr(content_obj, "extract_text") and callable(
+        elif hasattr(content_obj, "extract_text") and callable(
+            getattr(content_obj, "extract_text")
+        ):
             content_text = content_obj.extract_text()
-            if not isinstance(content_text, str):
+            if not isinstance(content_text, str):
+                content_text = str(content_obj)
         else:
             content_text = str(content_obj)
 
         try:
             content_hash = item.get_content_hash()
-            if content_hash:
-
-        except
+            if content_hash:
+                metadata["content_hash"] = content_hash
+        except (AttributeError, NotImplementedError):
+            pass
+        except Exception as e:
+            logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
 
         # Ensure doc_id is not None - use a fallback if needed
         if doc_id is None:
@@ -151,28 +159,30 @@ class LanceDBSearchService(SearchServiceProtocol):
             doc_id = f"auto_{len(texts_to_embed)}"
 
         texts_to_embed.append(content_text)
-        original_items_info.append(
-            "id": doc_id,
-
-            "text": content_text
-        })
+        original_items_info.append(
+            {"id": doc_id, "metadata_json": json.dumps(metadata), "text": content_text}
+        )
 
         if not texts_to_embed:
             logger.warning("No text content to embed. Skipping.")
             return
 
-        logger.info(
+        logger.info(
+            f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+        )
         generated_embeddings = self.embedding_model.encode(
             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
         )
 
         for i, item_info in enumerate(original_items_info):
-            data_to_add.append(
-            [5 lines of the old entry body were not captured in this diff view]
+            data_to_add.append(
+                {
+                    "id": item_info["id"],
+                    "vector": generated_embeddings[i].tolist(),
+                    "text": item_info["text"],
+                    "metadata_json": item_info["metadata_json"],
+                }
+            )
 
         if not data_to_add:
             logger.warning("No data prepared for LanceDB. Skipping add.")
@@ -188,11 +198,17 @@ class LanceDBSearchService(SearchServiceProtocol):
         ]
         table = pa.Table.from_arrays(arrays, schema=schema)
 
-        logger.info(
-
+        logger.info(
+            f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'."
+        )
+        self._table.merge_insert(
+            "id"
+        ).when_matched_update_all().when_not_matched_insert_all().execute(
             table,
         )
-        logger.info(
+        logger.info(
+            f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}"
+        )
 
     def search(
         self,
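The rewritten add path is an upsert: merge_insert keyed on "id" updates rows whose IDs already exist and inserts the rest, so re-indexing the same documents does not duplicate them. A standalone sketch of the same LanceDB pattern against a throwaway table; the path, table name, and rows are illustrative.

    # Standalone illustration of the merge_insert upsert used above.
    import lancedb
    import pyarrow as pa

    db = lancedb.connect("/tmp/lancedb-demo")  # illustrative location
    schema = pa.schema(
        [
            pa.field("id", pa.string(), nullable=False),
            pa.field("vector", pa.list_(pa.float32(), list_size=4)),
            pa.field("text", pa.string()),
            pa.field("metadata_json", pa.string()),
        ]
    )
    table = db.create_table("demo", schema=schema, mode="overwrite")

    rows = [{"id": "doc-1", "vector": [0.1, 0.2, 0.3, 0.4], "text": "hello", "metadata_json": "{}"}]
    # Matching ids are updated, new ids are inserted -- an upsert keyed on "id".
    table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(rows)

Compared with a plain table.add(), this keeps exactly one row per document ID at the cost of a keyed merge.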
@@ -202,12 +218,16 @@ class LanceDBSearchService(SearchServiceProtocol):
         if self._table is None:
             raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
 
-        logger.info(
+        logger.info(
+            f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}"
+        )
         query_text = ""
-        if isinstance(query, (str, Path)):
+        if isinstance(query, (str, Path)):
+            query_text = str(query)
         elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
             query_text = query.extract_text()
-            if not query_text or not query_text.strip():
+            if not query_text or not query_text.strip():
+                return []
         else:
             raise TypeError(f"Unsupported query type: {type(query)}")
 
@@ -226,7 +246,9 @@ class LanceDBSearchService(SearchServiceProtocol):
                 filter_parts.append(f"{k} = {v}")
             if filter_parts:
                 lancedb_filter = " AND ".join(filter_parts)
-            logger.warning(
+            logger.warning(
+                f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions."
+            )
 
         search_query = self._table.search(query_vector).limit(options.top_k)
         if lancedb_filter:
@@ -246,15 +268,19 @@ class LanceDBSearchService(SearchServiceProtocol):
 
             score = 1 - row["_distance"] if "_distance" in row else 0.0
 
-            final_results.append(
-            [8 lines of the old result dict were not captured in this diff view]
+            final_results.append(
+                {
+                    "id": row.get("id"),
+                    "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+                    "score": score,
+                    "page_number": metadata.get("page_number"),
+                    "pdf_path": metadata.get("pdf_path"),
+                    "metadata": metadata,
+                }
+            )
+        logger.info(
+            f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'."
+        )
         return final_results
 
     def delete_index(self) -> bool:
@@ -262,29 +288,33 @@ class LanceDBSearchService(SearchServiceProtocol):
             logger.warning("LanceDB connection not initialized. Cannot delete index.")
             return False
         logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
-
+
         self._db.drop_table(self.collection_name)
         self._table = None
         logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
         return True
 
     def index_exists(self) -> bool:
-        if self._db is None:
+        if self._db is None:
             return False
         exists = self.collection_name in self._db.table_names()
         if exists:
             tbl = self._db.open_table(self.collection_name)
             count = tbl.count_rows()
-            logger.debug(
+            logger.debug(
+                f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}"
+            )
             return count > 0
-
+
         logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
         return False
 
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-        if self._table is None:
+        if self._table is None:
             raise RuntimeError("Table not initialized")
-        logger.debug(
+        logger.debug(
+            f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})..."
+        )
 
         select_columns = ["id"]
         if include_metadata:
@@ -298,6 +328,7 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         formatted_docs: List[Dict[str, Any]] = []
         import json
+
         for row in results_list:
             doc_data: Dict[str, Any] = {"id": row.get("id")}
             if include_metadata and "metadata_json" in row and row["metadata_json"]:
@@ -307,11 +338,13 @@ class LanceDBSearchService(SearchServiceProtocol):
                 except json.JSONDecodeError:
                     doc_data["meta"] = {}
             formatted_docs.append(doc_data)
-        logger.info(
+        logger.info(
+            f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'."
+        )
         return formatted_docs
 
     def delete_documents(self, ids: List[str]) -> None:
-        if self._table is None:
+        if self._table is None:
             raise RuntimeError("Table not initialized")
         if not ids:
             logger.debug("No document IDs provided for deletion. Skipping.")
@@ -319,7 +352,11 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
         delete_condition = f"id IN ({id_filter_string})"
-        logger.warning(
-
+        logger.warning(
+            f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}"
+        )
+
         self._table.delete(delete_condition)
-        logger.info(
+        logger.info(
+            f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}"
+        )
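Deletion in LanceDB is expressed as a SQL predicate, which is why delete_documents assembles an id IN (...) string before calling self._table.delete(). Continuing the throwaway table from the previous sketch:

    # Delete-by-predicate, mirroring delete_documents above.
    import lancedb

    db = lancedb.connect("/tmp/lancedb-demo")
    table = db.open_table("demo")

    ids = ["doc-1", "doc-2"]
    predicate = "id IN ({})".format(", ".join(f"'{i}'" for i in ids))
    table.delete(predicate)
    print(table.count_rows())

Note that the IDs are interpolated directly into the predicate string, so IDs containing single quotes would need escaping before this is safe on arbitrary input.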