PyPI - natural-pdf - Versions diffs - 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132) hide show

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +126 -98
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +910 -516
natural_pdf/core/pdf.py +387 -289
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +714 -514
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.3.dist-info/RECORD +0 -61
natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/collections/pdf_collection.py CHANGED Viewed

@@ -1,39 +1,50 @@
-import os
+import copy  # Added for copying options
 import glob as py_glob
 import logging
-from typing import List, Optional, Dict, Any, Union, Iterable, Set, TYPE_CHECKING, Type
+import os
+import re  # Added for safe path generation
 from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
 from PIL import Image
-import re # Added for safe path generation
-import copy # Added for copying options
 from tqdm import tqdm
 # Set up logger early
 logger = logging.getLogger(__name__)
 from natural_pdf.core.pdf import PDF
-from natural_pdf.elements.region import Region
+from natural_pdf.elements.region import Region
 # --- Search Imports ---
 try:
     from natural_pdf.search.search_service_protocol import (
-         SearchServiceProtocol, SearchOptions, Indexable
-     )
+        Indexable,
+        SearchOptions,
+        SearchServiceProtocol,
+    )
     from natural_pdf.search.searchable_mixin import SearchableMixin
 except ImportError as e:
     logger_init = logging.getLogger(__name__)
-    logger_init.error(f"Failed to import search components. Search functionality disabled. Error: {e}", exc_info=True)
+    logger_init.warning(
+        f"Failed to import Haystack components. Semantic search functionality disabled.",
+    )
     # Dummy definitions
-    class SearchableMixin: pass
+    class SearchableMixin:
+        pass
     SearchServiceProtocol, SearchOptions, Indexable = object, object, object
-from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
+from natural_pdf.search.searchable_mixin import SearchableMixin  # Import the new mixin
-class PDFCollection(SearchableMixin): # Inherit from the mixin
-    def __init__(self,
-                 source: Union[str, Iterable[Union[str, 'PDF']]],
-                 recursive: bool = True,
-                 **pdf_options: Any):
+class PDFCollection(SearchableMixin):  # Inherit from the mixin
+    def __init__(
+        self,
+        source: Union[str, Iterable[Union[str, "PDF"]]],
+        recursive: bool = True,
+        **pdf_options: Any,
+    ):
         """
         Initializes a collection of PDF documents from various sources.
@@ -46,27 +57,29 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
                        whether to search recursively (default: True).
             **pdf_options: Keyword arguments passed to the PDF constructor.
         """
-        self._pdfs: List['PDF'] = []
-        self._pdf_options = pdf_options # Store options for potential slicing later
-        self._recursive = recursive # Store setting for potential slicing
+        self._pdfs: List["PDF"] = []
+        self._pdf_options = pdf_options  # Store options for potential slicing later
+        self._recursive = recursive  # Store setting for potential slicing
         # Dynamically import PDF class within methods to avoid circular import at module load time
         PDF = self._get_pdf_class()
-        if hasattr(source, '__iter__') and not isinstance(source, str):
-             source_list = list(source)
-             if not source_list: return # Empty list source
-             if isinstance(source_list[0], PDF):
-                  if all(isinstance(item, PDF) for item in source_list):
-                       self._pdfs = source_list # Direct assignment
-                       # Don't adopt search context anymore
-                       return
-                  else: raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
-             # If it's an iterable but not PDFs, fall through to resolve sources
+        if hasattr(source, "__iter__") and not isinstance(source, str):
+            source_list = list(source)
+            if not source_list:
+                return  # Empty list source
+            if isinstance(source_list[0], PDF):
+                if all(isinstance(item, PDF) for item in source_list):
+                    self._pdfs = source_list  # Direct assignment
+                    # Don't adopt search context anymore
+                    return
+                else:
+                    raise TypeError("Iterable source has mixed PDF/non-PDF objects.")
+            # If it's an iterable but not PDFs, fall through to resolve sources
         # Resolve string, iterable of strings, or single string source to paths/URLs
         resolved_paths_or_urls = self._resolve_sources_to_paths(source)
-        self._initialize_pdfs(resolved_paths_or_urls, PDF) # Pass PDF class
+        self._initialize_pdfs(resolved_paths_or_urls, PDF)  # Pass PDF class
         self._iter_index = 0
@@ -79,15 +92,21 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
         try:
             # Import needs to resolve path correctly
             from natural_pdf.core.pdf import PDF
             return PDF
         except ImportError as e:
-            logger.error("Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime.")
+            logger.error(
+                "Could not import PDF class from natural_pdf.core.pdf. Ensure it exists and there are no circular imports at runtime."
+            )
             raise ImportError("PDF class is required but could not be imported.") from e
     # --- Internal Helpers ---
-    def _is_url(self, s: str) -> bool: return s.startswith(('http://', 'https://'))
-    def _has_glob_magic(self, s: str) -> bool: return py_glob.has_magic(s)
+    def _is_url(self, s: str) -> bool:
+        return s.startswith(("http://", "https://"))
+    def _has_glob_magic(self, s: str) -> bool:
+        return py_glob.has_magic(s)
     def _execute_glob(self, pattern: str) -> Set[str]:
         """Glob for paths and return a set of valid PDF paths."""
@@ -96,10 +115,10 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
             # Use iglob for potentially large directories/matches
             paths_iter = py_glob.iglob(pattern, recursive=self._recursive)
             for path_str in paths_iter:
-                 # Use Path object for easier checking
-                 p = Path(path_str)
-                 if p.is_file() and p.suffix.lower() == ".pdf":
-                      found_paths.add(str(p.resolve())) # Store resolved absolute path
+                # Use Path object for easier checking
+                p = Path(path_str)
+                if p.is_file() and p.suffix.lower() == ".pdf":
+                    found_paths.add(str(p.resolve()))  # Store resolved absolute path
         except Exception as e:
             logger.error(f"Error processing glob pattern '{pattern}': {e}")
         return found_paths
@@ -111,33 +130,37 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
         if isinstance(source, str):
             sources_to_process.append(source)
-        elif hasattr(source, '__iter__'):
+        elif hasattr(source, "__iter__"):
             sources_to_process.extend(list(source))
-        else: # Should not happen based on __init__ checks, but safeguard
-             raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
+        else:  # Should not happen based on __init__ checks, but safeguard
+            raise TypeError(f"Unexpected source type in _resolve_sources_to_paths: {type(source)}")
         for item in sources_to_process:
-             if not isinstance(item, str):
-                  logger.warning(f"Skipping non-string item in source list: {type(item)}")
-                  continue
-             item_path = Path(item)
-             if self._is_url(item):
-                 final_paths.add(item) # Add URL directly
-             elif self._has_glob_magic(item):
-                 glob_results = self._execute_glob(item)
-                 final_paths.update(glob_results)
-             elif item_path.is_dir():
-                 # Use glob to find PDFs in directory, respecting recursive flag
-                 dir_pattern = str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
-                 dir_glob_results = self._execute_glob(dir_pattern)
-                 final_paths.update(dir_glob_results)
-             elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
-                 final_paths.add(str(item_path.resolve())) # Add resolved file path
-             else:
-                 logger.warning(f"Source item ignored (not a valid URL, directory, file, or glob): {item}")
+            if not isinstance(item, str):
+                logger.warning(f"Skipping non-string item in source list: {type(item)}")
+                continue
+            item_path = Path(item)
+            if self._is_url(item):
+                final_paths.add(item)  # Add URL directly
+            elif self._has_glob_magic(item):
+                glob_results = self._execute_glob(item)
+                final_paths.update(glob_results)
+            elif item_path.is_dir():
+                # Use glob to find PDFs in directory, respecting recursive flag
+                dir_pattern = (
+                    str(item_path / "**" / "*.pdf") if self._recursive else str(item_path / "*.pdf")
+                )
+                dir_glob_results = self._execute_glob(dir_pattern)
+                final_paths.update(dir_glob_results)
+            elif item_path.is_file() and item_path.suffix.lower() == ".pdf":
+                final_paths.add(str(item_path.resolve()))  # Add resolved file path
+            else:
+                logger.warning(
+                    f"Source item ignored (not a valid URL, directory, file, or glob): {item}"
+                )
         return sorted(list(final_paths))
     def _initialize_pdfs(self, paths_or_urls: List[str], PDF_cls: Type):
@@ -149,32 +172,38 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
                 pdf_instance = PDF_cls(path_or_url, **self._pdf_options)
                 self._pdfs.append(pdf_instance)
             except Exception as e:
-                 logger.error(f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False) # Keep log concise
-                 failed_count += 1
+                logger.error(
+                    f"Failed to load PDF: {path_or_url}. Error: {e}", exc_info=False
+                )  # Keep log concise
+                failed_count += 1
         logger.info(f"Successfully initialized {len(self._pdfs)} PDFs. Failed: {failed_count}")
     # --- Public Factory Class Methods (Simplified) ---
     @classmethod
-    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> 'PDFCollection':
+    def from_paths(cls, paths_or_urls: List[str], **pdf_options: Any) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a list of file paths or URLs."""
         # __init__ can handle List[str] directly now
         return cls(paths_or_urls, **pdf_options)
     @classmethod
-    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+    def from_glob(cls, pattern: str, recursive: bool = True, **pdf_options: Any) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a single glob pattern."""
         # __init__ can handle single glob string directly
         return cls(pattern, recursive=recursive, **pdf_options)
     @classmethod
-    def from_globs(cls, patterns: List[str], recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+    def from_globs(
+        cls, patterns: List[str], recursive: bool = True, **pdf_options: Any
+    ) -> "PDFCollection":
         """Creates a PDFCollection explicitly from a list of glob patterns."""
-         # __init__ can handle List[str] containing globs directly
+        # __init__ can handle List[str] containing globs directly
         return cls(patterns, recursive=recursive, **pdf_options)
     @classmethod
-    def from_directory(cls, directory_path: str, recursive: bool = True, **pdf_options: Any) -> 'PDFCollection':
+    def from_directory(
+        cls, directory_path: str, recursive: bool = True, **pdf_options: Any
+    ) -> "PDFCollection":
         """Creates a PDFCollection explicitly from PDF files within a directory."""
         # __init__ can handle single directory string directly
         return cls(directory_path, recursive=recursive, **pdf_options)
@@ -183,12 +212,12 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
     def __len__(self) -> int:
         return len(self._pdfs)
-    def __getitem__(self, key) -> Union['PDF', 'PDFCollection']:
+    def __getitem__(self, key) -> Union["PDF", "PDFCollection"]:
         # Use dynamic import here as well
         PDF = self._get_pdf_class()
         if isinstance(key, slice):
             # Create a new collection with the sliced PDFs and original options
-            new_collection = PDFCollection.__new__(PDFCollection) # Create blank instance
+            new_collection = PDFCollection.__new__(PDFCollection)  # Create blank instance
             new_collection._pdfs = self._pdfs[key]
             new_collection._pdf_options = self._pdf_options
             new_collection._recursive = self._recursive
@@ -199,9 +228,9 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
             if 0 <= key < len(self._pdfs):
                 return self._pdfs[key]
             else:
-                 raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
+                raise IndexError(f"PDF index {key} out of range (0-{len(self._pdfs)-1}).")
         else:
-             raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
+            raise TypeError(f"PDF indices must be integers or slices, not {type(key)}.")
     def __iter__(self):
         return iter(self._pdfs)
@@ -211,24 +240,23 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
         return f"<PDFCollection(count={len(self)})>"
     @property
-    def pdfs(self) -> List['PDF']:
-         """Returns the list of PDF objects held by the collection."""
-         return self._pdfs
+    def pdfs(self) -> List["PDF"]:
+        """Returns the list of PDF objects held by the collection."""
+        return self._pdfs
-    # --- Other Methods (e.g., apply_ocr_to_pages - could leverage service in future?) ---
-    def apply_ocr_to_pages(self, *args, **kwargs):
+    def apply_ocr(self, *args, **kwargs):
         PDF = self._get_pdf_class()
         # Delegate to individual PDF objects
         logger.info("Applying OCR to relevant PDFs in collection...")
         results = []
         for pdf in self._pdfs:
-             # We need to figure out which pages belong to which PDF if batching here
-             # For now, simpler to call on each PDF
-             try:
-                 # Assume apply_ocr_to_pages exists on PDF and accepts similar args
-                 pdf.apply_ocr_to_pages(*args, **kwargs)
-             except Exception as e:
-                 logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+            # We need to figure out which pages belong to which PDF if batching here
+            # For now, simpler to call on each PDF
+            try:
+                # Assume apply_ocr exists on PDF and accepts similar args
+                pdf.apply_ocr(*args, **kwargs)
+            except Exception as e:
+                logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
         return self
     # --- Advanced Method Placeholders ---
@@ -237,23 +265,23 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
     def categorize(self, categories: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
-        raise NotImplementedError("categorize requires classification implementation.")
+        raise NotImplementedError("categorize requires classification implementation.")
-    # --- Mixin Required Implementation ---
+    # --- Mixin Required Implementation ---
     def get_indexable_items(self) -> Iterable[Indexable]:
         """Yields Page objects from the collection, conforming to Indexable."""
         if not self._pdfs:
-             return # Return empty iterator if no PDFs
+            return  # Return empty iterator if no PDFs
         for pdf in self._pdfs:
-             if not pdf.pages: # Handle case where a PDF might have 0 pages after loading
-                 logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
-                 continue
-             for page in pdf.pages:
-                 # Optional: Add filtering here if needed (e.g., skip empty pages)
-                 # Assuming Page object conforms to Indexable
-                 # We might still want the empty page check here for efficiency
-                 # if not page.extract_text(use_exclusions=False).strip():
-                 #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
-                 #     continue
-                 yield page
+            if not pdf.pages:  # Handle case where a PDF might have 0 pages after loading
+                logger.warning(f"PDF '{pdf.path}' has no pages. Skipping.")
+                continue
+            for page in pdf.pages:
+                # Optional: Add filtering here if needed (e.g., skip empty pages)
+                # Assuming Page object conforms to Indexable
+                # We might still want the empty page check here for efficiency
+                # if not page.extract_text(use_exclusions=False).strip():
+                #     logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
+                #     continue
+                yield page

natural_pdf/core/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 """
 Core classes for Natural PDF.
-"""
+"""

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl