PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

natural_pdf/elements/base.py CHANGED Viewed

@@ -2,13 +2,15 @@
 Base Element class for natural-pdf.
 """
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overload
 from PIL import Image
+# Import selector parsing functions
+from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
-    from natural_pdf.elements.base import Element
     from natural_pdf.elements.collections import ElementCollection
     from natural_pdf.elements.region import Region
@@ -135,19 +137,11 @@ class DirectionalMixin:
                 # Adjust cross boundaries if cross_size is 'element'
                 if cross_size == "element":
                     if is_horizontal:  # Adjust y0, y1
-                        target_y0 = (
-                            target.top if include_endpoint else target.bottom
-                        )  # Use opposite boundary if excluding
-                        target_y1 = target.bottom if include_endpoint else target.top
-                        y0 = min(y0, target_y0)
-                        y1 = max(y1, target_y1)
+                        y0 = min(y0, self.y0)
+                        y1 = max(y1, self.y1)
                     else:  # Adjust x0, x1
-                        target_x0 = (
-                            target.x0 if include_endpoint else target.x1
-                        )  # Use opposite boundary if excluding
-                        target_x1 = target.x1 if include_endpoint else target.x0
-                        x0 = min(x0, target_x0)
-                        x1 = max(x1, target_x1)
+                        x0 = min(x0, self.x0)
+                        x1 = max(x1, self.x1)
         # 4. Finalize bbox coordinates
         if is_horizontal:
@@ -525,7 +519,7 @@ class Element(DirectionalMixin):
             selector: Optional selector to filter by
             limit: Maximum number of elements to search through (default: 10)
             apply_exclusions: Whether to apply exclusion regions (default: True)
-            **kwargs: Additional parameters
+            **kwargs: Additional parameters for selector filtering (e.g., regex, case)
         Returns:
             Next element or None if not found
@@ -548,13 +542,19 @@ class Element(DirectionalMixin):
             # Limit search range for performance
             candidates = candidates[:limit] if limit else candidates
-            # Find matching elements
-            from natural_pdf.elements.collections import ElementCollection
+            # Parse the selector and create a filter function
+            parsed_selector = parse_selector(selector)
+            # Pass relevant kwargs (like regex, case) to the filter function builder
+            filter_func = selector_to_filter_func(parsed_selector, **kwargs)
+            # Iterate and return the first match
+            for candidate in candidates:
+                if filter_func(candidate):
+                    return candidate
+            return None  # No match found
-            matches = ElementCollection(candidates).find_all(selector, **kwargs)
-            return matches[0] if matches else None
+        # No selector, just return the next element if it exists
         elif idx + 1 < len(all_elements):
-            # No selector, just return the next element
             return all_elements[idx + 1]
         return None
@@ -573,7 +573,7 @@ class Element(DirectionalMixin):
             selector: Optional selector to filter by
             limit: Maximum number of elements to search through (default: 10)
             apply_exclusions: Whether to apply exclusion regions (default: True)
-            **kwargs: Additional parameters
+            **kwargs: Additional parameters for selector filtering (e.g., regex, case)
         Returns:
             Previous element or None if not found
@@ -598,13 +598,19 @@ class Element(DirectionalMixin):
             # Limit search range for performance
             candidates = candidates[:limit] if limit else candidates
-            # Find matching elements using ElementCollection
-            from natural_pdf.elements.collections import ElementCollection
+            # Parse the selector and create a filter function
+            parsed_selector = parse_selector(selector)
+            # Pass relevant kwargs (like regex, case) to the filter function builder
+            filter_func = selector_to_filter_func(parsed_selector, **kwargs)
+            # Iterate and return the first match (from reversed list)
+            for candidate in candidates:
+                if filter_func(candidate):
+                    return candidate
+            return None  # No match found
-            matches = ElementCollection(candidates).find_all(selector, **kwargs)
-            return matches[0] if matches else None  # find_all returns a collection
+        # No selector, just return the previous element if it exists
         elif idx > 0:
-            # No selector, just return the previous element
             return all_elements[idx - 1]
         return None
@@ -887,40 +893,128 @@ class Element(DirectionalMixin):
         """String representation of the element."""
         return f"<{self.__class__.__name__} bbox={self.bbox}>"
-    def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
+    @overload
+    def find(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional["Element"]: ...
+    @overload
+    def find(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional["Element"]: ...
+    def find(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> Optional["Element"]:
         """
-        Find first element within this element's bounds matching the selector.
+        Find first element within this element's bounds matching the selector OR text.
         Creates a temporary region to perform the search.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to apply exclusion regions
-            **kwargs: Additional parameters for element filtering
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to apply exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional parameters for element filtering.
         Returns:
-            First matching element or None
+            First matching element or None.
         """
         from natural_pdf.elements.region import Region
         # Create a temporary region from this element's bounds
         temp_region = Region(self.page, self.bbox)
-        return temp_region.find(selector, apply_exclusions=apply_exclusions, **kwargs)
+        # Delegate to the region's find method
+        return temp_region.find(
+            selector=selector,
+            text=text,
+            apply_exclusions=apply_exclusions,
+            regex=regex,
+            case=case,
+            **kwargs,
+        )
+    @overload
+    def find_all(
+        self,
+        *,
+        text: str,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
+    @overload
+    def find_all(
+        self,
+        selector: str,
+        *,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection": ...
-    def find_all(self, selector: str, apply_exclusions=True, **kwargs) -> "ElementCollection":
+    def find_all(
+        self,
+        selector: Optional[str] = None,
+        *,
+        text: Optional[str] = None,
+        apply_exclusions: bool = True,
+        regex: bool = False,
+        case: bool = True,
+        **kwargs,
+    ) -> "ElementCollection":
         """
-        Find all elements within this element's bounds matching the selector.
+        Find all elements within this element's bounds matching the selector OR text.
         Creates a temporary region to perform the search.
+        Provide EITHER `selector` OR `text`, but not both.
         Args:
-            selector: CSS-like selector string
-            apply_exclusions: Whether to apply exclusion regions
-            **kwargs: Additional parameters for element filtering
+            selector: CSS-like selector string.
+            text: Text content to search for (equivalent to 'text:contains(...)').
+            apply_exclusions: Whether to apply exclusion regions (default: True).
+            regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+            case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+            **kwargs: Additional parameters for element filtering.
         Returns:
-            ElementCollection with matching elements
+            ElementCollection with matching elements.
         """
         from natural_pdf.elements.region import Region
         # Create a temporary region from this element's bounds
         temp_region = Region(self.page, self.bbox)
-        return temp_region.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
+        # Delegate to the region's find_all method
+        return temp_region.find_all(
+            selector=selector,
+            text=text,
+            apply_exclusions=apply_exclusions,
+            regex=regex,
+            case=case,
+            **kwargs,
+        )

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl