PyPI - natural-pdf - Versions diffs - 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl - Mend

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (132)

docs/api/index.md +386 -0
docs/assets/favicon.png +3 -0
docs/assets/favicon.svg +3 -0
docs/assets/javascripts/custom.js +17 -0
docs/assets/logo.svg +3 -0
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +17 -0
docs/assets/social-preview.svg +17 -0
docs/assets/stylesheets/custom.css +65 -0
docs/document-qa/index.ipynb +435 -0
docs/document-qa/index.md +79 -0
docs/element-selection/index.ipynb +915 -0
docs/element-selection/index.md +229 -0
docs/index.md +170 -0
docs/installation/index.md +69 -0
docs/interactive-widget/index.ipynb +962 -0
docs/interactive-widget/index.md +12 -0
docs/layout-analysis/index.ipynb +818 -0
docs/layout-analysis/index.md +185 -0
docs/ocr/index.md +222 -0
docs/pdf-navigation/index.ipynb +314 -0
docs/pdf-navigation/index.md +97 -0
docs/regions/index.ipynb +816 -0
docs/regions/index.md +294 -0
docs/tables/index.ipynb +658 -0
docs/tables/index.md +144 -0
docs/text-analysis/index.ipynb +370 -0
docs/text-analysis/index.md +105 -0
docs/text-extraction/index.ipynb +1478 -0
docs/text-extraction/index.md +292 -0
docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
docs/tutorials/01-loading-and-extraction.md +95 -0
docs/tutorials/02-finding-elements.ipynb +340 -0
docs/tutorials/02-finding-elements.md +149 -0
docs/tutorials/03-extracting-blocks.ipynb +147 -0
docs/tutorials/03-extracting-blocks.md +48 -0
docs/tutorials/04-table-extraction.ipynb +114 -0
docs/tutorials/04-table-extraction.md +50 -0
docs/tutorials/05-excluding-content.ipynb +270 -0
docs/tutorials/05-excluding-content.md +109 -0
docs/tutorials/06-document-qa.ipynb +332 -0
docs/tutorials/06-document-qa.md +91 -0
docs/tutorials/07-layout-analysis.ipynb +260 -0
docs/tutorials/07-layout-analysis.md +66 -0
docs/tutorials/07-working-with-regions.ipynb +409 -0
docs/tutorials/07-working-with-regions.md +151 -0
docs/tutorials/08-spatial-navigation.ipynb +508 -0
docs/tutorials/08-spatial-navigation.md +190 -0
docs/tutorials/09-section-extraction.ipynb +2434 -0
docs/tutorials/09-section-extraction.md +256 -0
docs/tutorials/10-form-field-extraction.ipynb +484 -0
docs/tutorials/10-form-field-extraction.md +201 -0
docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
docs/tutorials/11-enhanced-table-processing.md +9 -0
docs/tutorials/12-ocr-integration.ipynb +586 -0
docs/tutorials/12-ocr-integration.md +188 -0
docs/tutorials/13-semantic-search.ipynb +1888 -0
docs/tutorials/13-semantic-search.md +77 -0
docs/visual-debugging/index.ipynb +2970 -0
docs/visual-debugging/index.md +157 -0
docs/visual-debugging/region.png +0 -0
natural_pdf/__init__.py +39 -20
natural_pdf/analyzers/__init__.py +2 -1
natural_pdf/analyzers/layout/base.py +32 -24
natural_pdf/analyzers/layout/docling.py +131 -72
natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
natural_pdf/analyzers/layout/layout_manager.py +98 -58
natural_pdf/analyzers/layout/layout_options.py +32 -17
natural_pdf/analyzers/layout/paddle.py +152 -95
natural_pdf/analyzers/layout/surya.py +164 -92
natural_pdf/analyzers/layout/tatr.py +149 -84
natural_pdf/analyzers/layout/yolo.py +84 -44
natural_pdf/analyzers/text_options.py +22 -15
natural_pdf/analyzers/text_structure.py +131 -85
natural_pdf/analyzers/utils.py +30 -23
natural_pdf/collections/pdf_collection.py +125 -97
natural_pdf/core/__init__.py +1 -1
natural_pdf/core/element_manager.py +416 -337
natural_pdf/core/highlighting_service.py +268 -196
natural_pdf/core/page.py +907 -513
natural_pdf/core/pdf.py +385 -287
natural_pdf/elements/__init__.py +1 -1
natural_pdf/elements/base.py +302 -214
natural_pdf/elements/collections.py +708 -508
natural_pdf/elements/line.py +39 -36
natural_pdf/elements/rect.py +32 -30
natural_pdf/elements/region.py +854 -883
natural_pdf/elements/text.py +122 -99
natural_pdf/exporters/__init__.py +0 -1
natural_pdf/exporters/searchable_pdf.py +261 -102
natural_pdf/ocr/__init__.py +23 -14
natural_pdf/ocr/engine.py +17 -8
natural_pdf/ocr/engine_easyocr.py +63 -47
natural_pdf/ocr/engine_paddle.py +97 -68
natural_pdf/ocr/engine_surya.py +54 -44
natural_pdf/ocr/ocr_manager.py +88 -62
natural_pdf/ocr/ocr_options.py +16 -10
natural_pdf/qa/__init__.py +1 -1
natural_pdf/qa/document_qa.py +119 -111
natural_pdf/search/__init__.py +37 -31
natural_pdf/search/haystack_search_service.py +312 -189
natural_pdf/search/haystack_utils.py +186 -122
natural_pdf/search/search_options.py +25 -14
natural_pdf/search/search_service_protocol.py +12 -6
natural_pdf/search/searchable_mixin.py +261 -176
natural_pdf/selectors/__init__.py +2 -1
natural_pdf/selectors/parser.py +159 -316
natural_pdf/templates/__init__.py +1 -1
natural_pdf/utils/highlighting.py +8 -2
natural_pdf/utils/reading_order.py +65 -63
natural_pdf/utils/text_extraction.py +195 -0
natural_pdf/utils/visualization.py +70 -61
natural_pdf/widgets/__init__.py +2 -3
natural_pdf/widgets/viewer.py +749 -718
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
natural_pdf-0.1.5.dist-info/RECORD +134 -0
natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
notebooks/Examples.ipynb +1293 -0
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +543 -0
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
tests/test_loading.py +50 -0
tests/test_optional_deps.py +298 -0
natural_pdf-0.1.4.dist-info/RECORD +0 -61
natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0

natural_pdf/elements/line.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """
 Line element class for natural-pdf.
 """
-from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
 from natural_pdf.elements.base import Element
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
 class LineElement(Element):
     """
     Represents a line element in a PDF.
     This class is a wrapper around pdfplumber's line objects,
     providing additional functionality for analysis and extraction.
     """
-    def __init__(self, obj: Dict[str, Any], page: 'Page'):
+    def __init__(self, obj: Dict[str, Any], page: "Page"):
         """
         Initialize a line element.
         Args:
             obj: The underlying pdfplumber object
             page: The parent Page object
         """
         super().__init__(obj, page)
     @property
     def type(self) -> str:
         """Element type."""
-        return 'line'
+        return "line"
     @property
     def color(self) -> Tuple:
         """Get the line color (RGB tuple)."""
         # PDFs often use non-RGB values, so we handle different formats
-        color = self._obj.get('stroking_color', (0, 0, 0))
+        color = self._obj.get("stroking_color", (0, 0, 0))
         # If it's a single value, treat as grayscale
         if isinstance(color, (int, float)):
             return (color, color, color)
         # If it's a tuple of 3 values, treat as RGB
         if isinstance(color, tuple) and len(color) == 3:
             return color
         # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
         if isinstance(color, tuple) and len(color) == 4:
             c, m, y, k = color
@@ -53,88 +54,90 @@ class LineElement(Element):
             g = 1 - min(1, m + k)
             b = 1 - min(1, y + k)
             return (r, g, b)
         # Default to black
         return (0, 0, 0)
     @property
     def width(self) -> float:
         """Get the line thickness (extracted from PDF properties)."""
-        return self._obj.get('linewidth', 0)
+        return self._obj.get("linewidth", 0)
     @property
     def is_horizontal(self) -> bool:
         """Check if this is a horizontal line based on coordinates."""
         # Calculate absolute difference in coordinates
         dx = abs(self.x1 - self.x0)
         dy = abs(self.top - self.bottom)
         # Define a tolerance for near-horizontal lines (e.g., 1 point)
-        tolerance = 1.0
+        tolerance = 1.0
         # Horizontal if y-change is within tolerance and x-change is significant
         return dy <= tolerance and dx > tolerance
     @property
     def is_vertical(self) -> bool:
         """Check if this is a vertical line based on coordinates."""
         # Calculate absolute difference in coordinates
         dx = abs(self.x1 - self.x0)
         dy = abs(self.top - self.bottom)
         # Define a tolerance for near-vertical lines (e.g., 1 point)
         tolerance = 1.0
         # Vertical if x-change is within tolerance and y-change is significant
         return dx <= tolerance and dy > tolerance
     def text_above(self, distance: float = 5, **kwargs) -> Any:
         """
         Get text elements above this line.
         Args:
             distance: Maximum distance above the line in points
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection of text elements above this line
         """
         from natural_pdf.elements.collections import ElementCollection
         # TODO: Implement proper filtering of elements above this line
         return ElementCollection([])  # Placeholder
     def text_below(self, distance: float = 5, **kwargs) -> Any:
         """
         Get text elements below this line.
         Args:
             distance: Maximum distance below the line in points
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection of text elements below this line
         """
         from natural_pdf.elements.collections import ElementCollection
         # TODO: Implement proper filtering of elements below this line
         return ElementCollection([])  # Placeholder
     def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
         """
         Lines don't have text, so this returns an empty string.
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
             **kwargs: Additional extraction parameters
         Returns:
             Empty string
         """
         return ""
     def __repr__(self) -> str:
         """String representation of the line element."""
-        line_type = "horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
-        return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"
+        line_type = (
+            "horizontal" if self.is_horizontal else "vertical" if self.is_vertical else "diagonal"
+        )
+        return f"<LineElement type={line_type} width={self.width:.1f} bbox={self.bbox}>"

natural_pdf/elements/rect.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """
 Rectangle element class for natural-pdf.
 """
-from typing import Dict, Any, Optional, Tuple, TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
 from natural_pdf.elements.base import Element
@@ -12,40 +13,40 @@ if TYPE_CHECKING:
 class RectangleElement(Element):
     """
     Represents a rectangle element in a PDF.
     This class is a wrapper around pdfplumber's rectangle objects,
     providing additional functionality for analysis and extraction.
     """
-    def __init__(self, obj: Dict[str, Any], page: 'Page'):
+    def __init__(self, obj: Dict[str, Any], page: "Page"):
         """
         Initialize a rectangle element.
         Args:
             obj: The underlying pdfplumber object
             page: The parent Page object
         """
         super().__init__(obj, page)
     @property
     def type(self) -> str:
         """Element type."""
-        return 'rect'
+        return "rect"
     @property
     def fill(self) -> Tuple:
         """Get the fill color of the rectangle (RGB tuple)."""
         # PDFs often use non-RGB values, so we handle different formats
-        color = self._obj.get('non_stroking_color', (0, 0, 0))
+        color = self._obj.get("non_stroking_color", (0, 0, 0))
         # If it's a single value, treat as grayscale
         if isinstance(color, (int, float)):
             return (color, color, color)
         # If it's a tuple of 3 values, treat as RGB
         if isinstance(color, tuple) and len(color) == 3:
             return color
         # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
         if isinstance(color, tuple) and len(color) == 4:
             c, m, y, k = color
@@ -53,24 +54,24 @@ class RectangleElement(Element):
             g = 1 - min(1, m + k)
             b = 1 - min(1, y + k)
             return (r, g, b)
         # Default to black
         return (0, 0, 0)
     @property
     def stroke(self) -> Tuple:
         """Get the stroke color of the rectangle (RGB tuple)."""
         # PDFs often use non-RGB values, so we handle different formats
-        color = self._obj.get('stroking_color', (0, 0, 0))
+        color = self._obj.get("stroking_color", (0, 0, 0))
         # If it's a single value, treat as grayscale
         if isinstance(color, (int, float)):
             return (color, color, color)
         # If it's a tuple of 3 values, treat as RGB
         if isinstance(color, tuple) and len(color) == 3:
             return color
         # If it's a tuple of 4 values, treat as CMYK and convert to approximate RGB
         if isinstance(color, tuple) and len(color) == 4:
             c, m, y, k = color
@@ -78,45 +79,46 @@ class RectangleElement(Element):
             g = 1 - min(1, m + k)
             b = 1 - min(1, y + k)
             return (r, g, b)
         # Default to black
         return (0, 0, 0)
     @property
     def stroke_width(self) -> float:
         """Get the stroke width of the rectangle."""
-        return self._obj.get('linewidth', 0)
+        return self._obj.get("linewidth", 0)
     def text_inside(self, **kwargs) -> Any:
         """
         Get text elements inside this rectangle.
         Args:
             **kwargs: Additional filter parameters
         Returns:
             ElementCollection of text elements inside this rectangle
         """
         from natural_pdf.elements.collections import ElementCollection
         # TODO: Implement proper filtering of elements inside this rectangle
         return ElementCollection([])  # Placeholder
     def extract_text(self, **kwargs) -> str:
         """
         Extract text from inside this rectangle.
         Args:
             **kwargs: Additional extraction parameters
         Returns:
             Extracted text as string
         """
         # Use the region to extract text
         from natural_pdf.elements.region import Region
         region = Region(self.page, self.bbox)
         return region.extract_text(**kwargs)
     def __repr__(self) -> str:
         """String representation of the rectangle element."""
-        return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"
+        return f"<RectangleElement fill={self.fill} stroke={self.stroke} bbox={self.bbox}>"

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl