PyPI - natural-pdf - Versions diffs - 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

natural_pdf/__init__.py +11 -6
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +252 -399
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +231 -89
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +405 -280
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +1658 -19
natural_pdf/flows/region.py +757 -263
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +35 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +101 -0
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0

natural_pdf/elements/text.py CHANGED Viewed

@@ -230,7 +230,9 @@ class TextElement(Element):
         # Default to black
         return (0, 0, 0)
-    def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
+    def extract_text(
+        self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
+    ) -> str:
         """
         Extract text from this element.
@@ -253,22 +255,22 @@ class TextElement(Element):
         # Apply content filtering if provided
         if content_filter is not None and result:
             import re
             if isinstance(content_filter, str):
                 # Single regex pattern - remove matching characters
                 try:
-                    result = re.sub(content_filter, '', result)
+                    result = re.sub(content_filter, "", result)
                 except re.error:
                     pass  # Invalid regex, skip filtering
             elif isinstance(content_filter, list):
                 # List of regex patterns - remove characters matching ANY pattern
                 try:
                     for pattern in content_filter:
-                        result = re.sub(pattern, '', result)
+                        result = re.sub(pattern, "", result)
                 except re.error:
                     pass  # Invalid regex, skip filtering
             elif callable(content_filter):
                 # Callable filter - apply to individual characters
                 try:
@@ -276,7 +278,7 @@ class TextElement(Element):
                     for char in result:
                         if content_filter(char):
                             filtered_chars.append(char)
-                    result = ''.join(filtered_chars)
+                    result = "".join(filtered_chars)
                 except Exception:
                     pass  # Function error, skip filtering

natural_pdf/exporters/base.py CHANGED Viewed

@@ -3,8 +3,8 @@ import logging
 from typing import TYPE_CHECKING, List, Union
 if TYPE_CHECKING:
-    from natural_pdf.collections.pdf_collection import PDFCollection
     from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf_collection import PDFCollection
 logger = logging.getLogger(__name__)
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
         """
         Helper to consistently resolve the input source to a list of PDF objects.
         """
-        from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
         from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
+        from natural_pdf.core.pdf_collection import PDFCollection  # Avoid circular import
         pdfs_to_process: List["PDF"] = []
         if isinstance(source, PDF):

natural_pdf/exporters/original_pdf.py CHANGED Viewed

@@ -17,8 +17,8 @@ except ImportError:
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.core.pdf import PDF
-    from natural_pdf.elements.collections import PageCollection
 logger = logging.getLogger(__name__)

natural_pdf/exporters/paddleocr.py CHANGED Viewed

@@ -13,8 +13,8 @@ from natural_pdf.exporters.base import FinetuneExporter
 from natural_pdf.utils.identifiers import generate_short_path_hash
 if TYPE_CHECKING:
-    from natural_pdf.collections.pdf_collection import PDFCollection
     from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf_collection import PDFCollection
     from natural_pdf.elements.text import TextElement
 logger = logging.getLogger(__name__)
@@ -216,9 +216,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
                     try:
                         # Expand region, render, and save image
                         region = element.expand(self.padding)
-                        img = region.to_image(
-                            resolution=self.resolution, crop=True, include_highlights=False
-                        )
+                        img = region.render(resolution=self.resolution, crop=True)
                         img.save(absolute_image_path, "PNG")
                         # Add to labels and character set

natural_pdf/exporters/searchable_pdf.py CHANGED Viewed

@@ -28,8 +28,8 @@ except ImportError:
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.core.pdf import PDF
-    from natural_pdf.elements.collections import PageCollection
 logger = logging.getLogger(__name__)
@@ -345,7 +345,8 @@ def create_searchable_pdf(
                 # 1. Render page image at target DPI
                 logger.debug(f"  Rendering page {i} to image ({dpi} DPI)...")
                 # Use the Page's to_image method
-                pil_image = page.to_image(resolution=dpi, include_highlights=False)
+                # Use render() for clean image without highlights
+                pil_image = page.render(resolution=dpi)
                 pil_image.save(img_path, format="PNG")
                 img_width, img_height = pil_image.size
                 logger.debug(f"  Image saved to {img_path} ({img_width}x{img_height})")

natural_pdf/extraction/mixin.py CHANGED Viewed

@@ -94,10 +94,8 @@ class ExtractionMixin(ABC):
                 resolution = kwargs.pop("resolution", 72)
                 include_highlights = kwargs.pop("include_highlights", False)
                 labels = kwargs.pop("labels", False)
-                return self.to_image(
+                return self.render(
                     resolution=resolution,
-                    include_highlights=include_highlights,
-                    labels=labels,
                     **kwargs,
                 )
             else:

natural_pdf/flows/collections.py CHANGED Viewed

@@ -8,7 +8,7 @@ if TYPE_CHECKING:
     # from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
     from natural_pdf.core.page import Page as PhysicalPage
     from natural_pdf.elements.base import Element as PhysicalElement
-    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.elements.element_collection import ElementCollection
     from .element import FlowElement
     from .flow import Flow  # Though not directly used in __init__, FlowRegion needs it.
@@ -632,73 +632,5 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
             all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
         return all_cropped_images
-    def to_image(
-        self,
-        stack_direction: str = "vertical",
-        background_color=(255, 255, 255),
-        gap: int = 5,
-        **kwargs_for_constituent_to_image,
-    ) -> Optional[Image.Image]:
-        """
-        Creates a single composite image by stacking the composite images of each FlowRegion.
-        Each FlowRegion's composite is generated by its own .to_image() method.
-        These are then stacked.
-        Args:
-            stack_direction: "vertical" or "horizontal".
-            background_color: Background for the final composite.
-            gap: Gap in pixels between stacked FlowRegion images.
-            **kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().
-        """
-        if not self._flow_regions:
-            return None
-        region_composites: List[Image.Image] = []
-        for fr in self._flow_regions:
-            img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
-            if img:
-                region_composites.append(img)
-        if not region_composites:
-            return None
-        if len(region_composites) == 1:
-            return region_composites[0]
-        if stack_direction == "vertical":
-            final_width = max(img.width for img in region_composites)
-            final_height = (
-                sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
-            )
-            if final_width == 0 or final_height == 0:
-                return None
-            new_image = Image.new("RGB", (final_width, final_height), background_color)
-            current_y = 0
-            for img in region_composites:
-                # Align to left for vertical stacking
-                new_image.paste(img, (0, current_y))
-                current_y += img.height + gap
-            return new_image
-        elif stack_direction == "horizontal":
-            final_width = (
-                sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
-            )
-            final_height = max(img.height for img in region_composites)
-            if final_width == 0 or final_height == 0:
-                return None
-            new_image = Image.new("RGB", (final_width, final_height), background_color)
-            current_x = 0
-            for img in region_composites:
-                # Align to top for horizontal stacking
-                new_image.paste(img, (current_x, 0))
-                current_x += img.width + gap
-            return new_image
-        else:
-            logger.warning(
-                f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'."
-            )
-            return None  # Or perhaps return the list of images?
     def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
         return [func(fr) for fr in self._flow_regions]

natural_pdf/flows/element.py CHANGED Viewed

@@ -73,6 +73,31 @@ class FlowElement:
         """Returns the physical page of the underlying element."""
         return getattr(self.physical_object, "page", None)
+    def __getattr__(self, name: str) -> Any:
+        """
+        Delegate unknown attribute access to the physical_object.
+        This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
+        from the physical element are accessible on the FlowElement wrapper.
+        Args:
+            name: The attribute name being accessed
+        Returns:
+            The attribute value from physical_object
+        Raises:
+            AttributeError: If the attribute doesn't exist on physical_object either
+        """
+        try:
+            return getattr(self.physical_object, name)
+        except AttributeError:
+            # Provide a helpful error message that mentions both FlowElement and physical_object
+            raise AttributeError(
+                f"'{type(self).__name__}' object has no attribute '{name}' "
+                f"(also not found on underlying {type(self.physical_object).__name__})"
+            )
     def _flow_direction(
         self,
         direction: str,  # "above", "below", "left", "right"

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl