PyPI - natural-pdf - Versions diffs - 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

natural_pdf/__init__.py +11 -6
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +252 -399
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +231 -89
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +405 -280
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +25 -0
natural_pdf/flows/flow.py +1658 -19
natural_pdf/flows/region.py +757 -263
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +35 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +101 -0
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0

natural_pdf/analyzers/layout/layout_analyzer.py CHANGED Viewed

@@ -86,9 +86,8 @@ class LayoutAnalyzer:
             layout_resolution = getattr(self._page._parent, "_config", {}).get(
                 "layout_image_resolution", 72
             )
-            std_res_page_image = self._page.to_image(
-                resolution=layout_resolution, include_highlights=False
-            )
+            # Use render() for clean image without highlights
+            std_res_page_image = self._page.render(resolution=layout_resolution)
             if not std_res_page_image:
                 raise ValueError("Initial page rendering returned None")
             logger.debug(

natural_pdf/analyzers/layout/layout_manager.py CHANGED Viewed

@@ -128,7 +128,17 @@ class LayoutManager:
                 engine_class = engine_class_or_factory
             detector_instance = engine_class()  # Instantiate
-            if not detector_instance.is_available():
+            # Try to check availability and capture any errors
+            availability_error = None
+            is_available = False
+            try:
+                is_available = detector_instance.is_available()
+            except Exception as e:
+                availability_error = e
+                logger.error(f"Error checking availability of {engine_name}: {e}", exc_info=True)
+            if not is_available:
                 # Check availability before storing
                 # Construct helpful error message with install hint
                 install_hint = ""
@@ -141,9 +151,13 @@ class LayoutManager:
                 else:
                     install_hint = f"(Check installation requirements for {engine_name})"
-                raise RuntimeError(
-                    f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
-                )
+                error_msg = f"Layout engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
+                # If we have an availability error, include it
+                if availability_error:
+                    error_msg += f"\nAvailability check error: {availability_error}"
+                raise RuntimeError(error_msg)
             self._detector_instances[engine_name] = detector_instance  # Store if available
         return self._detector_instances[engine_name]

natural_pdf/analyzers/layout/paddle.py CHANGED Viewed

@@ -42,13 +42,21 @@ logger = logging.getLogger(__name__)
 paddle_spec = importlib.util.find_spec("paddle") or importlib.util.find_spec("paddlepaddle")
 paddleocr_spec = importlib.util.find_spec("paddleocr")
 PPStructureV3 = None
+_paddle_import_error = None  # Store the import error for debugging
 if paddle_spec and paddleocr_spec:
     try:
         from paddleocr import PPStructureV3
     except ImportError as e:
+        _paddle_import_error = str(e)
         logger.warning(f"Could not import Paddle dependencies: {e}")
 else:
+    if not paddle_spec:
+        _paddle_import_error = "paddlepaddle not found"
+    elif not paddleocr_spec:
+        _paddle_import_error = "paddleocr not found"
+    else:
+        _paddle_import_error = "Unknown import issue"
     logger.warning(
         "paddlepaddle or paddleocr not found. PaddleLayoutDetector will not be available."
     )
@@ -82,6 +90,9 @@ class PaddleLayoutDetector(LayoutDetector):
     def is_available(self) -> bool:
         """Check if dependencies are installed."""
+        if PPStructureV3 is None and _paddle_import_error:
+            # Raise an informative error instead of just returning False
+            raise RuntimeError(f"Paddle dependencies check failed: {_paddle_import_error}")
         return PPStructureV3 is not None
     def _get_cache_key(self, options: BaseLayoutOptions) -> str:

natural_pdf/analyzers/layout/surya.py CHANGED Viewed

@@ -188,9 +188,8 @@ class SuryaLayoutDetector(LayoutDetector):
         high_res_dpi = getattr(self._page_ref._parent, "_config", {}).get(
             "surya_table_rec_dpi", 192
         )
-        high_res_page_image = self._page_ref.to_image(
-            resolution=high_res_dpi, include_highlights=False
-        )
+        # Use render() for clean image without highlights
+        high_res_page_image = self._page_ref.render(resolution=high_res_dpi)
         # Render high-res page ONCE
         self.logger.debug(

natural_pdf/analyzers/shape_detection_mixin.py CHANGED Viewed

@@ -10,8 +10,9 @@ from sklearn.cluster import MiniBatchKMeans
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
+    from natural_pdf.core.page_collection import PageCollection
     from natural_pdf.core.pdf import PDF
-    from natural_pdf.elements.collections import ElementCollection, PageCollection
+    from natural_pdf.elements.element_collection import ElementCollection
     from natural_pdf.elements.line import LineElement
     # from natural_pdf.elements.rect import RectangleElement # Removed
@@ -59,14 +60,13 @@ class ShapeDetectionMixin:
         # Determine the type of self and get the appropriate image and page context
         if (
-            hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height")
+            hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height")
         ):  # Page or Region
             if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):  # Region
                 logger.debug(f"Shape detection on Region: {self}")
                 page_obj = self._page
-                pil_image = self.to_image(
-                    resolution=resolution, crop=True, include_highlights=False
-                )
+                # Use render() for clean image without highlights, with cropping
+                pil_image = self.render(resolution=resolution, crop=True)
                 if pil_image:  # Ensure pil_image is not None before accessing attributes
                     origin_offset_pdf = (self.x0, self.top)
                     logger.debug(
@@ -75,7 +75,8 @@ class ShapeDetectionMixin:
             else:  # Page
                 logger.debug(f"Shape detection on Page: {self}")
                 page_obj = self
-                pil_image = self.to_image(resolution=resolution, include_highlights=False)
+                # Use render() for clean image without highlights
+                pil_image = self.render(resolution=resolution)
                 logger.debug(
                     f"Page image rendered successfully: {pil_image.width}x{pil_image.height}"
                 )
@@ -150,6 +151,12 @@ class ShapeDetectionMixin:
             origin_offset_pdf[1] + line_data_img["y2"] * effective_scale
         )  # y2 is the second y-coord
+        # Clamp coords to image dimensions
+        x0 = max(0, min(x0, page_obj.width))
+        top = max(0, min(top, page_obj.height))
+        x1 = max(0, min(x1, page_obj.width))
+        bottom = max(0, min(bottom, page_obj.height))
         # For lines, width attribute in PDF points
         line_width_pdf = line_data_img["width"] * effective_scale
@@ -158,7 +165,7 @@ class ShapeDetectionMixin:
             getattr(page_obj._page, "initial_doctop", 0) if hasattr(page_obj, "_page") else 0
         )
-        return {
+        attrs = {
             "x0": x0,
             "top": top,
             "x1": x1,
@@ -179,6 +186,8 @@ class ShapeDetectionMixin:
             "raw_line_position_px": line_data_img.get("line_position_px"),  # Added for clarity
         }
+        return attrs
     def _find_lines_on_image_data(
         self,
         cv_image: np.ndarray,
@@ -680,13 +689,12 @@ class ShapeDetectionMixin:
             return self
         pil_image_for_dims = None
-        if hasattr(self, "to_image") and hasattr(self, "width") and hasattr(self, "height"):
+        if hasattr(self, "render") and hasattr(self, "width") and hasattr(self, "height"):
             if hasattr(self, "x0") and hasattr(self, "top") and hasattr(self, "_page"):
-                pil_image_for_dims = self.to_image(
-                    resolution=resolution, crop=True, include_highlights=False
-                )
+                pil_image_for_dims = self.render(resolution=resolution, crop=True)
             else:
-                pil_image_for_dims = self.to_image(resolution=resolution, include_highlights=False)
+                # Use render() for clean image without highlights
+                pil_image_for_dims = self.render(resolution=resolution)
         if pil_image_for_dims is None:
             logger.warning(f"Could not re-render PIL image for dimensions for {self}.")
             pil_image_for_dims = Image.fromarray(cv_image)  # Ensure it's not None
@@ -710,7 +718,6 @@ class ShapeDetectionMixin:
                     logger.info(
                         f"Removed {removed_count} existing lines with source '{source_label}' from {page_object_ctx}"
                     )
         lines_data_img, profile_h_smoothed, profile_v_smoothed = self._find_lines_on_image_data(
             cv_image=cv_image,
             pil_image_rgb=pil_image_for_dims,
@@ -733,7 +740,6 @@ class ShapeDetectionMixin:
             smoothing_sigma_v=smoothing_sigma_v,
             peak_width_rel_height=peak_width_rel_height,
         )
         from natural_pdf.elements.line import LineElement
         element_manager = page_object_ctx._element_mgr
@@ -742,14 +748,8 @@ class ShapeDetectionMixin:
             element_constructor_data = self._convert_line_to_element_data(
                 line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
             )
-            try:
-                line_element = LineElement(element_constructor_data, page_object_ctx)
-                element_manager.add_element(line_element, element_type="lines")
-            except Exception as e:
-                logger.error(
-                    f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
-                    exc_info=True,
-                )
+            line_element = LineElement(element_constructor_data, page_object_ctx)
+            element_manager.add_element(line_element, element_type="lines")
         logger.info(
             f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using projection profiling."
@@ -826,14 +826,8 @@ class ShapeDetectionMixin:
             element_constructor_data = self._convert_line_to_element_data(
                 line_data_item_img, scale_factor, origin_offset_pdf, page_object_ctx, source_label
             )
-            try:
-                line_element = LineElement(element_constructor_data, page_object_ctx)
-                element_manager.add_element(line_element, element_type="lines")
-            except Exception as e:
-                logger.error(
-                    f"Failed to create or add LineElement: {e}. Data: {element_constructor_data}",
-                    exc_info=True,
-                )
+            line_element = LineElement(element_constructor_data, page_object_ctx)
+            element_manager.add_element(line_element, element_type="lines")
         logger.info(
             f"Detected and added {len(lines_data_img)} lines to {page_object_ctx} with source '{source_label}' using LSD."
@@ -1256,10 +1250,7 @@ class ShapeDetectionMixin:
                 and getattr(r, "source", None) == source_label
             ]
             for r in old_blobs:
-                try:
-                    page_obj._element_mgr.regions.remove(r)
-                except ValueError:
-                    pass
+                page_obj._element_mgr.regions.remove(r)
         # ── iterate clusters ───────────────────────────────────────────────────
         unique_clusters = [cid for cid in np.unique(labels_img) if cid >= 0]

natural_pdf/analyzers/text_structure.py CHANGED Viewed

@@ -14,7 +14,7 @@ from natural_pdf.analyzers.text_options import TextStyleOptions
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.base import Element
-    from natural_pdf.elements.collections import ElementCollection
+    from natural_pdf.elements.element_collection import ElementCollection
 logger = logging.getLogger(__name__)
@@ -282,7 +282,7 @@ class TextStyleAnalyzer:
     def analyze(
         self, page: "Page", options: Optional[TextStyleOptions] = None
     ) -> "ElementCollection":
-        from natural_pdf.elements.collections import ElementCollection
+        from natural_pdf.elements.element_collection import ElementCollection
         current_options = options or self.options
         logger.info(

natural_pdf/classification/manager.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import logging
-import time
 import threading  # Add threading for locks
+import time
 from datetime import datetime
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union

natural_pdf/collections/mixins.py CHANGED Viewed

@@ -92,9 +92,10 @@ class ApplyMixin:
         # Import here to avoid circular imports
         from natural_pdf import PDF, Page
-        from natural_pdf.collections.pdf_collection import PDFCollection
+        from natural_pdf.core.page_collection import PageCollection
+        from natural_pdf.core.pdf_collection import PDFCollection
         from natural_pdf.elements.base import Element
-        from natural_pdf.elements.collections import ElementCollection, PageCollection
+        from natural_pdf.elements.element_collection import ElementCollection
         from natural_pdf.elements.region import Region
         first_non_none = next((r for r in results if r is not None), None)

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl