PyPI - natural-pdf - Versions diffs - 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl - Mend

natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (52)

bad_pdf_analysis/analyze_10_more.py +300 -0
bad_pdf_analysis/analyze_final_10.py +552 -0
bad_pdf_analysis/analyze_specific_pages.py +394 -0
bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +45 -1
natural_pdf/analyzers/layout/surya.py +1 -1
natural_pdf/analyzers/layout/yolo.py +2 -2
natural_pdf/analyzers/shape_detection_mixin.py +228 -0
natural_pdf/classification/manager.py +67 -0
natural_pdf/core/element_manager.py +556 -25
natural_pdf/core/highlighting_service.py +98 -43
natural_pdf/core/page.py +86 -20
natural_pdf/core/pdf.py +0 -2
natural_pdf/describe/base.py +40 -9
natural_pdf/describe/elements.py +11 -6
natural_pdf/elements/base.py +134 -20
natural_pdf/elements/collections.py +43 -11
natural_pdf/elements/image.py +43 -0
natural_pdf/elements/region.py +64 -19
natural_pdf/elements/text.py +89 -11
natural_pdf/flows/collections.py +4 -4
natural_pdf/flows/region.py +17 -2
natural_pdf/ocr/engine_paddle.py +1 -1
natural_pdf/ocr/ocr_factory.py +8 -8
natural_pdf/ocr/ocr_manager.py +51 -1
natural_pdf/selectors/parser.py +27 -7
natural_pdf/tables/__init__.py +5 -0
natural_pdf/tables/result.py +101 -0
natural_pdf/utils/bidi_mirror.py +36 -0
natural_pdf/utils/visualization.py +15 -1
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
optimization/memory_comparison.py +172 -0
optimization/pdf_analyzer.py +410 -0
optimization/performance_analysis.py +397 -0
optimization/test_cleanup_methods.py +155 -0
optimization/test_memory_fix.py +162 -0
tools/bad_pdf_eval/__init__.py +1 -0
tools/bad_pdf_eval/analyser.py +302 -0
tools/bad_pdf_eval/collate_summaries.py +130 -0
tools/bad_pdf_eval/eval_suite.py +116 -0
tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
tools/bad_pdf_eval/llm_enrich.py +273 -0
tools/bad_pdf_eval/reporter.py +17 -0
tools/bad_pdf_eval/utils.py +127 -0
tools/rtl_smoke_test.py +80 -0
natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0

natural_pdf/describe/elements.py CHANGED Viewed

@@ -279,7 +279,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
     """Analyze typography patterns in text elements."""
     fonts = Counter()
     sizes = Counter()
-    styles = {'bold': 0, 'italic': 0}
+    styles = {'bold': 0, 'italic': 0, 'strikeout': 0, 'underline': 0, 'highlight': 0}
     colors = Counter()
     for element in elements:
@@ -302,6 +302,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
             styles['bold'] += 1
         if getattr(element, 'italic', False):
             styles['italic'] += 1
+        if getattr(element, 'strikeout', False):
+            styles['strikeout'] += 1
+        if getattr(element, 'underline', False):
+            styles['underline'] += 1
+        if getattr(element, 'highlight', False):
+            styles['highlight'] += 1
         # Color - use TextElement's color property
         color = getattr(element, 'color', None)
@@ -328,13 +334,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
     # Styles
     style_list = []
-    if styles['bold']:
-        style_list.append(f"{styles['bold']} bold")
-    if styles['italic']:
-        style_list.append(f"{styles['italic']} italic")
+    for style, count in styles.items():
+        if count > 0:
+            style_list.append(f"{count} {style}")
     if style_list:
         result['styles'] = ", ".join(style_list)
     # Colors
     if colors and len(colors) > 1:  # Only show if there are multiple colors
         result['colors'] = dict(colors.most_common())

natural_pdf/elements/base.py CHANGED Viewed

@@ -414,6 +414,114 @@ class DirectionalMixin:
         return new_region
+    # ------------------------------------------------------------------
+    # Spatial parent lookup
+    # ------------------------------------------------------------------
+    def parent(
+        self,
+        selector: Optional[str] = None,
+        *,
+        mode: str = "contains",  # "contains" | "center" | "overlap"
+    ) -> Optional["Element"]:
+        """Return the *smallest* element/region that encloses this one.
+        The search is purely geometric – no pre-existing hierarchy is assumed.
+        Parameters
+        ----------
+        selector : str, optional
+            CSS-style selector used to filter candidate containers first.
+        mode : str, default "contains"
+            How to decide if a candidate encloses this element.
+            • ``"contains"`` – candidate bbox fully contains *self* bbox.
+            • ``"center"``   – candidate contains the centroid of *self*.
+            • ``"overlap"``  – any bbox intersection > 0 pt².
+        Returns
+        -------
+        Element | Region | None
+            The smallest-area container that matches, or *None* if none found.
+        """
+        from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
+        # --- Gather candidates ------------------------------------------------
+        page = getattr(self, "page", None)
+        if page is None:
+            return None
+        # All basic elements
+        try:
+            candidates: List["Element"] = list(page.get_elements(apply_exclusions=False))
+        except Exception:
+            candidates = []
+        # Add detected regions if present
+        if hasattr(page, "_element_mgr") and hasattr(page._element_mgr, "regions"):
+            candidates.extend(list(page._element_mgr.regions))
+        # Remove self from pool
+        candidates = [c for c in candidates if c is not self]
+        # Apply selector filtering early if provided
+        if selector:
+            sel_obj = parse_selector(selector)
+            filt = selector_to_filter_func(sel_obj)
+            candidates = [c for c in candidates if filt(c)]
+        if not candidates:
+            return None
+        # Helper to extract bbox (x0, top, x1, bottom)
+        def _bbox(obj):
+            return extract_bbox(obj)
+        # Self metrics
+        self_bbox = _bbox(self)
+        if self_bbox is None:
+            return None
+        s_x0, s_y0, s_x1, s_y1 = self_bbox
+        s_cx = (s_x0 + s_x1) / 2
+        s_cy = (s_y0 + s_y1) / 2
+        matches: List["Element"] = []
+        for cand in candidates:
+            c_bbox = _bbox(cand)
+            if c_bbox is None:
+                continue
+            c_x0, c_y0, c_x1, c_y1 = c_bbox
+            if mode == "contains":
+                if c_x0 <= s_x0 and c_y0 <= s_y0 and c_x1 >= s_x1 and c_y1 >= s_y1:
+                    matches.append(cand)
+            elif mode == "center":
+                if c_x0 <= s_cx <= c_x1 and c_y0 <= s_cy <= c_y1:
+                    matches.append(cand)
+            elif mode == "overlap":
+                # Compute overlap rectangle
+                ox0 = max(c_x0, s_x0)
+                oy0 = max(c_y0, s_y0)
+                ox1 = min(c_x1, s_x1)
+                oy1 = min(c_y1, s_y1)
+                if ox1 > ox0 and oy1 > oy0:
+                    matches.append(cand)
+        if not matches:
+            return None
+        # Pick the smallest-area match
+        def _area(obj):
+            bb = _bbox(obj)
+            if bb is None:
+                return float("inf")
+            return (bb[2] - bb[0]) * (bb[3] - bb[1])
+        matches.sort(key=_area)
+        return matches[0]
 class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
     """
@@ -805,25 +913,17 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
     def highlight(
         self,
-        label: Optional[str] = None,
-        color: Optional[Union[Tuple, str]] = None,  # Allow string color
-        use_color_cycling: bool = False,
+        label: str = "",
+        color: Optional[Tuple[float, float, float]] = None,
+        use_color_cycling: bool = True,
         include_attrs: Optional[List[str]] = None,
         existing: str = "append",
     ) -> "Element":
-        """
-        Highlight this element on the page.
-        Args:
-            label: Optional label for the highlight
-            color: Color tuple/string for the highlight, or None to use automatic color
-            use_color_cycling: Force color cycling even with no label (default: False)
-            include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
-            existing: How to handle existing highlights - 'append' (default) or 'replace'
+        """Highlight the element with the specified colour.
-        Returns:
-            Self for method chaining
+        Highlight the element on the page.
         """
         # Access the correct highlighter service
         highlighter = self.page._highlighter
@@ -850,7 +950,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
     def show(
         self,
-        scale: float = 2.0,
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         color: Optional[Union[Tuple, str]] = "red",  # Default color for single element
@@ -862,7 +962,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
         Show the page with only this element highlighted temporarily.
         Args:
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             labels: Whether to include a legend for the highlight
             legend_position: Position of the legend
             color: Color to highlight this element (default: red)
@@ -874,6 +974,13 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
         Returns:
             PIL Image of the page with only this element highlighted, or None if error.
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         if not hasattr(self, "page") or not self.page:
             logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
             return None
@@ -909,7 +1016,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
             return service.render_preview(
                 page_index=self.page.index,
                 temporary_highlights=[temp_highlight_data],
-                scale=scale,
+                resolution=resolution,
                 width=width,  # Pass the width parameter
                 labels=labels,
                 legend_position=legend_position,
@@ -920,22 +1027,29 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
             return None
     def save(
-        self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
+        self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
     ) -> None:
         """
         Save the page with this element highlighted to an image file.
         Args:
             filename: Path to save the image to
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             labels: Whether to include a legend for labels
             legend_position: Position of the legend
         Returns:
             Self for method chaining
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Save the highlighted image
-        self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
+        self.page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
         return self
     # Note: save_image method removed in favor of save()

natural_pdf/elements/collections.py CHANGED Viewed

@@ -859,7 +859,7 @@ class ElementCollection(
         distinct: bool = False,
         include_attrs: Optional[List[str]] = None,
         # --- Rendering Parameters ---
-        scale: float = 2.0,
+        resolution: Optional[float] = None,
         labels: bool = True,  # Use 'labels' consistent with service
         legend_position: str = "right",
         render_ocr: bool = False,
@@ -884,7 +884,7 @@ class ElementCollection(
             label_format: F-string to format group labels if group_by is used.
             distinct: Highlight each element distinctly (overrides group_by/label).
             include_attrs: Attributes to display on individual highlights.
-            scale: Scale factor for rendering image.
+            resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
             labels: Whether to include a legend for the temporary highlights.
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
@@ -900,6 +900,18 @@ class ElementCollection(
         Raises:
             ValueError: If the collection is empty or elements are on different pages/PDFs.
         """
+        # Apply global options as defaults, but allow explicit parameters to override
+        import natural_pdf
+        # Use global options if parameters are not explicitly set
+        if width is None:
+            width = natural_pdf.options.image.width
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         if not self._elements:
             raise ValueError("Cannot show an empty collection.")
@@ -967,7 +979,7 @@ class ElementCollection(
             img = service.render_preview(
                 page_index=page.index,
                 temporary_highlights=highlight_data_list,
-                scale=scale,
+                resolution=resolution,
                 width=width,  # Pass the width parameter
                 labels=labels,  # Use 'labels'
                 legend_position=legend_position,
@@ -982,7 +994,7 @@ class ElementCollection(
     def save(
         self,
         filename: str,
-        scale: float = 2.0,
+        resolution: Optional[float] = None,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
@@ -993,7 +1005,7 @@ class ElementCollection(
         Args:
             filename: Path to save the image to
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
             width: Optional width for the output image in pixels
             labels: Whether to include a legend for labels
             legend_position: Position of the legend
@@ -1002,10 +1014,22 @@ class ElementCollection(
         Returns:
             Self for method chaining
         """
+        # Apply global options as defaults, but allow explicit parameters to override
+        import natural_pdf
+        # Use global options if parameters are not explicitly set
+        if width is None:
+            width = natural_pdf.options.image.width
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Use to_image to generate and save the image
         self.to_image(
             path=filename,
-            scale=scale,
+            resolution=resolution,
             width=width,
             labels=labels,
             legend_position=legend_position,
@@ -1016,7 +1040,7 @@ class ElementCollection(
     def to_image(
         self,
         path: Optional[str] = None,
-        scale: float = 2.0,
+        resolution: Optional[float] = None,
         width: Optional[int] = None,
         labels: bool = True,
         legend_position: str = "right",
@@ -1028,7 +1052,7 @@ class ElementCollection(
         Args:
             path: Optional path to save the image to
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
             width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
             labels: Whether to include a legend for labels
             legend_position: Position of the legend
@@ -1043,7 +1067,7 @@ class ElementCollection(
             # Generate the image using to_image
             return page.to_image(
                 path=path,
-                scale=scale,
+                resolution=resolution,
                 width=width,
                 labels=labels,
                 legend_position=legend_position,
@@ -1774,7 +1798,7 @@ class ElementCollection(
         self,
         padding: int = 1,
         threshold: float = 0.95,
-        resolution: float = 150,
+        resolution: Optional[float] = None,
         show_progress: bool = True,
     ) -> "ElementCollection":
         """
@@ -1786,12 +1810,20 @@ class ElementCollection(
         Args:
             padding: Number of pixels to keep as padding after trimming (default: 1)
             threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
-            resolution: Resolution for image rendering in DPI (default: 150)
+            resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
             show_progress: Whether to show a progress bar for the trimming operation
         Returns:
             New ElementCollection with trimmed regions
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         return self.apply(
             lambda element: element.trim(
                 padding=padding, threshold=threshold, resolution=resolution

natural_pdf/elements/image.py ADDED Viewed

@@ -0,0 +1,43 @@
+from typing import TYPE_CHECKING, Any, Dict, Tuple
+from natural_pdf.elements.base import Element
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+class ImageElement(Element):
+    """Represents a raster XObject (embedded image) on a PDF page."""
+    def __init__(self, obj: Dict[str, Any], page: "Page"):
+        super().__init__(obj, page)
+    # ------------------------------------------------------------------
+    # Simple attribute proxies
+    # ------------------------------------------------------------------
+    @property
+    def type(self) -> str:  # noqa: D401 – short description already given
+        return "image"
+    @property
+    def width(self) -> float:  # override just to use dict value directly
+        return float(self._obj.get("width", 0))
+    @property
+    def height(self) -> float:
+        return float(self._obj.get("height", 0))
+    @property
+    def srcsize(self) -> Tuple[float, float]:
+        """Original pixel dimensions of the embedded image (width, height)."""
+        return self._obj.get("srcsize", (None, None))
+    @property
+    def colorspace(self):  # raw pdfminer data
+        return self._obj.get("colorspace")
+    # No text extraction for images
+    def extract_text(self, *args, **kwargs) -> str:  # noqa: D401 – consistent signature
+        return ""
+    def __repr__(self):
+        return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"

natural_pdf/elements/region.py CHANGED Viewed

@@ -26,6 +26,11 @@ from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
+# ------------------------------------------------------------------
+# Table utilities
+# ------------------------------------------------------------------
+from natural_pdf.tables import TableResult
 # --- End Classification Imports --- #
@@ -590,8 +595,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
     def to_image(
         self,
-        scale: float = 2.0,
-        resolution: float = 150,
+        resolution: Optional[float] = None,
         crop: bool = False,
         include_highlights: bool = True,
         **kwargs,
@@ -600,7 +604,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Generate an image of just this region.
         Args:
-            resolution: Resolution in DPI for rendering (default: 150)
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             crop: If True, only crop the region without highlighting its boundaries
             include_highlights: Whether to include existing highlights (default: True)
             **kwargs: Additional parameters for page.to_image()
@@ -608,6 +612,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             PIL Image of just this region
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Handle the case where user wants the cropped region to have a specific width
         page_kwargs = kwargs.copy()
         effective_resolution = resolution  # Start with the provided resolution
@@ -633,7 +645,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         # First get the full page image with highlights if requested
         page_image = self._page.to_image(
-            scale=scale,
             resolution=effective_resolution,
             include_highlights=include_highlights,
             **page_kwargs,
@@ -683,7 +694,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
     def show(
         self,
-        scale: float = 2.0,
+        resolution: Optional[float] = None,
         labels: bool = True,
         legend_position: str = "right",
         # Add a default color for standalone show
@@ -696,7 +707,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Show the page with just this region highlighted temporarily.
         Args:
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             labels: Whether to include a legend for labels
             legend_position: Position of the legend
             color: Color to highlight this region (default: blue)
@@ -709,6 +720,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             PIL Image of the page with only this region highlighted
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         if not self._page:
             raise ValueError("Region must be associated with a page to show.")
@@ -737,7 +756,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         return service.render_preview(
             page_index=self._page.index,
             temporary_highlights=[temp_highlight_data],
-            scale=scale,
+            resolution=resolution,
             width=width,  # Pass the width parameter
             labels=labels,
             legend_position=legend_position,
@@ -745,31 +764,39 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         )
     def save(
-        self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
+        self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
     ) -> "Region":
         """
         Save the page with this region highlighted to an image file.
         Args:
             filename: Path to save the image to
-            scale: Scale factor for rendering
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             labels: Whether to include a legend for labels
             legend_position: Position of the legend
         Returns:
             Self for method chaining
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Highlight this region if not already highlighted
         self.highlight()
         # Save the highlighted image
-        self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
+        self._page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
         return self
     def save_image(
         self,
         filename: str,
-        resolution: float = 150,
+        resolution: Optional[float] = None,
         crop: bool = False,
         include_highlights: bool = True,
         **kwargs,
@@ -779,7 +806,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Args:
             filename: Path to save the image to
-            resolution: Resolution in DPI for rendering (default: 150)
+            resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
             crop: If True, only crop the region without highlighting its boundaries
             include_highlights: Whether to include existing highlights (default: True)
             **kwargs: Additional parameters for page.to_image()
@@ -787,6 +814,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         Returns:
             Self for method chaining
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Get the region image
         image = self.to_image(
             resolution=resolution,
@@ -803,7 +838,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         self,
         padding: int = 1,
         threshold: float = 0.95,
-        resolution: float = 150,
+        resolution: Optional[float] = None,
         pre_shrink: float = 0.5,
     ) -> "Region":
         """
@@ -817,7 +852,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
                       Higher values mean more strict whitespace detection.
                       E.g., 0.95 means if 95% of pixels in a row/column are white, consider it whitespace.
-            resolution: Resolution for image rendering in DPI (default: 150)
+            resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
             pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
                        This helps avoid detecting box borders/slivers as content.
@@ -834,6 +869,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
             # Conservative trimming with more padding
             loose = region.trim(padding=3, threshold=0.98)
         """
+        # Apply global options as defaults
+        import natural_pdf
+        if resolution is None:
+            if natural_pdf.options.image.resolution is not None:
+                resolution = natural_pdf.options.image.resolution
+            else:
+                resolution = 144  # Default resolution when none specified
         # Pre-shrink the region to avoid box slivers
         work_region = (
             self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
@@ -1172,7 +1215,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
         # --- NEW: Add tqdm control option --- #
         show_progress: bool = False,  # Controls progress bar for text method
-    ) -> List[List[Optional[str]]]:  # Return type allows Optional[str] for cells
+    ) -> TableResult:  # Return type allows Optional[str] for cells
         """
         Extract a table from this region.
@@ -1224,7 +1267,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
                     logger.debug(
                         f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
                     )
-                    return self._extract_table_from_cells(cell_regions_in_table)
+                    return TableResult(self._extract_table_from_cells(cell_regions_in_table))
                 # --------------------------------------------------------------- #
@@ -1280,19 +1323,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
         # Use the selected method
         if effective_method == "tatr":
-            return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
+            table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
         elif effective_method == "text":
             current_text_options = text_options.copy()
             current_text_options["cell_extraction_func"] = cell_extraction_func
             current_text_options["show_progress"] = show_progress
-            return self._extract_table_text(**current_text_options)
+            table_rows = self._extract_table_text(**current_text_options)
         elif effective_method == "pdfplumber":
-            return self._extract_table_plumber(table_settings)
+            table_rows = self._extract_table_plumber(table_settings)
         else:
             raise ValueError(
                 f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
             )
+        return TableResult(table_rows)
     def extract_tables(
         self,
         method: Optional[str] = None,

natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl

natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl