PyPI - natural-pdf - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl - Mend

natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

natural_pdf/analyzers/shape_detection_mixin.py +1373 -0
natural_pdf/classification/manager.py +2 -3
natural_pdf/collections/pdf_collection.py +19 -39
natural_pdf/core/highlighting_service.py +29 -38
natural_pdf/core/page.py +284 -187
natural_pdf/core/pdf.py +4 -4
natural_pdf/elements/base.py +54 -20
natural_pdf/elements/collections.py +160 -9
natural_pdf/elements/line.py +5 -0
natural_pdf/elements/region.py +380 -38
natural_pdf/exporters/paddleocr.py +51 -11
natural_pdf/flows/__init__.py +12 -0
natural_pdf/flows/collections.py +533 -0
natural_pdf/flows/element.py +382 -0
natural_pdf/flows/flow.py +216 -0
natural_pdf/flows/region.py +458 -0
natural_pdf/selectors/parser.py +163 -8
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/METADATA +2 -1
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/RECORD +22 -17
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/WHEEL +1 -1
natural_pdf/utils/tqdm_utils.py +0 -51
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.12.dist-info → natural_pdf-0.1.14.dist-info}/top_level.txt +0 -0

natural_pdf/elements/region.py CHANGED Viewed

@@ -13,6 +13,7 @@ from natural_pdf.classification.manager import ClassificationManager  # Keep for
 from natural_pdf.classification.mixin import ClassificationMixin
 from natural_pdf.elements.base import DirectionalMixin
 from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
+from natural_pdf.elements.text import TextElement # ADDED IMPORT
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility
 from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
 from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
@@ -20,11 +21,12 @@ from natural_pdf.utils.locks import pdf_render_lock  # Import the lock
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
-# --- NEW: Import tqdm utility --- #
-from natural_pdf.utils.tqdm_utils import get_tqdm
+from tqdm.auto import tqdm
 # --- End Classification Imports --- #
+# --- Shape Detection Mixin --- #
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+# --- End Shape Detection Mixin --- #
 if TYPE_CHECKING:
     # --- NEW: Add Image type hint for classification --- #
@@ -33,6 +35,7 @@ if TYPE_CHECKING:
     from natural_pdf.core.page import Page
     from natural_pdf.elements.collections import ElementCollection
     from natural_pdf.elements.text import TextElement
+    from natural_pdf.elements.base import Element # Added for type hint
 # Import OCRManager conditionally to avoid circular imports
 try:
@@ -44,7 +47,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
-class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
+class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
     """
     Represents a rectangular region on a page.
     """
@@ -103,7 +106,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         direction: str,
         size: Optional[float] = None,
         cross_size: str = "full",
-        include_element: bool = False,
+        include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
         **kwargs,
@@ -115,7 +118,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             direction: 'left', 'right', 'above', or 'below'
             size: Size in the primary direction (width for horizontal, height for vertical)
             cross_size: Size in the cross direction ('full' or 'element')
-            include_element: Whether to include this region's area in the result
+            include_source: Whether to include this region's area in the result
             until: Optional selector string to specify a boundary element
             include_endpoint: Whether to include the boundary element found by 'until'
             **kwargs: Additional parameters for the 'until' selector search
@@ -129,7 +132,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         is_positive = direction in ("right", "below")  # right/below are positive directions
         pixel_offset = 1  # Offset for excluding elements/endpoints
-        # 1. Determine initial boundaries based on direction and include_element
+        # 1. Determine initial boundaries based on direction and include_source
         if is_horizontal:
             # Initial cross-boundaries (vertical)
             y0 = 0 if cross_size == "full" else self.top
@@ -137,11 +140,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             # Initial primary boundaries (horizontal)
             if is_positive:  # right
-                x0_initial = self.x0 if include_element else self.x1 + pixel_offset
+                x0_initial = self.x0 if include_source else self.x1 + pixel_offset
                 x1_initial = self.x1  # This edge moves
             else:  # left
                 x0_initial = self.x0  # This edge moves
-                x1_initial = self.x1 if include_element else self.x0 - pixel_offset
+                x1_initial = self.x1 if include_source else self.x0 - pixel_offset
         else:  # Vertical
             # Initial cross-boundaries (horizontal)
             x0 = 0 if cross_size == "full" else self.x0
@@ -149,11 +152,11 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             # Initial primary boundaries (vertical)
             if is_positive:  # below
-                y0_initial = self.top if include_element else self.bottom + pixel_offset
+                y0_initial = self.top if include_source else self.bottom + pixel_offset
                 y1_initial = self.bottom  # This edge moves
             else:  # above
                 y0_initial = self.top  # This edge moves
-                y1_initial = self.bottom if include_element else self.top - pixel_offset
+                y1_initial = self.bottom if include_source else self.top - pixel_offset
         # 2. Calculate the final primary boundary, considering 'size' or page limits
         if is_horizontal:
@@ -245,7 +248,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         # 5. Create and return Region
         region = Region(self.page, final_bbox)
         region.source_element = self
-        region.includes_source = include_element
+        region.includes_source = include_source
         # Optionally store the boundary element if found
         if target:
             region.boundary_element = target
@@ -256,7 +259,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         height: Optional[float] = None,
         width: str = "full",
-        include_element: bool = False,
+        include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
         **kwargs,
@@ -267,7 +270,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             height: Height of the region above, in points
             width: Width mode - "full" for full page width or "element" for element width
-            include_element: Whether to include this region in the result (default: False)
+            include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify an upper boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
             **kwargs: Additional parameters
@@ -279,7 +282,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             direction="above",
             size=height,
             cross_size=width,
-            include_element=include_element,
+            include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
             **kwargs,
@@ -289,7 +292,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         height: Optional[float] = None,
         width: str = "full",
-        include_element: bool = False,
+        include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
         **kwargs,
@@ -300,7 +303,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             height: Height of the region below, in points
             width: Width mode - "full" for full page width or "element" for element width
-            include_element: Whether to include this region in the result (default: False)
+            include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a lower boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
             **kwargs: Additional parameters
@@ -312,7 +315,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             direction="below",
             size=height,
             cross_size=width,
-            include_element=include_element,
+            include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
             **kwargs,
@@ -322,7 +325,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         width: Optional[float] = None,
         height: str = "full",
-        include_element: bool = False,
+        include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
         **kwargs,
@@ -333,7 +336,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             width: Width of the region to the left, in points
             height: Height mode - "full" for full page height or "element" for element height
-            include_element: Whether to include this region in the result (default: False)
+            include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a left boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
             **kwargs: Additional parameters
@@ -345,7 +348,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             direction="left",
             size=width,
             cross_size=height,
-            include_element=include_element,
+            include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
             **kwargs,
@@ -355,7 +358,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         self,
         width: Optional[float] = None,
         height: str = "full",
-        include_element: bool = False,
+        include_source: bool = False,
         until: Optional[str] = None,
         include_endpoint: bool = True,
         **kwargs,
@@ -366,7 +369,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Args:
             width: Width of the region to the right, in points
             height: Height mode - "full" for full page height or "element" for element height
-            include_element: Whether to include this region in the result (default: False)
+            include_source: Whether to include this region in the result (default: False)
             until: Optional selector string to specify a right boundary element
             include_endpoint: Whether to include the boundary element in the region (default: True)
             **kwargs: Additional parameters
@@ -378,7 +381,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             direction="right",
             size=width,
             cross_size=height,
-            include_element=include_element,
+            include_source=include_source,
             until=until,
             include_endpoint=include_endpoint,
             **kwargs,
@@ -720,14 +723,36 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         Returns:
             PIL Image of just this region
         """
+        # Handle the case where user wants the cropped region to have a specific width
+        page_kwargs = kwargs.copy()
+        effective_resolution = resolution  # Start with the provided resolution
+        if crop_only and 'width' in kwargs:
+            target_width = kwargs['width']
+            # Calculate what resolution is needed to make the region crop have target_width
+            region_width_points = self.width  # Region width in PDF points
+            if region_width_points > 0:
+                # Calculate scale needed: target_width / region_width_points
+                required_scale = target_width / region_width_points
+                # Convert scale to resolution: scale * 72 DPI
+                effective_resolution = required_scale * 72.0
+                page_kwargs.pop('width')  # Remove width parameter to avoid conflicts
+                logger.debug(f"Region {self.bbox}: Calculated required resolution {effective_resolution:.1f} DPI for region crop width {target_width}")
+            else:
+                logger.warning(f"Region {self.bbox}: Invalid region width {region_width_points}, using original resolution")
         # First get the full page image with highlights if requested
         page_image = self._page.to_image(
-            scale=scale, resolution=resolution, include_highlights=include_highlights, **kwargs
+            scale=scale, resolution=effective_resolution, include_highlights=include_highlights, **page_kwargs
         )
-        # Calculate the crop coordinates - apply resolution scaling factor
-        # PDF coordinates are in points (1/72 inch), but image is scaled by resolution
-        scale_factor = resolution / 72.0  # Scale based on DPI
+        # Calculate the actual scale factor used by the page image
+        if page_image.width > 0 and self._page.width > 0:
+            scale_factor = page_image.width / self._page.width
+        else:
+            # Fallback to resolution-based calculation if dimensions are invalid
+            scale_factor = resolution / 72.0
         # Apply scaling to the coordinates
         x0 = int(self.x0 * scale_factor)
@@ -874,6 +899,233 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         image.save(filename)
         return self
+    def trim(self, padding: int = 1, threshold: float = 0.95, resolution: float = 150, pre_shrink: float = 0.5) -> "Region":
+        """
+        Trim visual whitespace from the edges of this region.
+        Similar to Python's string .strip() method, but for visual whitespace in the region image.
+        Renders the region to a grayscale image and treats rows/columns whose mean
+        brightness is at or above `threshold` as whitespace.
+        Args:
+            padding: Number of pixels (in the image rendered at `resolution`) to keep as
+                     padding after trimming (default: 1)
+            threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
+                      Higher values mean more strict whitespace detection.
+                      E.g., 0.95 means a row/column whose mean grayscale brightness
+                      (0.0 = black, 1.0 = white) is at least 0.95 is considered whitespace.
+            resolution: Resolution for image rendering in DPI (default: 150)
+            pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
+                       This helps avoid detecting box borders/slivers as content.
+        Returns:
+            New Region with visual whitespace trimmed from all edges.
+            Returns this region itself (unchanged) if no image could be rendered, the
+            rendered image has a zero dimension, or trimming yields invalid dimensions.
+            Returns a zero-size Region at this region's center if no content is detected.
+        Example:
+            # Basic trimming with 1 pixel padding and 0.5px pre-shrink
+            trimmed = region.trim()
+            # More aggressive trimming with no padding and no pre-shrink
+            tight = region.trim(padding=0, threshold=0.9, pre_shrink=0)
+            # Conservative trimming with more padding
+            loose = region.trim(padding=3, threshold=0.98)
+        """
+        # Pre-shrink the region to avoid box slivers
+        work_region = self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink) if pre_shrink > 0 else self
+        # Get the region image
+        image = work_region.to_image(resolution=resolution, crop_only=True, include_highlights=False)
+        if image is None:
+            logger.warning(f"Region {self.bbox}: Could not generate image for trimming. Returning original region.")
+            return self
+        # Local import: numpy is only needed for the pixel analysis below
+        import numpy as np
+        # Convert PIL image to numpy array
+        img_array = np.array(image.convert('L'))  # Convert to grayscale
+        height, width = img_array.shape
+        if height == 0 or width == 0:
+            logger.warning(f"Region {self.bbox}: Image has zero dimensions. Returning original region.")
+            return self
+        # Normalize pixel values to 0-1 range (255 = white = 1.0, 0 = black = 0.0)
+        normalized = img_array.astype(np.float32) / 255.0
+        # Find content boundaries by analyzing row and column averages
+        # Analyze rows (horizontal strips) to find top and bottom boundaries
+        row_averages = np.mean(normalized, axis=1)  # Average each row
+        content_rows = row_averages < threshold  # True where there's content (not whitespace)
+        # Find first and last rows with content
+        content_row_indices = np.where(content_rows)[0]
+        if len(content_row_indices) == 0:
+            # No content found, return a minimal region at the center
+            logger.warning(f"Region {self.bbox}: No content detected during trimming. Returning center point.")
+            center_x = (self.x0 + self.x1) / 2
+            center_y = (self.top + self.bottom) / 2
+            return Region(self.page, (center_x, center_y, center_x, center_y))
+        # Widen the content span by `padding` pixels, clamped to the image bounds
+        top_content_row = max(0, content_row_indices[0] - padding)
+        bottom_content_row = min(height - 1, content_row_indices[-1] + padding)
+        # Analyze columns (vertical strips) to find left and right boundaries
+        col_averages = np.mean(normalized, axis=0)  # Average each column
+        content_cols = col_averages < threshold  # True where there's content
+        content_col_indices = np.where(content_cols)[0]
+        if len(content_col_indices) == 0:
+            # No content found in columns either
+            logger.warning(f"Region {self.bbox}: No column content detected during trimming. Returning center point.")
+            center_x = (self.x0 + self.x1) / 2
+            center_y = (self.top + self.bottom) / 2
+            return Region(self.page, (center_x, center_y, center_x, center_y))
+        left_content_col = max(0, content_col_indices[0] - padding)
+        right_content_col = min(width - 1, content_col_indices[-1] + padding)
+        # Convert trimmed pixel coordinates back to PDF coordinates
+        # NOTE(review): assumes the image was rendered at exactly resolution/72 pixels per point;
+        # to_image() derives its own factor from the rendered page width - confirm they agree.
+        scale_factor = resolution / 72.0
+        # Calculate new PDF coordinates and ensure they are Python floats
+        trimmed_x0 = float(work_region.x0 + (left_content_col / scale_factor))
+        trimmed_top = float(work_region.top + (top_content_row / scale_factor))
+        trimmed_x1 = float(work_region.x0 + ((right_content_col + 1) / scale_factor))  # +1 because we want inclusive right edge
+        trimmed_bottom = float(work_region.top + ((bottom_content_row + 1) / scale_factor))  # +1 because we want inclusive bottom edge
+        # Ensure the trimmed region doesn't exceed the work region boundaries
+        final_x0 = max(work_region.x0, trimmed_x0)
+        final_top = max(work_region.top, trimmed_top)
+        final_x1 = min(work_region.x1, trimmed_x1)
+        final_bottom = min(work_region.bottom, trimmed_bottom)
+        # Ensure valid coordinates (width > 0, height > 0)
+        if final_x1 <= final_x0 or final_bottom <= final_top:
+            logger.warning(f"Region {self.bbox}: Trimming resulted in invalid dimensions. Returning original region.")
+            return self
+        # Create the trimmed region
+        trimmed_region = Region(self.page, (final_x0, final_top, final_x1, final_bottom))
+        # Expand back by the pre_shrink amount to restore original positioning
+        if pre_shrink > 0:
+            trimmed_region = trimmed_region.expand(left=pre_shrink, right=pre_shrink, top=pre_shrink, bottom=pre_shrink)
+        # Copy relevant metadata
+        trimmed_region.region_type = self.region_type
+        trimmed_region.normalized_type = self.normalized_type
+        trimmed_region.confidence = self.confidence
+        trimmed_region.model = self.model
+        trimmed_region.name = self.name
+        trimmed_region.label = self.label
+        trimmed_region.source = "trimmed"  # Indicate this is a derived region
+        trimmed_region.parent_region = self
+        logger.debug(f"Region {self.bbox}: Trimmed to {trimmed_region.bbox} (padding={padding}, threshold={threshold}, pre_shrink={pre_shrink})")
+        return trimmed_region
+    def clip(
+        self,
+        obj: Optional[Any] = None,
+        left: Optional[float] = None,
+        top: Optional[float] = None,
+        right: Optional[float] = None,
+        bottom: Optional[float] = None,
+    ) -> "Region":
+        """
+        Clip this region to specific bounds, either from another object with bbox or explicit coordinates.
+        The clipped region will be constrained to not exceed the specified boundaries.
+        You can provide either an object with bounding box properties, specific coordinates, or both.
+        When both are provided, the constraints are combined: the result is the intersection of
+        this region, the object's bbox, and the explicit coordinates. An explicit coordinate can
+        only tighten a bound; it never loosens one set by the object or by this region.
+        Args:
+            obj: Optional object with bbox properties (Region, Element, TextElement, etc.).
+                 If no bbox can be extracted from it, a warning is logged and it is ignored.
+            left: Optional left boundary (x0) to clip to
+            top: Optional top boundary to clip to
+            right: Optional right boundary (x1) to clip to
+            bottom: Optional bottom boundary to clip to
+        Returns:
+            New Region with bounds clipped to the specified constraints.
+            If the constraints leave no area (zero or negative width/height), a warning is
+            logged and a zero-size Region at the clipped top-left corner is returned.
+        Examples:
+            # Clip to another region's bounds
+            clipped = region.clip(container_region)
+            # Clip to any element's bounds
+            clipped = region.clip(text_element)
+            # Clip to specific coordinates
+            clipped = region.clip(left=100, right=400)
+            # Combine object bounds with an additional explicit constraint
+            clipped = region.clip(obj=container, bottom=page.height/2)
+        """
+        from natural_pdf.elements.base import extract_bbox
+        # Start with current region bounds
+        clip_x0 = self.x0
+        clip_top = self.top
+        clip_x1 = self.x1
+        clip_bottom = self.bottom
+        # Apply object constraints if provided
+        if obj is not None:
+            obj_bbox = extract_bbox(obj)
+            if obj_bbox is not None:
+                obj_x0, obj_top, obj_x1, obj_bottom = obj_bbox
+                # Constrain to the intersection with the provided object
+                clip_x0 = max(clip_x0, obj_x0)
+                clip_top = max(clip_top, obj_top)
+                clip_x1 = min(clip_x1, obj_x1)
+                clip_bottom = min(clip_bottom, obj_bottom)
+            else:
+                logger.warning(
+                    f"Region {self.bbox}: Cannot extract bbox from clipping object {type(obj)}. "
+                    "Object must have bbox property or x0/top/x1/bottom attributes."
+                )
+        # Apply explicit coordinate constraints (combined with the bounds above via max/min,
+        # so they can only tighten them)
+        if left is not None:
+            clip_x0 = max(clip_x0, left)
+        if top is not None:
+            clip_top = max(clip_top, top)
+        if right is not None:
+            clip_x1 = min(clip_x1, right)
+        if bottom is not None:
+            clip_bottom = min(clip_bottom, bottom)
+        # Ensure valid coordinates
+        if clip_x1 <= clip_x0 or clip_bottom <= clip_top:
+            logger.warning(
+                f"Region {self.bbox}: Clipping resulted in invalid dimensions "
+                f"({clip_x0}, {clip_top}, {clip_x1}, {clip_bottom}). Returning minimal region."
+            )
+            # Return a minimal region at the clip area's top-left
+            return Region(self.page, (clip_x0, clip_top, clip_x0, clip_top))
+        # Create the clipped region
+        clipped_region = Region(self.page, (clip_x0, clip_top, clip_x1, clip_bottom))
+        # Copy relevant metadata
+        clipped_region.region_type = self.region_type
+        clipped_region.normalized_type = self.normalized_type
+        clipped_region.confidence = self.confidence
+        clipped_region.model = self.model
+        clipped_region.name = self.name
+        clipped_region.label = self.label
+        clipped_region.source = "clipped"  # Indicate this is a derived region
+        clipped_region.parent_region = self
+        logger.debug(
+            f"Region {self.bbox}: Clipped to {clipped_region.bbox} "
+            f"(constraints: obj={type(obj).__name__ if obj else None}, "
+            f"left={left}, top={top}, right={right}, bottom={bottom})"
+        )
+        return clipped_region
     def get_elements(
         self, selector: Optional[str] = None, apply_exclusions=True, **kwargs
     ) -> List["Element"]:
@@ -1022,7 +1274,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
                 effective_method = "tatr"
             else:
-                effective_method = "text"
+                effective_method = "plumber"
         logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
@@ -1045,6 +1297,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
     def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
         """
         Extract table using pdfplumber's table extraction.
+        This method extracts the largest table within the region.
         Args:
             table_settings: Settings for pdfplumber table extraction
@@ -1055,12 +1308,12 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         # Create a crop of the page for this region
         cropped = self.page._page.crop(self.bbox)
-        # Extract table from the cropped area
-        tables = cropped.extract_tables(table_settings)
+        # Extract the single largest table from the cropped area
+        table = cropped.extract_table(table_settings)
-        # Return the first table or an empty list if none found
-        if tables:
-            return tables[0]
+        # Return the table or an empty list if none found
+        if table:
+            return table
         return []
     def _extract_table_tatr(self, use_ocr=False, ocr_config=None) -> List[List[str]]:
@@ -1261,8 +1514,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         unique_tops = cluster_coords(tops)
         unique_lefts = cluster_coords(lefts)
-        # --- Setup tqdm --- #
-        tqdm = get_tqdm()
         # Determine iterable for tqdm
         cell_iterator = cell_dicts
         if show_progress:
@@ -1777,7 +2028,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
     def get_sections(
         self, start_elements=None, end_elements=None, boundary_inclusion="both"
-    ) -> List["Region"]:
+    ) -> "ElementCollection[Region]":
         """
         Get sections within this region based on start/end elements.
@@ -1897,7 +2148,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
             section = self.get_section_between(start_element, end_element, boundary_inclusion)
             sections.append(section)
-        return sections
+        return ElementCollection(sections)
     def create_cells(self):
         """
@@ -2413,3 +2664,94 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
         return ElementCollection(cell_regions)
     # --- END NEW METHOD ---
+    def to_text_element(
+        self,
+        text_content: Optional[Union[str, Callable[["Region"], Optional[str]]]] = None,
+        source_label: str = "derived_from_region",
+        object_type: str = "word", # Or "char", controls how it's categorized
+        default_font_size: float = 10.0,
+        default_font_name: str = "RegionContent",
+        confidence: Optional[float] = None, # Allow overriding confidence
+        add_to_page: bool = False # NEW: Option to add to page
+    ) -> "TextElement":
+        """
+        Creates a new TextElement object based on this region's geometry.
+        The text for the new TextElement can be provided directly,
+        generated by a callback function, or left as None.
+        Args:
+            text_content:
+                - If a string, this will be the text of the new TextElement.
+                - If a callable, it will be called with this region instance
+                  and its return value (a string or None) will be the text.
+                  If the callable raises, the error is logged and the text is None.
+                - If None (default), or any other type, the TextElement's text will be None.
+            source_label: The 'source' attribute for the new TextElement.
+            object_type: The 'object_type' for the TextElement's data dict
+                         (e.g., "word", "char").
+            default_font_size: Font size stored on the new TextElement ('size' key).
+            default_font_name: Font name stored on the new TextElement ('fontname' key).
+            confidence: Confidence score for the text. If not specified, defaults to 1.0
+                        when the resulting text is not None and contains non-whitespace
+                        characters, otherwise 0.0.
+            add_to_page: If True, the created TextElement will be added to the
+                         region's parent page. (Default: False)
+                         If the page has no '_element_mgr', a warning is logged and the
+                         element is returned without being added.
+        Returns:
+            A new TextElement instance.
+        Raises:
+            ValueError: If the region does not have a valid 'page' attribute.
+        """
+        actual_text: Optional[str] = None
+        if isinstance(text_content, str):
+            actual_text = text_content
+        elif callable(text_content):
+            try:
+                actual_text = text_content(self)
+            except Exception as e:
+                logger.error(f"Error executing text_content callback for region {self.bbox}: {e}", exc_info=True)
+                actual_text = None # Ensure actual_text is None on error
+        # Derive a default confidence only when the caller did not supply one
+        final_confidence = confidence
+        if final_confidence is None:
+            final_confidence = 1.0 if actual_text is not None and actual_text.strip() else 0.0
+        if not hasattr(self, 'page') or self.page is None:
+            raise ValueError("Region must have a valid 'page' attribute to create a TextElement.")
+        # Data dict for the TextElement: geometry comes from this region, font values are the supplied defaults
+        elem_data = {
+            "text": actual_text,
+            "x0": self.x0,
+            "top": self.top,
+            "x1": self.x1,
+            "bottom": self.bottom,
+            "width": self.width,
+            "height": self.height,
+            "object_type": object_type,
+            "page_number": self.page.page_number,
+            "stroking_color": getattr(self, 'stroking_color', (0,0,0)),
+            "non_stroking_color": getattr(self, 'non_stroking_color', (0,0,0)),
+            "fontname": default_font_name,
+            "size": default_font_size,
+            "upright": True,
+            "direction": 1,
+            "adv": self.width,
+            "source": source_label,
+            "confidence": final_confidence,
+            "_char_dicts": []
+        }
+        text_element = TextElement(elem_data, self.page)
+        if add_to_page:
+            if hasattr(self.page, '_element_mgr') and self.page._element_mgr is not None:
+                # Map "word"/"char" to the plural names "words"/"chars"; any other object_type is passed through unchanged
+                add_as_type = "words" if object_type == "word" else "chars" if object_type == "char" else object_type
+                # No try/except here: errors raised by add_element propagate to the caller
+                self.page._element_mgr.add_element(text_element, element_type=add_as_type)
+                logger.debug(f"TextElement created from region {self.bbox} and added to page {self.page.page_number} as {add_as_type}.")
+            else:
+                page_num_str = str(self.page.page_number) if hasattr(self.page, 'page_number') else 'N/A'
+                logger.warning(f"Cannot add TextElement to page: Page {page_num_str} for region {self.bbox} is missing '_element_mgr'.")
+        return text_element

natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl