PyPI - natural-pdf - Versions diffs - 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl - Mend

natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

natural_pdf/analyzers/shape_detection_mixin.py +43 -3
natural_pdf/classification/manager.py +1 -1
natural_pdf/classification/mixin.py +35 -14
natural_pdf/classification/results.py +16 -1
natural_pdf/cli.py +1 -0
natural_pdf/core/highlighting_service.py +23 -0
natural_pdf/core/page.py +32 -2
natural_pdf/core/pdf.py +24 -4
natural_pdf/describe/base.py +11 -1
natural_pdf/describe/summary.py +26 -0
natural_pdf/elements/base.py +81 -3
natural_pdf/elements/collections.py +162 -101
natural_pdf/elements/region.py +187 -160
natural_pdf/elements/text.py +15 -7
natural_pdf/exporters/paddleocr.py +1 -1
natural_pdf/extraction/manager.py +2 -2
natural_pdf/extraction/mixin.py +295 -11
natural_pdf/extraction/result.py +28 -1
natural_pdf/flows/region.py +117 -2
natural_pdf/ocr/engine_surya.py +25 -5
natural_pdf/qa/__init__.py +2 -1
natural_pdf/qa/document_qa.py +166 -113
natural_pdf/qa/qa_result.py +55 -0
natural_pdf/selectors/parser.py +22 -0
natural_pdf/utils/text_extraction.py +34 -14
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/METADATA +22 -13
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/RECORD +31 -30
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.22.dist-info → natural_pdf-0.1.24.dist-info}/top_level.txt +0 -0

natural_pdf/elements/collections.py CHANGED Viewed

@@ -290,7 +290,13 @@ class ElementCollection(
         return ElementCollection(filtered)
-    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        preserve_whitespace: bool = True,
+        use_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all TextElements in the collection, optionally using
         pdfplumber's layout engine if layout=True is specified.
@@ -303,6 +309,7 @@ class ElementCollection(
                       `chars_to_textmap` function ONLY if `layout=True` is passed.
                       See Page.extract_text docstring for common parameters.
                       If `layout=False` or omitted, performs a simple join.
+            strip: Whether to strip whitespace from the extracted text.
         Returns:
             Combined text from elements, potentially with layout-based spacing.
@@ -399,6 +406,12 @@ class ElementCollection(
             result = "".join(c.get("text", "") for c in all_char_dicts)
             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
+        # Determine final strip flag – same rule as global helper unless caller overrides
+        strip_text = strip if strip is not None else (not use_layout)
+        if strip_text and isinstance(result, str):
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
         return result
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
@@ -852,6 +865,7 @@ class ElementCollection(
         render_ocr: bool = False,
         width: Optional[int] = None,  # Add width parameter
         page: Optional[Any] = None,  # NEW: Optional page parameter for empty collections
+        crop: bool = False,  # NEW: If True, crop output to element bounds
     ) -> Optional["Image.Image"]:
         """
         Generates a temporary preview image highlighting elements in this collection
@@ -875,6 +889,9 @@ class ElementCollection(
             legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
             render_ocr: Whether to render OCR text.
             width: Optional width for the output image in pixels.
+            crop: If True, crop the resulting image to the tight bounding box
+                        containing all elements in the collection. The elements are
+                        still highlighted first, then the image is cropped.
         Returns:
             PIL Image object of the temporary preview, or None if rendering fails or
@@ -931,7 +948,23 @@ class ElementCollection(
         # 2. Call render_preview on the HighlightingService
         try:
-            return service.render_preview(
+            # Calculate crop bounding box in PDF coordinates if crop is requested
+            crop_bbox = None
+            if crop:
+                try:
+                    crop_bbox = (
+                        min(el.x0 for el in self._elements),
+                        min(el.top for el in self._elements),
+                        max(el.x1 for el in self._elements),
+                        max(el.bottom for el in self._elements),
+                    )
+                except Exception as bbox_err:
+                    logger.error(
+                        f"Error determining crop bbox for collection show: {bbox_err}",
+                        exc_info=True,
+                    )
+            img = service.render_preview(
                 page_index=page.index,
                 temporary_highlights=highlight_data_list,
                 scale=scale,
@@ -939,7 +972,9 @@ class ElementCollection(
                 labels=labels,  # Use 'labels'
                 legend_position=legend_position,
                 render_ocr=render_ocr,
+                crop_bbox=crop_bbox,
             )
+            return img
         except Exception as e:
             logger.error(f"Error calling highlighting_service.render_preview: {e}", exc_info=True)
             return None
@@ -1798,8 +1833,40 @@ class ElementCollection(
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
+        # --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
+        from collections.abc import Sequence  # Local import to avoid top-level issues
+        # Detect if *obj* is a sequence meant to map one-to-one with the elements
+        clip_objs = None  # type: Optional[List[Any]]
+        if isinstance(obj, ElementCollection):
+            clip_objs = obj.elements
+        elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
+            clip_objs = list(obj)
+        if clip_objs is not None:
+            if len(clip_objs) != len(self._elements):
+                raise ValueError(
+                    f"Number of clipping objects ({len(clip_objs)}) does not match number of "
+                    f"elements in collection ({len(self._elements)})."
+                )
+            clipped_elements = [
+                el.clip(
+                    obj=clip_obj,
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+                for el, clip_obj in zip(self._elements, clip_objs)
+            ]
+            return ElementCollection(clipped_elements)
+        # Fallback to original behaviour: apply same clipping parameters to all elements
         return self.apply(
-            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+            lambda element: element.clip(
+                obj=obj, left=left, top=top, right=right, bottom=bottom
+            )
         )
@@ -1838,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
-    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        keep_blank_chars: bool = True,
+        apply_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all pages in the collection.
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
+            strip: Whether to strip whitespace from the extracted text.
             **kwargs: Additional extraction parameters
         Returns:
@@ -1853,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         texts = []
         for page in self.pages:
             text = page.extract_text(
-                keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
+                keep_blank_chars=keep_blank_chars,
+                apply_exclusions=apply_exclusions,
+                **kwargs,
             )
             texts.append(text)
-        return "\n".join(texts)
+        combined = "\n".join(texts)
+        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
+        use_layout = kwargs.get("layout", False)
+        strip_final = strip if strip is not None else (not use_layout)
+        if strip_final:
+            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
+        return combined
     def apply_ocr(
         self,
@@ -2253,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         # Generate sections
         sections = []
+        # --- Helper: build a FlowRegion spanning multiple pages ---
+        def _build_flow_region(start_el, end_el):
+            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
+            If *end_el* is None, the region continues to the bottom of the last
+            page in this PageCollection."""
+            # Local imports to avoid top-level cycles
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.flow import Flow
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+            start_pg = start_el.page
+            end_pg = end_el.page if end_el is not None else self.pages[-1]
+            parts: list[Region] = []
+            # Slice of first page
+            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+            # Full middle pages
+            for pg_idx in range(start_pg.index + 1, end_pg.index):
+                mid_pg = self.pages[pg_idx]
+                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
+            # Slice of last page (if distinct)
+            if end_pg is not start_pg:
+                bottom = end_el.bottom if end_el is not None else end_pg.height
+                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
+            flow = Flow(segments=parts, arrangement="vertical")
+            src_fe = FlowElement(physical_object=start_el, flow=flow)
+            return FlowRegion(flow=flow,
+                               constituent_regions=parts,
+                               source_flow_element=src_fe,
+                               boundary_element_found=end_el)
+        # ------------------------------------------------------------------
         current_start = None
         for i, boundary in enumerate(section_boundaries):
@@ -2273,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
                     sections.append(section)
                 else:
-                    # Create a multi-page section
-                    from natural_pdf.elements.region import Region
-                    # Get the start and end pages
-                    start_page = start_element.page
-                    end_page = end_element.page
-                    # Create a combined region
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, end_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, end_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == end_page and e.bottom <= end_element.bottom
-                    ]
-                    combined_elements.extend(last_page_elements)
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-                    sections.append(combined_region)
+                    # Create FlowRegion spanning pages
+                    flow_region = _build_flow_region(start_element, end_element)
+                    sections.append(flow_region)
                 current_start = None
@@ -2372,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
-                # Create a multi-page section
-                from natural_pdf.elements.region import Region
-                if start_page == last_page:
-                    # Simple case - both on same page
-                    section = start_page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create a multi-page section
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, last_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, last_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == last_page
-                        and (end_element is None or e.bottom <= end_element.bottom)
-                    ]
-                    combined_elements.extend(last_page_elements)
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-                    sections.append(combined_region)
+                # Create FlowRegion spanning multiple pages using helper
+                flow_region = _build_flow_region(start_element, end_element)
+                sections.append(flow_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
@@ -2607,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
     def to_image(
         self,
-        page_width: int = 300,
+        page_width: Optional[int] = None,
         cols: Optional[int] = 4,
         rows: Optional[int] = None,
         max_pages: Optional[int] = None,
         spacing: int = 10,
-        add_labels: bool = True,
-        show_category: bool = False,  # Add new flag
+        add_labels: bool = True,  # Add new flag
+        show_category: bool = False,
     ) -> Optional["Image.Image"]:
         """
         Generate a grid of page images for this collection.
@@ -2630,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Returns:
             PIL Image of the page grid or None if no pages
         """
+        # Determine default page width from global options if not explicitly provided
+        if page_width is None:
+            try:
+                import natural_pdf
+                page_width = natural_pdf.options.image.width or 300
+            except Exception:
+                # Fallback if natural_pdf import fails in some edge context
+                page_width = 300
         # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
         try:
             from PIL import Image, ImageDraw, ImageFont
@@ -2927,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 # Re-raise the exception caught from the exporter
                 raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
             # <--- END MODIFIED
+    # Alias .to_image() to .show() for convenience
+    def show(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional["Image.Image"]:
+        """Display pages similarly to ``to_image``.
+        This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
+        ElementCollection, where ``show()`` already exists. It forwards all
+        arguments and returns the resulting ``PIL.Image`` instance.
+        """
+        return self.to_image(*args, **kwargs)

natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl

natural-pdf 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl