natural-pdf 0.1.23__py3-none-any.whl → 0.1.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1490,6 +1490,45 @@ class ShapeDetectionMixin:
 
         element_manager = page_object_for_elements._element_mgr
 
+        # ------------------------------------------------------------------
+        # CLEAN-UP existing table-related regions from earlier runs to avoid duplicates
+        # ------------------------------------------------------------------
+        try:
+            _purge_types = {"table", "table_row", "table_column", "table_cell"}
+
+            if (
+                hasattr(element_manager, "_elements")
+                and "regions" in element_manager._elements
+            ):
+                _orig_len = len(element_manager._elements["regions"])
+                element_manager._elements["regions"] = [
+                    r
+                    for r in element_manager._elements["regions"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+                _removed = _orig_len - len(element_manager._elements["regions"])
+                if _removed:
+                    logger.info(
+                        f"Removed {_removed} previous table-related regions (source='{source_label}') before regeneration."
+                    )
+
+            if hasattr(page_object_for_elements, "_regions") and "detected" in page_object_for_elements._regions:
+                page_object_for_elements._regions["detected"] = [
+                    r
+                    for r in page_object_for_elements._regions["detected"]
+                    if not (
+                        getattr(r, "source", None) == source_label
+                        and getattr(r, "region_type", None) in _purge_types
+                    )
+                ]
+        except Exception as _cleanup_err:
+            logger.warning(
+                f"Table-region cleanup failed: {_cleanup_err}", exc_info=True
+            )
+
         # Get lines with the specified source
         all_lines = element_manager.lines  # Access lines from the correct element manager
         filtered_lines = [
@@ -1724,6 +1763,7 @@ class ShapeDetectionMixin:
         logger.info(
             f"Created {tables_created} table, {rows_created} rows, {cols_created} columns, and {cells_created} table cells from detected lines (source: '{source_label}') for {self}."
         )
+
         return self
 
 
@@ -863,10 +863,10 @@ class HighlightingService:
         if crop_bbox is not None:
             cb_x0, cb_top, cb_x1, cb_bottom = crop_bbox
             # Convert to pixel coordinates using actual scales
-            left_px = int(cb_x0 * actual_scale_x) - 2
-            top_px = int(cb_top * actual_scale_y) - 2
-            right_px = int(cb_x1 * actual_scale_x) + 2
-            bottom_px = int(cb_bottom * actual_scale_y) + 2
+            left_px = int(cb_x0 * actual_scale_x) - 1
+            top_px = int(cb_top * actual_scale_y) - 1
+            right_px = int(cb_x1 * actual_scale_x) + 1
+            bottom_px = int(cb_bottom * actual_scale_y) + 1
 
             # Safeguard coordinates within bounds
             left_px = max(0, min(left_px, rendered_image.width - 1))
natural_pdf/core/page.py CHANGED
@@ -2235,12 +2235,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
 
     def ask(
         self,
-        question: str,
+        question: Union[str, List[str], Tuple[str, ...]],
         min_confidence: float = 0.1,
         model: str = None,
         debug: bool = False,
         **kwargs,
-    ) -> Dict[str, Any]:
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """
         Ask a question about the page content using document QA.
         """
@@ -2824,3 +2824,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
         if not hasattr(self, "metadata") or self.metadata is None:
             self.metadata = {}
         self.metadata["analysis"] = value
+
+    def inspect(self, limit: int = 30) -> "InspectionSummary":
+        """
+        Inspect all elements on this page with detailed tabular view.
+        Equivalent to page.find_all('*').inspect().
+
+        Args:
+            limit: Maximum elements per type to show (default: 30)
+
+        Returns:
+            InspectionSummary with element tables showing coordinates,
+            properties, and other details for each element
+        """
+        return self.find_all('*').inspect(limit=limit)
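Two additions to Page shown above are easiest to see side by side: ask() now accepts a single question or a list/tuple of questions (returning one result dict or a list of them), and inspect() is a thin wrapper over page.find_all('*').inspect(). A minimal usage sketch (the file name is illustrative, and the keys inside each QA result dict are not shown in this diff):

    from natural_pdf import PDF

    pdf = PDF("invoice.pdf")  # hypothetical input file
    page = pdf.pages[0]

    # Single question -> one result dict; list of questions -> list of result dicts
    single = page.ask("What is the invoice number?")
    batch = page.ask(["What is the invoice number?", "What is the total amount?"])
    for result in batch:
        print(result)

    # Tabular overview of every element on the page, capped at 10 rows per element type
    summary = page.inspect(limit=10)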
@@ -269,7 +269,7 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
     base_columns = ['x0', 'top', 'x1', 'bottom']
 
     if element_type == 'word':
-        columns = ['text'] + base_columns + ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']
+        columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
         # Add color for text elements
         columns.append('color')
     elif element_type == 'rect':
@@ -315,6 +315,16 @@ def _extract_element_value(element: "Element", column: str) -> Any:
         # Fallback to fontname
         return getattr(element, 'fontname', '')
 
+    elif column == 'font_variant':
+        variant = getattr(element, 'font_variant', None)
+        if variant:
+            return variant
+        # Fallback – try to derive from fontname if property missing
+        fontname = getattr(element, 'fontname', '')
+        if "+" in fontname:
+            return fontname.split("+", 1)[0]
+        return ''
+
     elif column in ['bold', 'italic']:
         value = getattr(element, column, False)
         return value if isinstance(value, bool) else False
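The font_variant fallback derives the value from the subset prefix that PDF producers prepend to embedded subset fonts; the same derivation in isolation, using an illustrative font name:

    fontname = "ABCDEF+Helvetica"  # typical subset-font name found in a PDF
    variant = fontname.split("+", 1)[0] if "+" in fontname else ""
    # variant == "ABCDEF"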
@@ -128,6 +128,32 @@ class ElementSummary:
             ""
         ]
 
+    # Added for better VS Code and other frontends support
+    def _repr_html_(self) -> str:  # type: ignore
+        """Return HTML representation so rich rendering works in more frontends.
+
+        Many notebook frontends (including VS Code) give priority to the
+        ``_repr_html_`` method over Markdown. When available, we convert the
+        generated Markdown to HTML using the *markdown* library. If the
+        library is not installed we simply wrap the Markdown in a ``<pre>``
+        block so that at least the plain-text representation is visible.
+        """
+        md_source = self._to_markdown()
+        try:
+            import markdown as _markdown  # pylint: disable=import-error
+
+            # Convert markdown to HTML. We explicitly enable tables so the
+            # element and inspection summaries render nicely.
+            return _markdown.markdown(md_source, extensions=["tables"])
+        except Exception:  # noqa: BLE001, broad-except
+            # Fallback: present the Markdown as-is inside a <pre> block.
+            escaped = (
+                md_source.replace("&", "&amp;")
+                .replace("<", "&lt;")
+                .replace(">", "&gt;")
+            )
+            return f"<pre>{escaped}</pre>"
+
 
 class InspectionSummary(ElementSummary):
     """
@@ -174,8 +174,8 @@ class DirectionalMixin:
         # Adjust cross boundaries if cross_size is 'element'
         if cross_size == "element":
             if is_horizontal:  # Adjust y0, y1
-                y0 = min(y0, self.y0)
-                y1 = max(y1, self.y1)
+                y0 = min(y0, self.top)
+                y1 = max(y1, self.bottom)
             else:  # Adjust x0, x1
                 x0 = min(x0, self.x0)
                 x1 = max(x1, self.x1)
@@ -290,7 +290,13 @@ class ElementCollection(
 
         return ElementCollection(filtered)
 
-    def extract_text(self, preserve_whitespace=True, use_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        preserve_whitespace: bool = True,
+        use_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all TextElements in the collection, optionally using
         pdfplumber's layout engine if layout=True is specified.
@@ -303,6 +309,7 @@ class ElementCollection(
                 `chars_to_textmap` function ONLY if `layout=True` is passed.
                 See Page.extract_text docstring for common parameters.
                 If `layout=False` or omitted, performs a simple join.
+            strip: Whether to strip whitespace from the extracted text.
 
         Returns:
             Combined text from elements, potentially with layout-based spacing.
@@ -399,6 +406,12 @@ class ElementCollection(
             result = "".join(c.get("text", "") for c in all_char_dicts)
             # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
 
+        # Determine final strip flag – same rule as global helper unless caller overrides
+        strip_text = strip if strip is not None else (not use_layout)
+
+        if strip_text and isinstance(result, str):
+            result = "\n".join(line.rstrip() for line in result.splitlines()).strip()
+
         return result
 
     def filter(self, func: Callable[["Element"], bool]) -> "ElementCollection":
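The strip default follows the layout flag: plain joins are stripped, layout-preserving extraction is left untouched unless the caller overrides it. A short sketch against an arbitrary collection of text elements:

    words = page.find_all('text')

    words.extract_text()                         # simple join, stripped by default
    words.extract_text(layout=True)              # layout mode, left unstripped by default
    words.extract_text(layout=True, strip=True)  # explicit strip always wins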
@@ -1820,8 +1833,40 @@ class ElementCollection(
             # Mix object bounds with specific overrides
             clipped_elements = collection.clip(obj=container, bottom=page.height/2)
         """
+        # --- NEW BEHAVIOUR: support per-element clipping with sequences --- #
+        from collections.abc import Sequence  # Local import to avoid top-level issues
+
+        # Detect if *obj* is a sequence meant to map one-to-one with the elements
+        clip_objs = None  # type: Optional[List[Any]]
+        if isinstance(obj, ElementCollection):
+            clip_objs = obj.elements
+        elif isinstance(obj, Sequence) and not isinstance(obj, (str, bytes)):
+            clip_objs = list(obj)
+
+        if clip_objs is not None:
+            if len(clip_objs) != len(self._elements):
+                raise ValueError(
+                    f"Number of clipping objects ({len(clip_objs)}) does not match number of "
+                    f"elements in collection ({len(self._elements)})."
+                )
+
+            clipped_elements = [
+                el.clip(
+                    obj=clip_obj,
+                    left=left,
+                    top=top,
+                    right=right,
+                    bottom=bottom,
+                )
+                for el, clip_obj in zip(self._elements, clip_objs)
+            ]
+            return ElementCollection(clipped_elements)
+
+        # Fallback to original behaviour: apply same clipping parameters to all elements
         return self.apply(
-            lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
+            lambda element: element.clip(
+                obj=obj, left=left, top=top, right=right, bottom=bottom
+            )
         )
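clip() now also accepts an ElementCollection or a plain sequence holding exactly one clipping object per element in the collection; a hedged sketch, where column_regions is a hypothetical list of regions built elsewhere:

    words = page.find_all('text')

    # One clipping region per word; mismatched lengths raise ValueError
    clipped = words.clip(obj=column_regions)

    # Scalar overrides still combine with each per-element object
    clipped = words.clip(obj=column_regions, bottom=page.height / 2)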
@@ -1860,13 +1905,20 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         """Return a string representation showing the page count."""
         return f"<PageCollection(count={len(self)})>"
 
-    def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
+    def extract_text(
+        self,
+        keep_blank_chars: bool = True,
+        apply_exclusions: bool = True,
+        strip: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
         """
         Extract text from all pages in the collection.
 
         Args:
             keep_blank_chars: Whether to keep blank characters (default: True)
             apply_exclusions: Whether to apply exclusion regions (default: True)
+            strip: Whether to strip whitespace from the extracted text.
             **kwargs: Additional extraction parameters
 
         Returns:
@@ -1875,11 +1927,22 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         texts = []
         for page in self.pages:
             text = page.extract_text(
-                keep_blank_chars=keep_blank_chars, apply_exclusions=apply_exclusions, **kwargs
+                keep_blank_chars=keep_blank_chars,
+                apply_exclusions=apply_exclusions,
+                **kwargs,
             )
             texts.append(text)
 
-        return "\n".join(texts)
+        combined = "\n".join(texts)
+
+        # Default strip behaviour: if caller picks, honour; else respect layout flag passed via kwargs.
+        use_layout = kwargs.get("layout", False)
+        strip_final = strip if strip is not None else (not use_layout)
+
+        if strip_final:
+            combined = "\n".join(line.rstrip() for line in combined.splitlines()).strip()
+
+        return combined
 
     def apply_ocr(
         self,
@@ -2275,6 +2338,44 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
         # Generate sections
         sections = []
+
+        # --- Helper: build a FlowRegion spanning multiple pages ---
+        def _build_flow_region(start_el, end_el):
+            """Return a FlowRegion that covers from *start_el* to *end_el* (inclusive).
+            If *end_el* is None, the region continues to the bottom of the last
+            page in this PageCollection."""
+            # Local imports to avoid top-level cycles
+            from natural_pdf.elements.region import Region
+            from natural_pdf.flows.flow import Flow
+            from natural_pdf.flows.element import FlowElement
+            from natural_pdf.flows.region import FlowRegion
+
+            start_pg = start_el.page
+            end_pg = end_el.page if end_el is not None else self.pages[-1]
+
+            parts: list[Region] = []
+            # Slice of first page
+            parts.append(Region(start_pg, (0, start_el.top, start_pg.width, start_pg.height)))
+
+            # Full middle pages
+            for pg_idx in range(start_pg.index + 1, end_pg.index):
+                mid_pg = self.pages[pg_idx]
+                parts.append(Region(mid_pg, (0, 0, mid_pg.width, mid_pg.height)))
+
+            # Slice of last page (if distinct)
+            if end_pg is not start_pg:
+                bottom = end_el.bottom if end_el is not None else end_pg.height
+                parts.append(Region(end_pg, (0, 0, end_pg.width, bottom)))
+
+            flow = Flow(segments=parts, arrangement="vertical")
+            src_fe = FlowElement(physical_object=start_el, flow=flow)
+            return FlowRegion(flow=flow,
+                              constituent_regions=parts,
+                              source_flow_element=src_fe,
+                              boundary_element_found=end_el)
+
+        # ------------------------------------------------------------------
+
         current_start = None
 
         for i, boundary in enumerate(section_boundaries):
@@ -2295,50 +2396,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                     )
                     sections.append(section)
                 else:
-                    # Create a multi-page section
-                    from natural_pdf.elements.region import Region
-
-                    # Get the start and end pages
-                    start_page = start_element.page
-                    end_page = end_element.page
-
-                    # Create a combined region
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, end_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, end_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == end_page and e.bottom <= end_element.bottom
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                    # Create FlowRegion spanning pages
+                    flow_region = _build_flow_region(start_element, end_element)
+                    sections.append(flow_region)
 
                 current_start = None
 
@@ -2394,54 +2454,9 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
                 last_page_elements.sort(key=lambda e: (e.top, e.x0))
                 end_element = last_page_elements[-1] if last_page_elements else None
 
-                # Create a multi-page section
-                from natural_pdf.elements.region import Region
-
-                if start_page == last_page:
-                    # Simple case - both on same page
-                    section = start_page.get_section_between(
-                        start_element, end_element, boundary_inclusion
-                    )
-                    sections.append(section)
-                else:
-                    # Create a multi-page section
-                    combined_region = Region(
-                        start_page, (0, start_element.top, start_page.width, start_page.height)
-                    )
-                    combined_region._spans_pages = True
-                    combined_region._page_range = (start_page.index, last_page.index)
-                    combined_region.start_element = start_element
-                    combined_region.end_element = end_element
-
-                    # Get all elements that fall within this multi-page region
-                    combined_elements = []
-
-                    # Get elements from the first page
-                    first_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == start_page and e.top >= start_element.top
-                    ]
-                    combined_elements.extend(first_page_elements)
-
-                    # Get elements from middle pages (if any)
-                    for page_idx in range(start_page.index + 1, last_page.index):
-                        middle_page_elements = [e for e in all_elements if e.page.index == page_idx]
-                        combined_elements.extend(middle_page_elements)
-
-                    # Get elements from the last page
-                    last_page_elements = [
-                        e
-                        for e in all_elements
-                        if e.page == last_page
-                        and (end_element is None or e.bottom <= end_element.bottom)
-                    ]
-                    combined_elements.extend(last_page_elements)
-
-                    # Store the elements in the combined region
-                    combined_region._multi_page_elements = combined_elements
-
-                    sections.append(combined_region)
+                # Create FlowRegion spanning multiple pages using helper
+                flow_region = _build_flow_region(start_element, end_element)
+                sections.append(flow_region)
             else:
                 # With start_elements only, create a section to the end of the current page
                 from natural_pdf.elements.region import Region
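Sections that cross page boundaries are now returned as FlowRegion objects assembled from per-page Region slices, rather than a single Region carrying private _spans_pages state. A hedged sketch, assuming the enclosing method is the collection's get_sections() entry point (the public method name and the selector are not shown in this diff):

    sections = pdf.pages.get_sections(
        start_elements='text:contains("Section")',
        boundary_inclusion="start",
    )
    for section in sections:
        # Sections spanning several pages arrive as FlowRegion objects
        print(type(section).__name__)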
@@ -2629,13 +2644,13 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
 
     def to_image(
        self,
-        page_width: int = 300,
+        page_width: Optional[int] = None,
         cols: Optional[int] = 4,
         rows: Optional[int] = None,
         max_pages: Optional[int] = None,
         spacing: int = 10,
-        add_labels: bool = True,
-        show_category: bool = False,  # Add new flag
+        add_labels: bool = True,  # Add new flag
+        show_category: bool = False,
     ) -> Optional["Image.Image"]:
         """
         Generate a grid of page images for this collection.
@@ -2652,6 +2667,16 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
         Returns:
             PIL Image of the page grid or None if no pages
         """
+        # Determine default page width from global options if not explicitly provided
+        if page_width is None:
+            try:
+                import natural_pdf
+
+                page_width = natural_pdf.options.image.width or 300
+            except Exception:
+                # Fallback if natural_pdf import fails in some edge context
+                page_width = 300
+
         # Ensure PIL is imported, handle potential ImportError if not done globally/lazily
         try:
             from PIL import Image, ImageDraw, ImageFont
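With page_width defaulting to None, the grid cell width can now be set once through the global options object referenced above; a short sketch:

    import natural_pdf

    natural_pdf.options.image.width = 400  # picked up whenever page_width is None
    grid = pdf.pages.to_image(cols=3)

    # An explicit argument still takes precedence over the global option
    grid = pdf.pages.to_image(page_width=250, cols=3)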
@@ -2949,3 +2974,17 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
             # Re-raise the exception caught from the exporter
             raise e  # Keep the original exception type (ValueError, RuntimeError, etc.)
         # <--- END MODIFIED
+
+    # Alias .to_image() to .show() for convenience
+    def show(
+        self,
+        *args,
+        **kwargs,
+    ) -> Optional["Image.Image"]:
+        """Display pages similarly to ``to_image``.
+
+        This is a thin wrapper around :py:meth:`to_image` so that the API mirrors
+        ElementCollection, where ``show()`` already exists. It forwards all
+        arguments and returns the resulting ``PIL.Image`` instance.
+        """
+        return self.to_image(*args, **kwargs)
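Because show() simply forwards to to_image(), the two calls below are interchangeable on a page collection:

    img = pdf.pages.show(cols=4, max_pages=8, spacing=10)
    img = pdf.pages.to_image(cols=4, max_pages=8, spacing=10)  # equivalent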