natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 
-
 # --- Base Options ---
 @dataclass
 class BaseOCROptions:
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
     output_format: str = "standard"
 
 
-
 # --- PaddleOCR Specific Options ---
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
natural_pdf/ocr/utils.py CHANGED
@@ -90,7 +90,8 @@ def direct_ocr_llm(
     buffered = io.BytesIO()
     # Use the global PDF render lock when rendering images
     with pdf_render_lock:
-        region_img = region.to_image(resolution=resolution, include_highlights=False)
+        # Use render() for clean image without highlights
+        region_img = region.render(resolution=resolution)
 
     # Handle cases where image creation might fail (e.g., zero-dim region)
     if region_img is None:
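These call sites illustrate the 0.2.0 rendering split: render() now returns a clean raster, while highlight-aware output stays on the to_image() path. A minimal, hedged sketch of the new call from user code (the file name is illustrative):

    from natural_pdf import PDF

    pdf = PDF("invoice.pdf")  # illustrative path
    page = pdf.pages[0]

    # Clean page raster with no highlight overlays, rendered at 300 DPI
    img = page.render(resolution=300)
    img.save("page_clean.png")

The same render(resolution=...) call works on regions, as the direct_ocr_llm hunk above shows.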
natural_pdf/qa/document_qa.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image, ImageDraw
 
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.element_collection import ElementCollection
 
 from .qa_result import QAResult
 
@@ -63,8 +63,22 @@ class DocumentQA:
 
         logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
 
-        # Initialize the pipeline
-        self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+        # Try MPS, fallback to CPU if OOM
+        if device is None and torch.backends.mps.is_available():
+            try:
+                self.pipe = pipeline(
+                    "document-question-answering", model=model_name, device="mps"
+                )
+                self.device = "mps"
+            except RuntimeError as e:
+                logger.warning(f"MPS OOM: {e}, falling back to CPU")
+                self.pipe = pipeline(
+                    "document-question-answering", model=model_name, device="cpu"
+                )
+                self.device = "cpu"
+        else:
+            self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+            self.device = device
 
         self.model_name = model_name
         self.device = device
@@ -356,7 +370,8 @@ class DocumentQA:
            temp_path = temp_file.name
 
        # Save a high resolution image (300 DPI)
-        page_image = page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = page.render(resolution=300)
        page_image.save(temp_path)
 
        try:
@@ -470,7 +485,8 @@ class DocumentQA:
            temp_path = temp_file.name
 
        # Get page image at high resolution - this returns a PIL Image directly
-        page_image = region.page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = region.page.render(resolution=300)
 
        # Crop to region
        x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
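For orientation, a hedged sketch of how this QA machinery is typically reached from the high-level API. The page.ask() helper and the question text are assumptions, not taken from this diff; device selection (MPS with CPU fallback) happens inside DocumentQA.__init__ as shown above.

    from natural_pdf import PDF

    pdf = PDF("report.pdf")  # illustrative path
    page = pdf.pages[0]

    # page.ask() is assumed to delegate to DocumentQA under the hood
    result = page.ask("What is the invoice total?")
    print(result)  # QAResult carrying the answer and a confidence score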
natural_pdf/search/search_service_protocol.py CHANGED
@@ -49,7 +49,7 @@ class Indexable(Protocol):
        """
        Return the primary content of this item.
        The SearchService implementation will determine how to process this content
-        (e.g., call .extract_text(), .to_image(), or handle directly).
+        (e.g., call .extract_text(), .render(), or handle directly).
        """
        ...
 
natural_pdf/selectors/parser.py CHANGED
@@ -24,7 +24,7 @@ This enables powerful document navigation like:
 - page.find('text[size>12]:bold:contains("Summary")')
 - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
 - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
-- page.find('text:regex("[\u2500-\u257F]")') # Box drawing characters
+- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
 """
 
 import ast
@@ -101,6 +101,12 @@ def safe_parse_color(value_str: str) -> tuple:
     """
     value_str = value_str.strip()
 
+    # Strip quotes first if it's a quoted string (same logic as safe_parse_value)
+    if (value_str.startswith('"') and value_str.endswith('"')) or (
+        value_str.startswith("'") and value_str.endswith("'")
+    ):
+        value_str = value_str[1:-1]
+
     # Try parsing as a Python literal (for RGB tuples)
     try:
         # If it's already a valid tuple or list, parse it
@@ -504,6 +510,21 @@ def _is_approximate_match(value1, value2) -> bool:
     return value1 == value2
 
 
+def _is_exact_color_match(value1, value2) -> bool:
+    """
+    Check if two color values match exactly (with small tolerance for color variations).
+
+    For colors: Uses Delta E color difference with strict tolerance of 2.0
+    For non-colors: Falls back to exact equality
+    """
+    # First check if both values are colors
+    if _is_color_value(value1) and _is_color_value(value2):
+        return _color_distance(value1, value2) <= 2.0
+
+    # Default to exact match for non-colors
+    return value1 == value2
+
+
 PSEUDO_CLASS_FUNCTIONS = {
     "bold": lambda el: hasattr(el, "bold") and el.bold,
     "italic": lambda el: hasattr(el, "italic") and el.italic,
@@ -603,7 +624,19 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
 
         # Determine compare_func based on op (reuse existing logic)
         if op == "=":
-            compare_func = lambda el_val, sel_val: el_val == sel_val
+            # For color attributes, use exact color matching with small tolerance
+            if name in [
+                "color",
+                "non_stroking_color",
+                "fill",
+                "stroke",
+                "strokeColor",
+                "fillColor",
+            ]:
+                op_desc = f"= {value!r} (exact color)"
+                compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
+            else:
+                compare_func = lambda el_val, sel_val: el_val == sel_val
         elif op == "!=":
             compare_func = lambda el_val, sel_val: el_val != sel_val
         elif op == "~=":
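Taken together, these parser changes mean quoted attribute values are unquoted before color parsing, and the = operator compares colors with a strict Delta E tolerance (<= 2.0) instead of raw equality. A hedged sketch of how that surfaces in selectors; the attribute names come from the list in the hunk above, but whether a given element type exposes each of them is not confirmed here:

    # Approximate color match (~=), as in the module docstring example
    totals_boxes = page.find_all('rect[color~="red"]')

    # Exact color match (=) now tolerates tiny numeric differences (Delta E <= 2.0)
    pure_red_text = page.find_all('text[color=(1, 0, 0)]')

    # Quoted color values are stripped of their quotes before parsing
    red_fills = page.find_all('rect[fill="red"]')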
natural_pdf/tables/result.py CHANGED
@@ -39,7 +39,13 @@ class TableResult(Sequence):
         """Quick property alias → calls :py:meth:`to_df` with default args."""
         return self.to_df()
 
-    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
+    def to_df(
+        self,
+        header: Union[str, int, List[int], None] = "first",
+        index_col=None,
+        skip_repeating_headers=None,
+        **kwargs,
+    ):
         """Convert to *pandas* DataFrame.
 
         Parameters
@@ -47,6 +53,10 @@ class TableResult(Sequence):
         header : "first" | int | list[int] | None, default "first"
             • "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
         index_col : same semantics as pandas, forwarded.
+        skip_repeating_headers : bool, optional
+            Whether to remove body rows that exactly match the header row(s).
+            Defaults to True when header is truthy, False otherwise.
+            Useful for PDFs where headers repeat throughout the table body.
         **kwargs : forwarded to :pyclass:`pandas.DataFrame`.
         """
         try:
@@ -60,6 +70,10 @@ class TableResult(Sequence):
         if not rows:
             return pd.DataFrame()
 
+        # Determine default for skip_repeating_headers based on header parameter
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header is not None and header is not False
+
         # Determine header rows and body rows
         body = rows
         hdr = None
@@ -78,6 +92,26 @@ class TableResult(Sequence):
         else:
             raise ValueError("Invalid value for header parameter")
 
+        # Skip repeating headers in body if requested
+        if skip_repeating_headers and hdr is not None and body:
+            original_body_len = len(body)
+            if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
+                # Single header row (most common case)
+                body = [row for row in body if row != hdr]
+            elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
+                # Multi-row header (less common)
+                hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
+                body = [
+                    row
+                    for row in body
+                    if (tuple(row) if isinstance(row, list) else row) not in hdr_set
+                ]
+
+            skipped_count = original_body_len - len(body)
+            if skipped_count > 0:
+                # Could add logging here if desired
+                pass
+
         df = pd.DataFrame(body, columns=hdr)
         if index_col is not None and not df.empty:
             df.set_index(
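A brief, hedged usage sketch of the new skip_repeating_headers option. The PDF path is illustrative, and extract_table() is assumed to return the TableResult shown above:

    from natural_pdf import PDF

    pdf = PDF("long_report.pdf")  # illustrative path
    table = pdf.pages[0].extract_table()  # assumed to return a TableResult

    # Default: with header="first", body rows that exactly repeat the header are dropped
    df = table.to_df()

    # Opt out and keep every body row, even exact copies of the header
    df_raw = table.to_df(skip_repeating_headers=False)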
natural_pdf/text_mixin.py ADDED
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+import logging
+from typing import Any, Callable, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class TextMixin:  # pylint: disable=too-few-public-methods
+    """Mixin that adds general text-replacement capabilities.
+
+    Two public entry points are exposed to any class that inherits this mix-in:
+
+    1. ``update_text`` (preferred) – iterate over text elements selected via the
+       ``selector`` argument (default: ``"text"``) and apply a *correction* callback
+       which optionally returns replacement text. If the callback returns a
+       non-``None`` string that differs from the current value, the element's
+       ``text`` attribute is updated in-place.
+
+    2. ``correct_ocr`` – legacy name kept for backward compatibility. It simply
+       forwards to :py:meth:`update_text` while forcing
+       ``selector="text[source=ocr]"`` so that the historic behaviour (acting only
+       on OCR-generated elements) is preserved.
+    """
+
+    # ---------------------------------------------------------------------
+    # Back-compat shim
+    # ---------------------------------------------------------------------
+    def correct_ocr(self, *args, selector: str = "text[source=ocr]", **kwargs):  # type: ignore[override]
+        """Backward-compatibility wrapper that forwards to *update_text*.
+
+        Parameters
+        ----------
+        *args, **kwargs
+            Forwarded verbatim to :py:meth:`update_text` (after injecting the
+            ``selector`` default shown above).
+        """
+
+        # Delegate – subclasses may have overridden *update_text* with a richer
+        # signature so we pass everything through untouched.
+        return self.update_text(*args, selector=selector, **kwargs)  # type: ignore[arg-type]
+
+    # ------------------------------------------------------------------
+    # Generic fallback implementation
+    # ------------------------------------------------------------------
+    def update_text(  # type: ignore[override]
+        self,
+        transform: Callable[[Any], Optional[str]],
+        *,
+        selector: str = "text",
+        apply_exclusions: bool = False,
+        **_,
+    ):
+        """Generic implementation that works for any object exposing *find_all*.
+
+        Classes that require more sophisticated behaviour (parallelism, page
+        delegation, etc.) are expected to *override* this method while keeping
+        the same public contract.
+        """
+
+        if not callable(transform):
+            raise TypeError("transform must be callable")
+
+        # We rely on the presence of *find_all* to obtain elements. If the
+        # subclass does not implement it then it *must* override update_text.
+        if not hasattr(self, "find_all"):
+            raise NotImplementedError(
+                f"{self.__class__.__name__} must implement `update_text` explicitly "
+                "(no `find_all` method found)."
+            )
+
+        try:
+            elements_collection = self.find_all(
+                selector=selector, apply_exclusions=apply_exclusions
+            )
+        except Exception as exc:  # pragma: no cover – defensive
+            raise RuntimeError(
+                f"Failed to gather elements with selector '{selector}': {exc}"
+            ) from exc
+
+        # `find_all` returns an ElementCollection; fall back gracefully otherwise.
+        elements_iter = getattr(elements_collection, "elements", elements_collection)
+        updated = 0
+
+        for element in elements_iter:
+            if not hasattr(element, "text"):
+                continue
+
+            new_text = transform(element)
+            if new_text is not None and isinstance(new_text, str) and new_text != element.text:
+                element.text = new_text
+                updated += 1
+
+        logger.info(
+            "%s.update_text – processed %d element(s); updated %d.",
+            self.__class__.__name__,
+            len(elements_iter),
+            updated,
+        )
+
+        return self
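A short, hedged sketch of calling update_text on an object that inherits TextMixin. Page is assumed to be one such class, and the normalization itself is purely illustrative; returning None from the callback leaves an element untouched, per the code above:

    from natural_pdf import PDF

    pdf = PDF("scanned.pdf")  # illustrative path
    page = pdf.pages[0]

    # Uppercase every OCR-derived text element
    page.update_text(
        lambda el: el.text.upper() if el.text else None,
        selector="text[source=ocr]",
    )

    # Legacy spelling; forwards to update_text with selector="text[source=ocr]"
    page.correct_ocr(lambda el: el.text.strip() if el.text else None)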
natural_pdf/utils/debug.py CHANGED
@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
     """Generate a base64 encoded image of the page."""
     # Create a clean image of the page without highlights for the base background
     # Use a fixed scale consistent with the HTML/JS rendering logic
-    img = page.to_image(scale=2.0, include_highlights=False)
+    # Use render() for clean image without highlights
+    img = page.render(resolution=144)
     if img is None:
         raise ValueError(f"Failed to render image for page {page.number}")
 
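The switch from scale=2.0 to resolution=144 is a unit change rather than a behavior change: PDF user space is 72 points per inch, so a 2x scale corresponds to 144 DPI. A tiny sketch of that conversion; the helper name is made up for illustration:

    PDF_BASE_DPI = 72  # PDF user space is defined at 72 points per inch

    def scale_to_resolution(scale: float) -> int:
        """Map a legacy to_image(scale=...) factor onto a render(resolution=...) DPI value."""
        return round(PDF_BASE_DPI * scale)

    assert scale_to_resolution(2.0) == 144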
natural_pdf/utils/highlighting.py CHANGED
@@ -7,6 +7,7 @@ The main highlighting logic is now centralized in `natural_pdf.core.highlighting
 
 # Re-export necessary functions from visualization
 from .visualization import (
+    create_colorbar,
     create_legend,
     get_next_highlight_color,
     merge_images_with_legend,
natural_pdf/utils/layout.py CHANGED
@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
 
 
 def merge_bboxes(
-    bboxes: List[Optional[Tuple[float, float, float, float]]]
+    bboxes: List[Optional[Tuple[float, float, float, float]]],
 ) -> Optional[Tuple[float, float, float, float]]:
     """
     Merge multiple bounding boxes into a single one that encompasses all of them.
@@ -23,4 +23,4 @@ def merge_bboxes(
 
     x0s, tops, x1s, bottoms = zip(*valid_bboxes)
 
-    return (min(x0s), min(tops), max(x1s), max(bottoms))
+    return (min(x0s), min(tops), max(x1s), max(bottoms))
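A quick, hedged illustration of merge_bboxes based on the signature above. The sample coordinates are made up, and None entries appear to be tolerated and skipped (the body filters down to valid_bboxes before merging):

    from natural_pdf.utils.layout import merge_bboxes

    boxes = [
        (10.0, 10.0, 50.0, 50.0),
        (40.0, 5.0, 80.0, 60.0),
        None,  # missing boxes are ignored
    ]

    # Expected result: the enclosing box (10.0, 5.0, 80.0, 60.0)
    print(merge_bboxes(boxes))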
natural_pdf/utils/packaging.py CHANGED
@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
 
 # Import the specific PDF/Page types if possible, otherwise use Any
 if TYPE_CHECKING:
-    from natural_pdf.collections.pdf_collection import PDFCollection
     from natural_pdf.core.page import Page
     from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf_collection import PDFCollection
 else:
     PDF = Any
     Page = Any
@@ -145,9 +145,10 @@ def create_correction_task_package(
         image_filename = f"{pdf_short_id}_page_{page.index}.png"
         image_save_path = os.path.join(images_dir, image_filename)
         try:
-            img = page.to_image(resolution=resolution, include_highlights=False)
+            # Use render() for clean image without highlights
+            img = page.render(resolution=resolution)
            if img is None:
-                raise ValueError("page.to_image returned None")
+                raise ValueError("page.render returned None")
            img.save(image_save_path, "PNG")
         except Exception as e:
             logger.error(
natural_pdf/utils/text_extraction.py CHANGED
@@ -175,28 +175,27 @@ def filter_chars_spatially(
 
 
 def _apply_content_filter(
-    char_dicts: List[Dict[str, Any]],
-    content_filter: Union[str, Callable[[str], bool], List[str]]
+    char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
 ) -> List[Dict[str, Any]]:
     """
     Applies content filtering to character dictionaries based on their text content.
-
+
     Args:
         char_dicts: List of character dictionaries to filter.
         content_filter: Can be:
            - A regex pattern string (characters matching the pattern are EXCLUDED)
            - A callable that takes text and returns True to KEEP the character
            - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
-
+
     Returns:
        Filtered list of character dictionaries.
     """
     if not char_dicts or content_filter is None:
         return char_dicts
-
+
     initial_count = len(char_dicts)
     filtered_chars = []
-
+
     # Handle different filter types
     if isinstance(content_filter, str):
         # Single regex pattern - exclude matching characters
@@ -207,9 +206,11 @@ def _apply_content_filter(
             if not pattern.search(text):
                 filtered_chars.append(char_dict)
         except re.error as e:
-            logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
+            logger.warning(
+                f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
+            )
             return char_dicts
-
+
     elif isinstance(content_filter, list):
         # List of regex patterns - exclude characters matching ANY pattern
         try:
@@ -221,7 +222,7 @@ def _apply_content_filter(
         except re.error as e:
             logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
             return char_dicts
-
+
     elif callable(content_filter):
         # Callable filter - keep characters where function returns True
         try:
@@ -233,13 +234,15 @@ def _apply_content_filter(
             logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
             return char_dicts
     else:
-        logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
+        logger.warning(
+            f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
+        )
         return char_dicts
-
+
     filtered_count = initial_count - len(filtered_chars)
     if filtered_count > 0:
         logger.debug(f"Content filter removed {filtered_count} characters.")
-
+
     return filtered_chars
 
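A hedged sketch of how a content filter like the ones handled above might be supplied by a caller. Whether extract_text() exposes a content_filter keyword is an assumption here; the semantics follow the docstring: regex patterns exclude matching characters, callables keep characters for which they return True:

    from natural_pdf import PDF

    pdf = PDF("form.pdf")  # illustrative path
    page = pdf.pages[0]

    # Assumed keyword: exclude characters matching a regex (strip digits)
    no_digits = page.extract_text(content_filter=r"[0-9]")

    # Callable form: keep only printable characters outside the box-drawing range
    clean = page.extract_text(
        content_filter=lambda ch: ch.isprintable() and not ("\u2500" <= ch <= "\u257f")
    )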