PyPI - natural-pdf - Versions diffs - 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

natural_pdf/__init__.py +6 -7
natural_pdf/analyzers/__init__.py +6 -1
natural_pdf/analyzers/guides.py +354 -258
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +18 -4
natural_pdf/analyzers/layout/paddle.py +11 -0
natural_pdf/analyzers/layout/surya.py +2 -3
natural_pdf/analyzers/shape_detection_mixin.py +25 -34
natural_pdf/analyzers/text_structure.py +2 -2
natural_pdf/classification/manager.py +1 -1
natural_pdf/collections/mixins.py +3 -2
natural_pdf/core/highlighting_service.py +743 -32
natural_pdf/core/page.py +236 -383
natural_pdf/core/page_collection.py +1249 -0
natural_pdf/core/pdf.py +172 -83
natural_pdf/{collections → core}/pdf_collection.py +18 -11
natural_pdf/core/render_spec.py +335 -0
natural_pdf/describe/base.py +1 -1
natural_pdf/elements/__init__.py +1 -0
natural_pdf/elements/base.py +108 -83
natural_pdf/elements/{collections.py → element_collection.py} +566 -1487
natural_pdf/elements/line.py +0 -1
natural_pdf/elements/rect.py +0 -1
natural_pdf/elements/region.py +318 -243
natural_pdf/elements/text.py +9 -7
natural_pdf/exporters/base.py +2 -2
natural_pdf/exporters/original_pdf.py +1 -1
natural_pdf/exporters/paddleocr.py +2 -4
natural_pdf/exporters/searchable_pdf.py +3 -2
natural_pdf/extraction/mixin.py +1 -3
natural_pdf/flows/collections.py +1 -69
natural_pdf/flows/element.py +4 -4
natural_pdf/flows/flow.py +1200 -243
natural_pdf/flows/region.py +707 -261
natural_pdf/ocr/ocr_options.py +0 -2
natural_pdf/ocr/utils.py +2 -1
natural_pdf/qa/document_qa.py +21 -5
natural_pdf/search/search_service_protocol.py +1 -1
natural_pdf/selectors/parser.py +2 -2
natural_pdf/tables/result.py +35 -1
natural_pdf/text_mixin.py +7 -3
natural_pdf/utils/debug.py +2 -1
natural_pdf/utils/highlighting.py +1 -0
natural_pdf/utils/layout.py +2 -2
natural_pdf/utils/packaging.py +4 -3
natural_pdf/utils/text_extraction.py +15 -12
natural_pdf/utils/visualization.py +385 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -53
optimization/memory_comparison.py +1 -1
optimization/pdf_analyzer.py +2 -2
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.40.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0

natural_pdf/ocr/ocr_options.py CHANGED Viewed

@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Tuple, Union
 # --- Base Options ---
 @dataclass
 class BaseOCROptions:
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
     output_format: str = "standard"
 # --- PaddleOCR Specific Options ---
 @dataclass
 class PaddleOCROptions(BaseOCROptions):

natural_pdf/ocr/utils.py CHANGED Viewed

@@ -90,7 +90,8 @@ def direct_ocr_llm(
     buffered = io.BytesIO()
     # Use the global PDF render lock when rendering images
     with pdf_render_lock:
-        region_img = region.to_image(resolution=resolution, include_highlights=False)
+        # Use render() for clean image without highlights
+        region_img = region.render(resolution=resolution)
     # Handle cases where image creation might fail (e.g., zero-dim region)
     if region_img is None:

natural_pdf/qa/document_qa.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 import numpy as np
 from PIL import Image, ImageDraw
-from natural_pdf.elements.collections import ElementCollection
+from natural_pdf.elements.element_collection import ElementCollection
 from .qa_result import QAResult
@@ -63,8 +63,22 @@ class DocumentQA:
             logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
-            # Initialize the pipeline
-            self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+            # Try MPS, fallback to CPU if OOM
+            if device is None and torch.backends.mps.is_available():
+                try:
+                    self.pipe = pipeline(
+                        "document-question-answering", model=model_name, device="mps"
+                    )
+                    self.device = "mps"
+                except RuntimeError as e:
+                    logger.warning(f"MPS OOM: {e}, falling back to CPU")
+                    self.pipe = pipeline(
+                        "document-question-answering", model=model_name, device="cpu"
+                    )
+                    self.device = "cpu"
+            else:
+                self.pipe = pipeline("document-question-answering", model=model_name, device=device)
+                self.device = device
             self.model_name = model_name
             self.device = device
@@ -356,7 +370,8 @@ class DocumentQA:
             temp_path = temp_file.name
         # Save a high resolution image (300 DPI)
-        page_image = page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = page.render(resolution=300)
         page_image.save(temp_path)
         try:
@@ -470,7 +485,8 @@ class DocumentQA:
             temp_path = temp_file.name
         # Get page image at high resolution - this returns a PIL Image directly
-        page_image = region.page.to_image(resolution=300, include_highlights=False)
+        # Use render() for clean image without highlights
+        page_image = region.page.render(resolution=300)
         # Crop to region
         x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)

natural_pdf/search/search_service_protocol.py CHANGED Viewed

@@ -49,7 +49,7 @@ class Indexable(Protocol):
         """
         Return the primary content of this item.
         The SearchService implementation will determine how to process this content
-        (e.g., call .extract_text(), .to_image(), or handle directly).
+        (e.g., call .extract_text(), .render(), or handle directly).
         """
         ...

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -24,7 +24,7 @@ This enables powerful document navigation like:
 - page.find('text[size>12]:bold:contains("Summary")')
 - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
 - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
-- page.find('text:regex("[\u2500-\u257F]")')  # Box drawing characters
+- page.find('text:regex("[\u2500-\u257f]")')  # Box drawing characters
 """
 import ast
@@ -100,7 +100,7 @@ def safe_parse_color(value_str: str) -> tuple:
         ValueError: If the color cannot be parsed
     """
     value_str = value_str.strip()
     # Strip quotes first if it's a quoted string (same logic as safe_parse_value)
     if (value_str.startswith('"') and value_str.endswith('"')) or (
         value_str.startswith("'") and value_str.endswith("'")

natural_pdf/tables/result.py CHANGED Viewed

@@ -39,7 +39,13 @@ class TableResult(Sequence):
         """Quick property alias → calls :py:meth:`to_df` with default args."""
         return self.to_df()
-    def to_df(self, header: Union[str, int, List[int], None] = "first", index_col=None, **kwargs):
+    def to_df(
+        self,
+        header: Union[str, int, List[int], None] = "first",
+        index_col=None,
+        skip_repeating_headers=None,
+        **kwargs,
+    ):
         """Convert to *pandas* DataFrame.
         Parameters
@@ -47,6 +53,10 @@ class TableResult(Sequence):
         header : "first" | int | list[int] | None, default "first"
             • "first" – use row 0 as column names.\n            • int       – use that row index.\n            • list[int] – multi-row header.\n            • None/False– no header.
         index_col : same semantics as pandas, forwarded.
+        skip_repeating_headers : bool, optional
+            Whether to remove body rows that exactly match the header row(s).
+            Defaults to True when header is truthy, False otherwise.
+            Useful for PDFs where headers repeat throughout the table body.
         **kwargs  : forwarded to :pyclass:`pandas.DataFrame`.
         """
         try:
@@ -60,6 +70,10 @@ class TableResult(Sequence):
         if not rows:
             return pd.DataFrame()
+        # Determine default for skip_repeating_headers based on header parameter
+        if skip_repeating_headers is None:
+            skip_repeating_headers = header is not None and header is not False
         # Determine header rows and body rows
         body = rows
         hdr = None
@@ -78,6 +92,26 @@ class TableResult(Sequence):
         else:
             raise ValueError("Invalid value for header parameter")
+        # Skip repeating headers in body if requested
+        if skip_repeating_headers and hdr is not None and body:
+            original_body_len = len(body)
+            if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
+                # Single header row (most common case)
+                body = [row for row in body if row != hdr]
+            elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
+                # Multi-row header (less common)
+                hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
+                body = [
+                    row
+                    for row in body
+                    if (tuple(row) if isinstance(row, list) else row) not in hdr_set
+                ]
+            skipped_count = original_body_len - len(body)
+            if skipped_count > 0:
+                # Could add logging here if desired
+                pass
         df = pd.DataFrame(body, columns=hdr)
         if index_col is not None and not df.empty:
             df.set_index(

natural_pdf/text_mixin.py CHANGED Viewed

@@ -70,9 +70,13 @@ class TextMixin:  # pylint: disable=too-few-public-methods
             )
         try:
-            elements_collection = self.find_all(selector=selector, apply_exclusions=apply_exclusions)
+            elements_collection = self.find_all(
+                selector=selector, apply_exclusions=apply_exclusions
+            )
         except Exception as exc:  # pragma: no cover – defensive
-            raise RuntimeError(f"Failed to gather elements with selector '{selector}': {exc}") from exc
+            raise RuntimeError(
+                f"Failed to gather elements with selector '{selector}': {exc}"
+            ) from exc
         # `find_all` returns an ElementCollection; fall back gracefully otherwise.
         elements_iter = getattr(elements_collection, "elements", elements_collection)
@@ -94,4 +98,4 @@ class TextMixin:  # pylint: disable=too-few-public-methods
             updated,
         )
-        return self
+        return self

natural_pdf/utils/debug.py CHANGED Viewed

@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
     """Generate a base64 encoded image of the page."""
     # Create a clean image of the page without highlights for the base background
     # Use a fixed scale consistent with the HTML/JS rendering logic
-    img = page.to_image(scale=2.0, include_highlights=False)
+    # Use render() for clean image without highlights
+    img = page.render(resolution=144)
     if img is None:
         raise ValueError(f"Failed to render image for page {page.number}")

natural_pdf/utils/highlighting.py CHANGED Viewed

@@ -7,6 +7,7 @@ The main highlighting logic is now centralized in `natural_pdf.core.highlighting
 # Re-export necessary functions from visualization
 from .visualization import (
+    create_colorbar,
     create_legend,
     get_next_highlight_color,
     merge_images_with_legend,

natural_pdf/utils/layout.py CHANGED Viewed

@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
 def merge_bboxes(
-    bboxes: List[Optional[Tuple[float, float, float, float]]]
+    bboxes: List[Optional[Tuple[float, float, float, float]]],
 ) -> Optional[Tuple[float, float, float, float]]:
     """
     Merge multiple bounding boxes into a single one that encompasses all of them.
@@ -23,4 +23,4 @@ def merge_bboxes(
     x0s, tops, x1s, bottoms = zip(*valid_bboxes)
-    return (min(x0s), min(tops), max(x1s), max(bottoms))
+    return (min(x0s), min(tops), max(x1s), max(bottoms))

natural_pdf/utils/packaging.py CHANGED Viewed

@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
 # Import the specific PDF/Page types if possible, otherwise use Any
 if TYPE_CHECKING:
-    from natural_pdf.collections.pdf_collection import PDFCollection
     from natural_pdf.core.page import Page
     from natural_pdf.core.pdf import PDF
+    from natural_pdf.core.pdf_collection import PDFCollection
 else:
     PDF = Any
     Page = Any
@@ -145,9 +145,10 @@ def create_correction_task_package(
                 image_filename = f"{pdf_short_id}_page_{page.index}.png"
                 image_save_path = os.path.join(images_dir, image_filename)
                 try:
-                    img = page.to_image(resolution=resolution, include_highlights=False)
+                    # Use render() for clean image without highlights
+                    img = page.render(resolution=resolution)
                     if img is None:
-                        raise ValueError("page.to_image returned None")
+                        raise ValueError("page.render returned None")
                     img.save(image_save_path, "PNG")
                 except Exception as e:
                     logger.error(

natural_pdf/utils/text_extraction.py CHANGED Viewed

@@ -175,28 +175,27 @@ def filter_chars_spatially(
 def _apply_content_filter(
-    char_dicts: List[Dict[str, Any]],
-    content_filter: Union[str, Callable[[str], bool], List[str]]
+    char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
 ) -> List[Dict[str, Any]]:
     """
     Applies content filtering to character dictionaries based on their text content.
     Args:
         char_dicts: List of character dictionaries to filter.
         content_filter: Can be:
             - A regex pattern string (characters matching the pattern are EXCLUDED)
             - A callable that takes text and returns True to KEEP the character
             - A list of regex patterns (characters matching ANY pattern are EXCLUDED)
     Returns:
         Filtered list of character dictionaries.
     """
     if not char_dicts or content_filter is None:
         return char_dicts
     initial_count = len(char_dicts)
     filtered_chars = []
     # Handle different filter types
     if isinstance(content_filter, str):
         # Single regex pattern - exclude matching characters
@@ -207,9 +206,11 @@ def _apply_content_filter(
                 if not pattern.search(text):
                     filtered_chars.append(char_dict)
         except re.error as e:
-            logger.warning(f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering.")
+            logger.warning(
+                f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
+            )
             return char_dicts
     elif isinstance(content_filter, list):
         # List of regex patterns - exclude characters matching ANY pattern
         try:
@@ -221,7 +222,7 @@ def _apply_content_filter(
         except re.error as e:
             logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
             return char_dicts
     elif callable(content_filter):
         # Callable filter - keep characters where function returns True
         try:
@@ -233,13 +234,15 @@ def _apply_content_filter(
             logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
             return char_dicts
     else:
-        logger.warning(f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering.")
+        logger.warning(
+            f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
+        )
         return char_dicts
     filtered_count = initial_count - len(filtered_chars)
     if filtered_count > 0:
         logger.debug(f"Content filter removed {filtered_count} characters.")
     return filtered_chars

natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl

natural-pdf 0.1.40__py3-none-any.whl → 0.2.0__py3-none-any.whl