PyPI - natural-pdf - Versions diffs - 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl - Mend

natural-pdf: natural_pdf-0.2.16-py3-none-any.whl → natural_pdf-0.2.18-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

natural_pdf/__init__.py +45 -0
natural_pdf/analyzers/guides.py +359 -0
natural_pdf/core/element_manager.py +4 -0
natural_pdf/core/page.py +130 -31
natural_pdf/core/page_collection.py +75 -0
natural_pdf/core/pdf.py +33 -0
natural_pdf/describe/base.py +48 -7
natural_pdf/elements/base.py +408 -43
natural_pdf/elements/element_collection.py +83 -10
natural_pdf/elements/region.py +217 -178
natural_pdf/elements/text.py +5 -3
natural_pdf/flows/element.py +1 -0
natural_pdf/flows/flow.py +175 -480
natural_pdf/flows/region.py +76 -0
natural_pdf/selectors/parser.py +180 -9
natural_pdf/utils/pdfminer_patches.py +136 -0
natural_pdf/utils/sections.py +346 -0
natural_pdf/utils/spatial.py +172 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/METADATA +1 -1
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/RECORD +24 -21
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/WHEEL +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.2.16.dist-info → natural_pdf-0.2.18.dist-info}/top_level.txt +0 -0

natural_pdf/flows/region.py CHANGED Viewed

@@ -1191,6 +1191,82 @@ class FlowRegion(Visualizable):
         return all_tables
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        new_section_on_page_break: bool = False,
+        include_boundaries: str = "both",
+        orientation: str = "vertical",
+    ) -> "ElementCollection":
+        """
+        Split this FlowRegion into logical sections bounded by start/end elements.
+
+        A temporary Flow is built whose segments are this region's constituent
+        regions, reusing the arrangement, alignment and segment gap of
+        ``self.flow``. The actual sectioning is done by that temporary Flow's
+        ``get_sections()``, so only the parts of the flow covered by this
+        FlowRegion are considered.
+
+        Args:
+            start_elements: Elements or selector string that mark the start of sections
+            end_elements: Elements or selector string that mark the end of sections
+            new_section_on_page_break: Whether to start a new section at page boundaries
+            include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
+            orientation: 'vertical' (default) or 'horizontal' - determines section direction
+
+        Returns:
+            ElementCollection of FlowRegion objects representing the extracted sections
+
+        Example:
+            # Split a multi-page table region by headers
+            table_region = flow.find("text:contains('Table 4')").below(until="text:contains('Table 5')")
+            sections = table_region.get_sections(start_elements="text:bold")
+        """
+        # Function-scope import, kept local exactly as the original code does.
+        from natural_pdf.flows.flow import Flow
+
+        # A Flow restricted to this region's own pieces, configured like the parent.
+        scoped_flow = Flow(
+            segments=self.constituent_regions,
+            arrangement=self.flow.arrangement,
+            alignment=self.flow.alignment,
+            segment_gap=self.flow.segment_gap,
+        )
+
+        # Every caller-supplied option is forwarded unchanged.
+        section_options = {
+            "start_elements": start_elements,
+            "end_elements": end_elements,
+            "new_section_on_page_break": new_section_on_page_break,
+            "include_boundaries": include_boundaries,
+            "orientation": orientation,
+        }
+        return scoped_flow.get_sections(**section_options)
+    def split(
+        self, by: Optional[str] = None, page_breaks: bool = True, **kwargs
+    ) -> "ElementCollection":
+        """
+        Break this FlowRegion into sections.
+
+        Convenience wrapper around get_sections(): ``by`` is forwarded as
+        ``start_elements``, ``page_breaks`` as ``new_section_on_page_break``,
+        and any extra keyword arguments are passed through untouched.
+
+        Args:
+            by: Selector string for elements that mark section boundaries (e.g., "text:bold")
+            page_breaks: Whether to also split at page boundaries (default: True)
+            **kwargs: Additional arguments passed to get_sections()
+
+        Returns:
+            ElementCollection of FlowRegion objects representing the sections
+
+        Example:
+            # Split by bold headers
+            sections = flow_region.split(by="text:bold")
+
+            # Split only by specific text pattern, ignoring page breaks
+            sections = flow_region.split(
+                by="text:contains('Section')",
+                page_breaks=False
+            )
+        """
+        sections = self.get_sections(
+            start_elements=by,
+            new_section_on_page_break=page_breaks,
+            **kwargs,
+        )
+        return sections
     @property
     def normalized_type(self) -> Optional[str]:
         """

natural_pdf/selectors/parser.py CHANGED Viewed

@@ -30,6 +30,7 @@ This enables powerful document navigation like:
 import ast
 import logging
 import re
+from collections import Counter
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from colormath2.color_conversions import convert_color
@@ -86,6 +87,47 @@ def safe_parse_value(value_str: str) -> Any:
         return value_str
+def _parse_aggregate_function(value_str: str) -> Optional[Dict[str, Any]]:
+    """Recognise an aggregate-function call such as min(), max(), avg() or closest("red").
+
+    The value is stripped and matched case-insensitively against the supported
+    function names. ``mean`` is reported as ``avg`` and ``most_common`` as
+    ``mode``.
+
+    Returns:
+        Dict with 'type': 'aggregate', 'func': canonical function name and
+        'args': the parsed argument (None when the parentheses are empty),
+        or None if the value is not an aggregate function call.
+    """
+    call = re.match(
+        r"^(min|max|avg|mean|median|mode|most_common|closest)\s*\((.*?)\)$",
+        value_str.strip(),
+        re.IGNORECASE,
+    )
+    if call is None:
+        return None
+
+    raw_name = call.group(1).lower()
+    raw_args = call.group(2).strip()
+
+    # Aliases collapse onto one canonical name each.
+    canonical = {"mean": "avg", "most_common": "mode"}.get(raw_name, raw_name)
+
+    if not raw_args:
+        parsed_args = None
+    elif canonical == "closest":
+        # closest() takes a colour, so its argument goes through the colour parser.
+        parsed_args = safe_parse_color(raw_args)
+    else:
+        parsed_args = safe_parse_value(raw_args)
+
+    return {"type": "aggregate", "func": canonical, "args": parsed_args}
 def safe_parse_color(value_str: str) -> tuple:
     """
     Parse a color value which could be an RGB tuple, color name, hex code, or CSS-style rgb(...)/rgba(...).
@@ -362,9 +404,14 @@ def parse_selector(selector: str) -> Dict[str, Any]:
                     raise ValueError(
                         f"Invalid selector: Attribute '[{name}{op}]' must have a value. Use '[{name}{op}\"\"]' for empty string or '[{name}]' for presence. Full selector: '{original_selector_for_error}'"
                     )
-                # Parse value
+                # Parse value - check for aggregate functions first
                 parsed_value: Any
-                if name in [
+                aggregate_func = _parse_aggregate_function(value_str)
+                if aggregate_func:
+                    # Store aggregate function info
+                    parsed_value = aggregate_func
+                elif name in [
                     "color",
                     "non_stroking_color",
                     "fill",
@@ -564,12 +611,15 @@ PSEUDO_CLASS_FUNCTIONS = {
 }
-def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]:
+def _build_filter_list(
+    selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
+) -> List[Dict[str, Any]]:
     """
     Convert a parsed selector to a list of named filter functions.
     Args:
         selector: Parsed selector dictionary
+        aggregates: Pre-calculated aggregate values (optional)
         **kwargs: Additional filter parameters including:
                  - regex: Whether to use regex for text search
                  - case: Whether to do case-sensitive text search
@@ -581,6 +631,9 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
     filters: List[Dict[str, Any]] = []
     selector_type = selector["type"]
+    if aggregates is None:
+        aggregates = {}
     # Filter by element type
     if selector_type != "any":
         filter_name = f"type is '{selector_type}'"
@@ -611,6 +664,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
         value = attr_filter["value"]
         python_name = name.replace("-", "_")  # Convert CSS-style names
+        # Check if value is an aggregate function
+        if isinstance(value, dict) and value.get("type") == "aggregate":
+            # Use pre-calculated aggregate value
+            aggregate_value = aggregates.get(name)
+            if aggregate_value is None:
+                # Skip this filter if aggregate couldn't be calculated
+                continue
+            value = aggregate_value
         # --- Define the core value retrieval logic ---
         def get_element_value(
             element, name=name, python_name=python_name, selector_type=selector_type
@@ -761,15 +823,15 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
                 )
             # Recursively get the filter function for the inner selector
-            # Pass kwargs down in case regex/case flags affect the inner selector
-            inner_filter_func = selector_to_filter_func(args, **kwargs)
+            # Pass kwargs and aggregates down in case regex/case flags affect the inner selector
+            inner_filter_func = selector_to_filter_func(args, aggregates=aggregates, **kwargs)
             # The filter lambda applies the inner function and inverts the result
             filter_lambda = lambda el, inner_func=inner_filter_func: not inner_func(el)
             # Try to create a descriptive name (can be long)
             # Maybe simplify this later if needed
-            inner_filter_list = _build_filter_list(args, **kwargs)
+            inner_filter_list = _build_filter_list(args, aggregates=aggregates, **kwargs)
             inner_filter_names = ", ".join([f["name"] for f in inner_filter_list])
             filter_name = f"pseudo-class :not({inner_filter_names})"
@@ -929,7 +991,113 @@ def _assemble_filter_func(filters: List[Dict[str, Any]]) -> Callable[[Any], bool
     return combined_filter
-def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any], bool]:
+def _calculate_aggregates(elements: List[Any], selector: Dict[str, Any]) -> Dict[str, Any]:
+    """Calculate the aggregate value for every aggregate attribute of a selector.
+
+    Args:
+        elements: List of elements to calculate aggregates from
+        selector: Parsed selector dictionary; only entries of ``selector["attributes"]``
+            whose ``value`` is a dict with ``"type": "aggregate"`` are considered
+
+    Returns:
+        Dict mapping attribute names to their aggregate values. The value is None
+        when no element supplies the attribute, when the function cannot be applied
+        to the collected values (e.g. avg of strings, max of mixed types), when
+        closest() is used without an argument or on a non-colour attribute, or when
+        the function name is not recognised.
+    """
+    aggregates: Dict[str, Any] = {}
+
+    for attr in selector.get("attributes", []):
+        value = attr.get("value")
+        if not (isinstance(value, dict) and value.get("type") == "aggregate"):
+            continue
+
+        attr_name = attr["name"]
+        values = _collect_aggregate_values(elements, attr_name)
+
+        # NOTE: results are keyed by attribute name only, so a second aggregate on
+        # the same attribute overwrites the first one.
+        if values:
+            aggregates[attr_name] = _reduce_aggregate_values(
+                value["func"], value.get("args"), attr_name, values
+            )
+        else:
+            # No valid values found, aggregate is None
+            aggregates[attr_name] = None
+
+    return aggregates
+
+
+def _collect_aggregate_values(elements: List[Any], attr_name: str) -> List[Any]:
+    """Gather the non-missing values of ``attr_name`` from ``elements``."""
+    # x0/y0/x1/y1 are read from the element's bounding box rather than as attributes.
+    bbox_index = {"x0": 0, "y0": 1, "x1": 2, "y1": 3}.get(attr_name)
+    python_name = attr_name.replace("-", "_")  # Convert CSS-style names
+
+    values: List[Any] = []
+    for el in elements:
+        try:
+            if bbox_index is not None:
+                bbox = getattr(el, "_bbox", None) or getattr(el, "bbox", None)
+                if bbox:
+                    values.append(bbox[bbox_index])
+            else:
+                val = getattr(el, python_name, None)
+                if val is not None:
+                    values.append(val)
+        except Exception:
+            # Best effort: an element whose attribute access fails is skipped.
+            continue
+    return values
+
+
+def _reduce_aggregate_values(
+    func_name: str, func_args: Any, attr_name: str, values: List[Any]
+) -> Any:
+    """Apply one aggregate function to a non-empty list of values."""
+    try:
+        if func_name == "min":
+            return min(values)
+        if func_name == "max":
+            return max(values)
+        if func_name == "avg":
+            return sum(values) / len(values)
+        if func_name == "median":
+            ordered = sorted(values)
+            middle = len(ordered) // 2
+            if len(ordered) % 2 == 0:
+                return (ordered[middle - 1] + ordered[middle]) / 2
+            return ordered[middle]
+    except TypeError:
+        # Non-numeric or mutually non-comparable values: no meaningful aggregate.
+        return None
+
+    if func_name == "mode":
+        return _most_common_value(values)
+    if func_name == "closest" and func_args is not None:
+        return _closest_color_value(attr_name, values, func_args)
+    return None
+
+
+def _most_common_value(values: List[Any]) -> Any:
+    """Return the most frequent value; ties go to the value seen first."""
+    try:
+        return Counter(values).most_common(1)[0][0]
+    except TypeError:
+        # Unhashable values (e.g. colours stored as lists) cannot go through
+        # Counter, so count by equality instead.
+        best_value, best_count = None, 0
+        for candidate in values:
+            count = sum(1 for other in values if other == candidate)
+            if count > best_count:
+                best_value, best_count = candidate, count
+        return best_value
+
+
+def _closest_color_value(attr_name: str, values: List[Any], target: Any) -> Any:
+    """Return the colour in ``values`` nearest to ``target`` (None for non-colour attributes)."""
+    color_attributes = (
+        "color",
+        "non_stroking_color",
+        "fill",
+        "stroke",
+        "strokeColor",
+        "fillColor",
+    )
+    if attr_name not in color_attributes:
+        # For non-colors, closest doesn't make sense
+        return None
+
+    min_distance = float("inf")
+    closest_value = None
+    for val in values:
+        try:
+            distance = _color_distance(val, target)
+        except Exception:
+            # A value that cannot be compared as a colour is ignored.
+            continue
+        if distance < min_distance:
+            min_distance = distance
+            closest_value = val
+    return closest_value
+def selector_to_filter_func(
+    selector: Dict[str, Any], aggregates: Optional[Dict[str, Any]] = None, **kwargs
+) -> Callable[[Any], bool]:
     """
     Convert a parsed selector to a single filter function.
@@ -938,6 +1106,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
     Args:
         selector: Parsed selector dictionary (single or compound OR selector)
+        aggregates: Pre-calculated aggregate values (optional)
         **kwargs: Additional filter parameters (e.g., regex, case).
     Returns:
@@ -953,7 +1122,9 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
         # Create filter functions for each sub-selector
         sub_filter_funcs = []
         for sub_selector in sub_selectors:
-            sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
+            sub_filter_funcs.append(
+                selector_to_filter_func(sub_selector, aggregates=aggregates, **kwargs)
+            )
         if logger.isEnabledFor(logging.DEBUG):
             logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
@@ -973,7 +1144,7 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
         return or_filter
     # Handle single selectors (existing logic)
-    filter_list = _build_filter_list(selector, **kwargs)
+    filter_list = _build_filter_list(selector, aggregates=aggregates, **kwargs)
     if logger.isEnabledFor(logging.DEBUG):
         filter_names = [f["name"] for f in filter_list]

natural_pdf/utils/pdfminer_patches.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""Monkey patches for pdfminer.six bugs.
+This module contains patches for known bugs in pdfminer.six that affect
+natural_pdf functionality. These patches are applied automatically when
+natural_pdf is imported.
+"""
+import logging
+import os
+from typing import List, Optional, Tuple, Union
+
+logger = logging.getLogger(__name__)
+
+# Track if patches have been applied
+_patches_applied = False
+
+# Allow disabling patches via environment variable: set
+# NATURAL_PDF_DISABLE_PDFMINER_PATCHES to "1", "true" or "yes" (case-insensitive).
+# The variable is read once, when this module is imported.
+DISABLE_PATCHES = os.environ.get("NATURAL_PDF_DISABLE_PDFMINER_PATCHES", "").lower() in (
+    "1",
+    "true",
+    "yes",
+)
+def _patch_color_space_bug():
+    """
+    Fix pdfminer.six color parsing bug for bare 'sc' commands.
+
+    Bug: When a PDF uses 'sc' without an explicit color space (e.g., '1 1 0 sc'),
+    pdfminer defaults to DeviceGray (1 component) and only reads one value,
+    resulting in wrong colors.
+
+    This patch replaces ``PDFPageInterpreter.do_scn`` with a wrapper. When the
+    current non-stroking color space expects one component but the last three
+    operands on the argument stack are all numbers in the 0-1 range, they are
+    consumed as an RGB color. In every other case the original ``do_scn`` runs.
+
+    Reference: the original source cites a pdfplumber issue with a placeholder
+    number (https://github.com/jsvine/pdfplumber/issues/XXX) - TODO fill in the
+    real issue number.
+
+    Returns:
+        True if the patch was installed, False if installing it failed (the
+        failure is logged as a warning and not raised).
+    """
+    try:
+        import pdfminer.pdfinterp
+        from pdfminer.casting import safe_rgb
+
+        # Save original method so the wrapper can defer to it
+        original_do_scn = pdfminer.pdfinterp.PDFPageInterpreter.do_scn
+
+        def patched_do_scn(self):
+            """Patched do_scn that handles RGB colors without explicit color space."""
+            # Get expected components from current color space
+            n = self.graphicstate.ncs.ncomponents
+
+            # Special handling for a 1-component space with potential RGB values
+            if n == 1 and len(self.argstack) >= 3:
+                # Peek at the last 3 values; nothing is removed from the stack yet
+                last_three = self.argstack[-3:]
+                try:
+                    looks_like_rgb = all(
+                        isinstance(v, (int, float)) and 0 <= v <= 1 for v in last_three
+                    )
+                    if looks_like_rgb:
+                        rgb = safe_rgb(*last_three)
+                        if rgb is not None:
+                            # Only consume the operands once the conversion has
+                            # succeeded, so the fallback below always sees an
+                            # untouched argument stack.
+                            self.pop(3)
+                            self.graphicstate.ncolor = rgb
+                            return
+                except (ValueError, TypeError, AttributeError):
+                    # Any error, fall back to original
+                    pass
+
+            # Use original behavior for all other cases
+            return original_do_scn(self)
+
+        # Apply the patch
+        pdfminer.pdfinterp.PDFPageInterpreter.do_scn = patched_do_scn
+        logger.debug("Applied pdfminer color space bug patch")
+        return True
+    except Exception as e:
+        # Patching is best effort: a missing or incompatible pdfminer must not
+        # stop the rest of the import from working.
+        logger.warning(f"Failed to apply pdfminer color patch: {e}")
+        return False
+def apply_patches():
+    """Apply all pdfminer patches. Safe to call multiple times.
+
+    Does nothing when the patches were already attempted or when
+    DISABLE_PATCHES is set. Otherwise each registered patch function is run in
+    order, the names of those that succeeded and failed are logged, and the
+    module is marked as patched, even if some patches failed.
+    """
+    global _patches_applied
+
+    if _patches_applied or DISABLE_PATCHES:
+        return
+
+    patches = [
+        ("color_space_bug", _patch_color_space_bug),
+        # Add more patches here as needed
+    ]
+
+    # Run every patch exactly once, in registration order, remembering the outcome.
+    outcomes = [(name, patch_func()) for name, patch_func in patches]
+    applied = [name for name, succeeded in outcomes if succeeded]
+    failed = [name for name, succeeded in outcomes if not succeeded]
+
+    if applied:
+        logger.info(f"Applied pdfminer patches: {', '.join(applied)}")
+    if failed:
+        logger.warning(f"Failed to apply patches: {', '.join(failed)}")
+
+    _patches_applied = True
+def get_patch_status() -> dict:
+    """Report whether patching has been attempted and which pdfminer version is installed.
+
+    Returns:
+        Dict with 'patches_applied' (the module-level flag set by apply_patches())
+        and 'pdfminer_version' (see _get_pdfminer_version()).
+    """
+    status = {}
+    status["patches_applied"] = _patches_applied
+    status["pdfminer_version"] = _get_pdfminer_version()
+    return status
+def _get_pdfminer_version() -> str:
+    """Return pdfminer's ``__version__``.
+
+    Falls back to "unknown" when the module has no ``__version__`` attribute
+    and to "not installed" when pdfminer cannot be imported.
+    """
+    try:
+        import pdfminer
+    except ImportError:
+        return "not installed"
+    return getattr(pdfminer, "__version__", "unknown")

natural-pdf 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

natural-pdf: natural_pdf-0.2.16-py3-none-any.whl → natural_pdf-0.2.18-py3-none-any.whl