PyPI - natural-pdf - Versions diffs - 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl - Mend

natural-pdf 0.1.15-py3-none-any.whl → 0.1.17-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

natural_pdf/__init__.py +31 -0
natural_pdf/analyzers/layout/gemini.py +137 -162
natural_pdf/analyzers/layout/layout_manager.py +9 -5
natural_pdf/analyzers/layout/layout_options.py +77 -7
natural_pdf/analyzers/layout/paddle.py +318 -165
natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
natural_pdf/analyzers/shape_detection_mixin.py +770 -405
natural_pdf/classification/mixin.py +2 -8
natural_pdf/collections/pdf_collection.py +25 -30
natural_pdf/core/highlighting_service.py +47 -32
natural_pdf/core/page.py +119 -76
natural_pdf/core/pdf.py +19 -22
natural_pdf/describe/__init__.py +21 -0
natural_pdf/describe/base.py +457 -0
natural_pdf/describe/elements.py +411 -0
natural_pdf/describe/mixin.py +84 -0
natural_pdf/describe/summary.py +186 -0
natural_pdf/elements/base.py +11 -10
natural_pdf/elements/collections.py +116 -51
natural_pdf/elements/region.py +204 -127
natural_pdf/exporters/paddleocr.py +38 -13
natural_pdf/flows/__init__.py +3 -3
natural_pdf/flows/collections.py +303 -132
natural_pdf/flows/element.py +277 -132
natural_pdf/flows/flow.py +33 -16
natural_pdf/flows/region.py +142 -79
natural_pdf/ocr/engine_doctr.py +37 -4
natural_pdf/ocr/engine_easyocr.py +23 -3
natural_pdf/ocr/engine_paddle.py +281 -30
natural_pdf/ocr/engine_surya.py +8 -3
natural_pdf/ocr/ocr_manager.py +75 -76
natural_pdf/ocr/ocr_options.py +52 -87
natural_pdf/search/__init__.py +25 -12
natural_pdf/search/lancedb_search_service.py +91 -54
natural_pdf/search/numpy_search_service.py +86 -65
natural_pdf/search/searchable_mixin.py +2 -2
natural_pdf/selectors/parser.py +125 -81
natural_pdf/widgets/__init__.py +1 -1
natural_pdf/widgets/viewer.py +205 -449
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0

natural_pdf/describe/elements.py ADDED Viewed

@@ -0,0 +1,411 @@
+"""
+Element-specific describe functions.
+"""
+import logging
+from collections import Counter
+from typing import TYPE_CHECKING, Any, Dict, List
+if TYPE_CHECKING:
+    from natural_pdf.elements.base import Element
+logger = logging.getLogger(__name__)
+def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
+    """
+    Describe text elements with typography and OCR analysis.
+    Args:
+        elements: List of text elements
+    Returns:
+        Dictionary with text analysis sections
+    """
+    if not elements:
+        return {"message": "No text elements found"}
+    result = {}
+    # Source breakdown
+    sources = Counter()
+    ocr_elements = []
+    for element in elements:
+        source = getattr(element, 'source', 'unknown')
+        sources[source] += 1
+        if source == 'ocr':
+            ocr_elements.append(element)
+    if len(sources) > 1:
+        result['sources'] = dict(sources)
+    # Typography analysis
+    typography = _analyze_typography(elements)
+    if typography:
+        result['typography'] = typography
+    # OCR quality analysis
+    if ocr_elements:
+        ocr_quality = _analyze_ocr_quality(ocr_elements)
+        if ocr_quality:
+            result['ocr_quality'] = ocr_quality
+    return result
+def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
+    """
+    Describe rectangle elements with size and style analysis.
+    Args:
+        elements: List of rectangle elements
+    Returns:
+        Dictionary with rectangle analysis
+    """
+    if not elements:
+        return {"message": "No rectangle elements found"}
+    result = {}
+    # Size analysis
+    sizes = []
+    stroke_count = 0
+    fill_count = 0
+    colors = Counter()
+    stroke_widths = []
+    for element in elements:
+        # Size
+        width = getattr(element, 'width', 0)
+        height = getattr(element, 'height', 0)
+        if width and height:
+            sizes.append((width, height))
+        # Style properties - use RectangleElement properties
+        stroke = getattr(element, 'stroke', None)
+        if stroke and stroke != (0, 0, 0):  # Check if stroke color exists and isn't black
+            stroke_count += 1
+        fill = getattr(element, 'fill', None)
+        if fill and fill != (0, 0, 0):  # Check if fill color exists and isn't black
+            fill_count += 1
+        # Stroke width
+        stroke_width = getattr(element, 'stroke_width', 0)
+        if stroke_width > 0:
+            stroke_widths.append(stroke_width)
+        # Color - use the element's stroke/fill properties
+        color = stroke or fill
+        if color:
+            if isinstance(color, (tuple, list)):
+                if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
+                    colors['black'] += 1
+                elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
+                    colors['white'] += 1
+                else:
+                    colors[str(color)] += 1
+            else:
+                colors[str(color)] += 1
+    # Size statistics
+    if sizes:
+        widths = [s[0] for s in sizes]
+        heights = [s[1] for s in sizes]
+        result['size_stats'] = {
+            'width_range': f"{min(widths):.0f}-{max(widths):.0f}",
+            'height_range': f"{min(heights):.0f}-{max(heights):.0f}",
+            'avg_area': f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts"
+        }
+    # Style breakdown
+    style_info = {}
+    if stroke_count:
+        style_info['stroke'] = stroke_count
+    if fill_count:
+        style_info['fill'] = fill_count
+    if stroke_widths:
+        stroke_width_counts = Counter(stroke_widths)
+        # Convert float keys to strings to avoid formatting issues
+        stroke_width_dict = {str(k): v for k, v in stroke_width_counts.most_common()}
+        style_info['stroke_widths'] = stroke_width_dict
+    if colors:
+        style_info['colors'] = dict(colors.most_common(5))
+    if style_info:
+        result['styles'] = style_info
+    return result
+def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
+    """
+    Describe line elements with length and style analysis.
+    Args:
+        elements: List of line elements
+    Returns:
+        Dictionary with line analysis
+    """
+    if not elements:
+        return {"message": "No line elements found"}
+    result = {}
+    lengths = []
+    widths = []
+    colors = Counter()
+    for element in elements:
+        # Calculate length
+        x0 = getattr(element, 'x0', 0)
+        y0 = getattr(element, 'top', 0)
+        x1 = getattr(element, 'x1', 0)
+        y1 = getattr(element, 'bottom', 0)
+        length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
+        if length > 0:
+            lengths.append(length)
+        # Line width - use the element's width property
+        width = getattr(element, 'width', 0)  # LineElement has a width property
+        if width:
+            widths.append(width)
+        # Color - use the element's color property
+        color = getattr(element, 'color', None)  # LineElement has a color property
+        if color:
+            if isinstance(color, (tuple, list)):
+                if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
+                    colors['black'] += 1
+                else:
+                    colors[str(color)] += 1
+            else:
+                colors[str(color)] += 1
+    # Length statistics
+    if lengths:
+        result['length_stats'] = {
+            'min': f"{min(lengths):.0f}",
+            'max': f"{max(lengths):.0f}",
+            'avg': f"{sum(lengths)/len(lengths):.0f}"
+        }
+    # Width statistics
+    if widths:
+        width_counts = Counter(widths)
+        # Convert float keys to strings to avoid formatting issues
+        result['line_widths'] = {str(k): v for k, v in width_counts.most_common()}
+    # Orientation analysis
+    horizontal_count = sum(1 for el in elements if getattr(el, 'is_horizontal', False))
+    vertical_count = sum(1 for el in elements if getattr(el, 'is_vertical', False))
+    diagonal_count = len(elements) - horizontal_count - vertical_count
+    if horizontal_count or vertical_count or diagonal_count:
+        orientation_info = {}
+        if horizontal_count:
+            orientation_info['horizontal'] = horizontal_count
+        if vertical_count:
+            orientation_info['vertical'] = vertical_count
+        if diagonal_count:
+            orientation_info['diagonal'] = diagonal_count
+        result['orientations'] = orientation_info
+    # Colors
+    if colors:
+        result['colors'] = dict(colors.most_common())
+    return result
+def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
+    """
+    Describe region elements with type and metadata analysis.
+    Args:
+        elements: List of region elements
+    Returns:
+        Dictionary with region analysis
+    """
+    if not elements:
+        return {"message": "No region elements found"}
+    result = {}
+    # Region types
+    types = Counter()
+    sizes = []
+    metadata_keys = set()
+    for element in elements:
+        # Type
+        region_type = getattr(element, 'type', 'unknown')
+        types[region_type] += 1
+        # Size
+        width = getattr(element, 'width', 0)
+        height = getattr(element, 'height', 0)
+        if width and height:
+            sizes.append(width * height)
+        # Metadata keys
+        if hasattr(element, 'metadata') and element.metadata:
+            metadata_keys.update(element.metadata.keys())
+    # Type breakdown
+    if types:
+        result['types'] = dict(types.most_common())
+    # Size statistics
+    if sizes:
+        result['size_stats'] = {
+            'min_area': f"{min(sizes):.0f} sq pts",
+            'max_area': f"{max(sizes):.0f} sq pts",
+            'avg_area': f"{sum(sizes)/len(sizes):.0f} sq pts"
+        }
+    # Metadata
+    if metadata_keys:
+        result['metadata_keys'] = sorted(list(metadata_keys))
+    return result
+def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
+    """Analyze typography patterns in text elements."""
+    fonts = Counter()
+    sizes = Counter()
+    styles = {'bold': 0, 'italic': 0}
+    colors = Counter()
+    for element in elements:
+        # Font family - use TextElement's font_family property for cleaner names
+        font_family = getattr(element, 'font_family', None)
+        fontname = getattr(element, 'fontname', 'Unknown')
+        display_font = font_family if font_family and font_family != fontname else fontname
+        if display_font:
+            fonts[display_font] += 1
+        # Size
+        size = getattr(element, 'size', None)
+        if size:
+            # Round to nearest 0.5
+            rounded_size = round(size * 2) / 2
+            sizes[f"{rounded_size}pt"] += 1
+        # Styles
+        if getattr(element, 'bold', False):
+            styles['bold'] += 1
+        if getattr(element, 'italic', False):
+            styles['italic'] += 1
+        # Color - use TextElement's color property
+        color = getattr(element, 'color', None)
+        if color:
+            if isinstance(color, (tuple, list)):
+                if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
+                    colors['black'] += 1
+                elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
+                    colors['white'] += 1
+                else:
+                    colors['other'] += 1
+            else:
+                colors[str(color)] += 1
+    result = {}
+    # Fonts
+    if fonts:
+        result['fonts'] = dict(fonts.most_common(10))
+    # Sizes (as horizontal table)
+    if sizes:
+        result['sizes'] = dict(sizes.most_common())
+    # Styles
+    style_list = []
+    if styles['bold']:
+        style_list.append(f"{styles['bold']} bold")
+    if styles['italic']:
+        style_list.append(f"{styles['italic']} italic")
+    if style_list:
+        result['styles'] = ", ".join(style_list)
+    # Colors
+    if colors and len(colors) > 1:  # Only show if there are multiple colors
+        result['colors'] = dict(colors.most_common())
+    return result
+def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
+    """Analyze OCR quality metrics."""
+    confidences = []
+    for element in elements:
+        confidence = getattr(element, 'confidence', None)
+        if confidence is not None:
+            confidences.append(confidence)
+    if not confidences:
+        return {}
+    result = {}
+    # Basic stats
+    result['confidence_stats'] = {
+        'mean': f"{sum(confidences)/len(confidences):.2f}",
+        'min': f"{min(confidences):.2f}",
+        'max': f"{max(confidences):.2f}"
+    }
+    # Threshold analysis with ASCII bars
+    thresholds = [
+        ('99%+', 0.99),
+        ('95%+', 0.95),
+        ('90%+', 0.90),
+    ]
+    element_count = len(elements)
+    threshold_bars = {}
+    for label, threshold in thresholds:
+        count = sum(1 for c in confidences if c >= threshold)
+        percentage = count / element_count
+        # Create ASCII bar (40 characters wide)
+        filled_chars = int(percentage * 40)
+        empty_chars = 40 - filled_chars
+        bar = '█' * filled_chars + '░' * empty_chars
+        # Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
+        threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"
+    result['quality_distribution'] = threshold_bars
+    # Show lowest quality items
+    element_confidences = []
+    for element in elements:
+        confidence = getattr(element, 'confidence', None)
+        if confidence is not None:
+            # Get text content for display
+            text = getattr(element, 'text', '').strip()
+            if text:
+                # Truncate long text
+                display_text = text[:50] + "..." if len(text) > 50 else text
+                element_confidences.append((confidence, display_text))
+    if element_confidences:
+        # Sort by confidence (lowest first) and take bottom 10
+        lowest_quality = sorted(element_confidences, key=lambda x: x[0])[:10]
+        if lowest_quality:
+            lowest_items = {}
+            for i, (confidence, text) in enumerate(lowest_quality, 1):
+                lowest_items[f"#{i}"] = f"**{confidence:.2f}**: {text}"
+            result['lowest_scoring'] = lowest_items
+    return result

natural_pdf/describe/mixin.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""
+Mixin for describe functionality.
+"""
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from natural_pdf.describe.summary import ElementSummary, InspectionSummary
+class DescribeMixin:
+    """
+    Mixin providing describe functionality for pages, collections, and regions.
+    Classes that inherit from this mixin get:
+    - .describe() method for high-level summaries
+    - .inspect() method for detailed tabular views (collections only)
+    """
+    def describe(self) -> "ElementSummary":
+        """
+        Describe this object with type-specific analysis.
+        Returns:
+            ElementSummary with analysis appropriate for the object type
+        """
+        from natural_pdf.describe import describe_page, describe_collection, describe_region, describe_element
+        # Determine the appropriate describe function based on class type
+        class_name = self.__class__.__name__
+        if class_name == 'Page':
+            return describe_page(self)
+        elif class_name == 'ElementCollection':
+            return describe_collection(self)
+        elif class_name == 'Region':
+            return describe_region(self)
+        else:
+            # Check if it's an individual element (inherits from Element base class)
+            from natural_pdf.elements.base import Element
+            if isinstance(self, Element):
+                return describe_element(self)
+            # Fallback - try to determine based on available methods/attributes
+            if hasattr(self, 'get_elements') and hasattr(self, 'width') and hasattr(self, 'height'):
+                # Looks like a page or region
+                if hasattr(self, 'number'):
+                    return describe_page(self)  # Page
+                else:
+                    return describe_region(self)  # Region
+            elif hasattr(self, '__iter__') and hasattr(self, '__len__'):
+                # Looks like a collection
+                return describe_collection(self)
+            else:
+                # Unknown type - create a basic summary
+                from natural_pdf.describe.summary import ElementSummary
+                data = {
+                    "object_type": class_name,
+                    "message": f"Describe not fully implemented for {class_name}"
+                }
+                return ElementSummary(data, f"{class_name} Summary")
+class InspectMixin:
+    """
+    Mixin providing inspect functionality for collections.
+    Classes that inherit from this mixin get:
+    - .inspect() method for detailed tabular element views
+    """
+    def inspect(self, limit: int = 30) -> "InspectionSummary":
+        """
+        Inspect elements with detailed tabular view.
+        Args:
+            limit: Maximum elements per type to show (default: 30)
+        Returns:
+            InspectionSummary with element tables showing coordinates,
+            properties, and other details for each element
+        """
+        from natural_pdf.describe import inspect_collection
+        return inspect_collection(self, limit=limit)

natural_pdf/describe/summary.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""
+Summary objects for describe functionality.
+"""
+from typing import Any, Dict, List, Union
+class ElementSummary:
+    """
+    Container for element summary data with markdown rendering.
+    Automatically renders as markdown in Jupyter notebooks and provides
+    access to underlying data as dictionaries.
+    """
+    def __init__(self, data: Dict[str, Any], title: str = "Summary"):
+        """
+        Initialize summary with data and optional title.
+        Args:
+            data: Dictionary containing summary sections
+            title: Title for the summary display
+        """
+        self.data = data
+        self.title = title
+    def __str__(self) -> str:
+        """String representation as markdown."""
+        return self._to_markdown()
+    def __repr__(self) -> str:
+        """Repr as markdown for better display."""
+        return self._to_markdown()
+    def _repr_markdown_(self) -> str:
+        """Jupyter notebook markdown rendering."""
+        return self._to_markdown()
+    def to_dict(self) -> Dict[str, Any]:
+        """Return underlying data as dictionary."""
+        return self.data.copy()
+    def _to_markdown(self) -> str:
+        """Convert data to markdown format."""
+        lines = [f"## {self.title}", ""]
+        for section_name, section_data in self.data.items():
+            lines.extend(self._format_section(section_name, section_data))
+            lines.append("")  # Empty line between sections
+        return "\n".join(lines).rstrip()
+    def _format_section(self, name: str, data: Any) -> List[str]:
+        """Format a single section as markdown."""
+        # Use bold text instead of headers for more compact display
+        section_title = name.replace('_', ' ').title()
+        if isinstance(data, dict):
+            lines = [f"**{section_title}**:"]
+            lines.extend(self._format_dict(data, indent="  "))
+        elif isinstance(data, list):
+            lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
+        else:
+            lines = [f"**{section_title}**: {data}"]
+        return lines
+    def _format_dict(self, data: Dict[str, Any], indent: str = "") -> List[str]:
+        """Format dictionary as markdown list."""
+        lines = []
+        for key, value in data.items():
+            key_display = key.replace('_', ' ')
+            if isinstance(value, dict):
+                # Nested dict - always format as list items
+                lines.append(f"{indent}- **{key_display}**:")
+                for subkey, subvalue in value.items():
+                    subkey_display = subkey.replace('_', ' ')
+                    if isinstance(subvalue, dict):
+                        # Another level of nesting
+                        lines.append(f"{indent}  - **{subkey_display}**:")
+                        for subsubkey, subsubvalue in subvalue.items():
+                            subsubkey_display = subsubkey.replace('_', ' ')
+                            lines.append(f"{indent}    - {subsubkey_display}: {subsubvalue}")
+                    else:
+                        lines.append(f"{indent}  - {subkey_display}: {subvalue}")
+            elif isinstance(value, list):
+                if len(value) <= 5:
+                    value_str = ", ".join(str(v) for v in value)
+                    lines.append(f"{indent}- **{key_display}**: {value_str}")
+                else:
+                    lines.append(f"{indent}- **{key_display}**: {len(value)} items")
+            else:
+                lines.append(f"{indent}- **{key_display}**: {value}")
+        return lines
+    def _format_list(self, data: List[Any]) -> List[str]:
+        """Format list as markdown."""
+        lines = []
+        for item in data:
+            if isinstance(item, dict):
+                # Could be table rows
+                lines.append(f"- {item}")
+            else:
+                lines.append(f"- {item}")
+        return lines
+    def _format_horizontal_table(self, title: str, data: Dict[str, Any]) -> List[str]:
+        """Format dict as horizontal table."""
+        headers = list(data.keys())
+        values = list(data.values())
+        # Create table
+        header_row = "| " + " | ".join(headers) + " |"
+        separator = "|" + "|".join("------" for _ in headers) + "|"
+        value_row = "| " + " | ".join(str(v) for v in values) + " |"
+        return [
+            f"- **{title}**:",
+            "",
+            header_row,
+            separator,
+            value_row,
+            ""
+        ]
+class InspectionSummary(ElementSummary):
+    """
+    Summary for element inspection with tabular data.
+    """
+    def _format_section(self, name: str, data: Any) -> List[str]:
+        """Format inspection section with element tables."""
+        section_title = name.replace('_', ' ').title()
+        if isinstance(data, dict) and 'elements' in data:
+            # This is an element table section - use ### header for inspect
+            elements = data['elements']
+            lines = [f"### {section_title}"]
+            if elements:
+                lines.extend(self._format_element_table(elements, data.get('columns', [])))
+                # Add note if truncated
+                if 'note' in data:
+                    lines.append(f"_{data['note']}_")
+            else:
+                lines.append("No elements found.")
+        else:
+            # Regular section formatting
+            lines = [f"**{section_title}**: {data}"]
+        return lines
+    def _format_element_table(self, elements: List[Dict[str, Any]], columns: List[str]) -> List[str]:
+        """Format elements as markdown table."""
+        if not elements or not columns:
+            return ["No elements to display."]
+        lines = [""]  # Empty line before table
+        # Table header
+        header_row = "| " + " | ".join(columns) + " |"
+        separator = "|" + "|".join("------" for _ in columns) + "|"
+        lines.extend([header_row, separator])
+        # Table rows
+        for element in elements:
+            row_values = []
+            for col in columns:
+                value = element.get(col, "")
+                if value is None:
+                    value = ""
+                elif isinstance(value, float):
+                    value = str(int(round(value)))
+                elif isinstance(value, str) and len(value) > 50:
+                    value = value[:50] + "..."
+                row_values.append(str(value))
+            row = "| " + " | ".join(row_values) + " |"
+            lines.append(row)
+        return lines

natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

natural-pdf 0.1.15-py3-none-any.whl → 0.1.17-py3-none-any.whl