PyPI - natural-pdf - Versions diffs - 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl - Mend

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +670 -595
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +188 -82
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +132 -16
natural_pdf/core/pdf.py +486 -71
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +238 -111
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
natural_pdf-0.1.34.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.32.dist-info/RECORD +0 -118
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0

natural_pdf/describe/elements.py CHANGED Viewed

@@ -15,263 +15,263 @@ logger = logging.getLogger(__name__)
 def describe_text_elements(elements: List["Element"]) -> Dict[str, Any]:
     """
     Describe text elements with typography and OCR analysis.
     Args:
         elements: List of text elements
     Returns:
         Dictionary with text analysis sections
     """
     if not elements:
         return {"message": "No text elements found"}
     result = {}
     # Source breakdown
     sources = Counter()
     ocr_elements = []
     for element in elements:
-        source = getattr(element, 'source', 'unknown')
+        source = getattr(element, "source", "unknown")
         sources[source] += 1
-        if source == 'ocr':
+        if source == "ocr":
             ocr_elements.append(element)
     if len(sources) > 1:
-        result['sources'] = dict(sources)
+        result["sources"] = dict(sources)
     # Typography analysis
     typography = _analyze_typography(elements)
     if typography:
-        result['typography'] = typography
+        result["typography"] = typography
     # OCR quality analysis
     if ocr_elements:
         ocr_quality = _analyze_ocr_quality(ocr_elements)
         if ocr_quality:
-            result['ocr_quality'] = ocr_quality
+            result["ocr_quality"] = ocr_quality
     return result
 def describe_rect_elements(elements: List["Element"]) -> Dict[str, Any]:
     """
     Describe rectangle elements with size and style analysis.
     Args:
         elements: List of rectangle elements
     Returns:
         Dictionary with rectangle analysis
     """
     if not elements:
         return {"message": "No rectangle elements found"}
     result = {}
     # Size analysis
     sizes = []
     stroke_count = 0
     fill_count = 0
     colors = Counter()
     stroke_widths = []
     for element in elements:
         # Size
-        width = getattr(element, 'width', 0)
-        height = getattr(element, 'height', 0)
+        width = getattr(element, "width", 0)
+        height = getattr(element, "height", 0)
         if width and height:
             sizes.append((width, height))
         # Style properties - use RectangleElement properties
-        stroke = getattr(element, 'stroke', None)
+        stroke = getattr(element, "stroke", None)
         if stroke and stroke != (0, 0, 0):  # Check if stroke color exists and isn't black
             stroke_count += 1
-        fill = getattr(element, 'fill', None)
+        fill = getattr(element, "fill", None)
         if fill and fill != (0, 0, 0):  # Check if fill color exists and isn't black
             fill_count += 1
         # Stroke width
-        stroke_width = getattr(element, 'stroke_width', 0)
+        stroke_width = getattr(element, "stroke_width", 0)
         if stroke_width > 0:
             stroke_widths.append(stroke_width)
         # Color - use the element's stroke/fill properties
         color = stroke or fill
         if color:
             if isinstance(color, (tuple, list)):
                 if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
-                    colors['black'] += 1
+                    colors["black"] += 1
                 elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
-                    colors['white'] += 1
+                    colors["white"] += 1
                 else:
                     colors[str(color)] += 1
             else:
                 colors[str(color)] += 1
     # Size statistics
     if sizes:
         widths = [s[0] for s in sizes]
         heights = [s[1] for s in sizes]
-        result['size_stats'] = {
-            'width_range': f"{min(widths):.0f}-{max(widths):.0f}",
-            'height_range': f"{min(heights):.0f}-{max(heights):.0f}",
-            'avg_area': f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts"
+        result["size_stats"] = {
+            "width_range": f"{min(widths):.0f}-{max(widths):.0f}",
+            "height_range": f"{min(heights):.0f}-{max(heights):.0f}",
+            "avg_area": f"{sum(w*h for w,h in sizes)/len(sizes):.0f} sq pts",
         }
     # Style breakdown
     style_info = {}
     if stroke_count:
-        style_info['stroke'] = stroke_count
+        style_info["stroke"] = stroke_count
     if fill_count:
-        style_info['fill'] = fill_count
+        style_info["fill"] = fill_count
     if stroke_widths:
         stroke_width_counts = Counter(stroke_widths)
         # Convert float keys to strings to avoid formatting issues
         stroke_width_dict = {str(k): v for k, v in stroke_width_counts.most_common()}
-        style_info['stroke_widths'] = stroke_width_dict
+        style_info["stroke_widths"] = stroke_width_dict
     if colors:
-        style_info['colors'] = dict(colors.most_common(5))
+        style_info["colors"] = dict(colors.most_common(5))
     if style_info:
-        result['styles'] = style_info
+        result["styles"] = style_info
     return result
 def describe_line_elements(elements: List["Element"]) -> Dict[str, Any]:
     """
     Describe line elements with length and style analysis.
     Args:
         elements: List of line elements
     Returns:
         Dictionary with line analysis
     """
     if not elements:
         return {"message": "No line elements found"}
     result = {}
     lengths = []
     widths = []
     colors = Counter()
     for element in elements:
         # Calculate length
-        x0 = getattr(element, 'x0', 0)
-        y0 = getattr(element, 'top', 0)
-        x1 = getattr(element, 'x1', 0)
-        y1 = getattr(element, 'bottom', 0)
+        x0 = getattr(element, "x0", 0)
+        y0 = getattr(element, "top", 0)
+        x1 = getattr(element, "x1", 0)
+        y1 = getattr(element, "bottom", 0)
         length = ((x1 - x0) ** 2 + (y1 - y0) ** 2) ** 0.5
         if length > 0:
             lengths.append(length)
         # Line width - use the element's width property
-        width = getattr(element, 'width', 0)  # LineElement has a width property
+        width = getattr(element, "width", 0)  # LineElement has a width property
         if width:
             widths.append(width)
         # Color - use the element's color property
-        color = getattr(element, 'color', None)  # LineElement has a color property
+        color = getattr(element, "color", None)  # LineElement has a color property
         if color:
             if isinstance(color, (tuple, list)):
                 if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
-                    colors['black'] += 1
+                    colors["black"] += 1
                 else:
                     colors[str(color)] += 1
             else:
                 colors[str(color)] += 1
     # Length statistics
     if lengths:
-        result['length_stats'] = {
-            'min': f"{min(lengths):.0f}",
-            'max': f"{max(lengths):.0f}",
-            'avg': f"{sum(lengths)/len(lengths):.0f}"
+        result["length_stats"] = {
+            "min": f"{min(lengths):.0f}",
+            "max": f"{max(lengths):.0f}",
+            "avg": f"{sum(lengths)/len(lengths):.0f}",
         }
     # Width statistics
     if widths:
         width_counts = Counter(widths)
         # Convert float keys to strings to avoid formatting issues
-        result['line_widths'] = {str(k): v for k, v in width_counts.most_common()}
+        result["line_widths"] = {str(k): v for k, v in width_counts.most_common()}
     # Orientation analysis
-    horizontal_count = sum(1 for el in elements if getattr(el, 'is_horizontal', False))
-    vertical_count = sum(1 for el in elements if getattr(el, 'is_vertical', False))
+    horizontal_count = sum(1 for el in elements if getattr(el, "is_horizontal", False))
+    vertical_count = sum(1 for el in elements if getattr(el, "is_vertical", False))
     diagonal_count = len(elements) - horizontal_count - vertical_count
     if horizontal_count or vertical_count or diagonal_count:
         orientation_info = {}
         if horizontal_count:
-            orientation_info['horizontal'] = horizontal_count
+            orientation_info["horizontal"] = horizontal_count
         if vertical_count:
-            orientation_info['vertical'] = vertical_count
+            orientation_info["vertical"] = vertical_count
         if diagonal_count:
-            orientation_info['diagonal'] = diagonal_count
-        result['orientations'] = orientation_info
+            orientation_info["diagonal"] = diagonal_count
+        result["orientations"] = orientation_info
     # Colors
     if colors:
-        result['colors'] = dict(colors.most_common())
+        result["colors"] = dict(colors.most_common())
     return result
 def describe_region_elements(elements: List["Element"]) -> Dict[str, Any]:
     """
     Describe region elements with type and metadata analysis.
     Args:
         elements: List of region elements
     Returns:
         Dictionary with region analysis
     """
     if not elements:
         return {"message": "No region elements found"}
     result = {}
     # Region types
     types = Counter()
     sizes = []
     metadata_keys = set()
     for element in elements:
         # Type
-        region_type = getattr(element, 'type', 'unknown')
+        region_type = getattr(element, "type", "unknown")
         types[region_type] += 1
         # Size
-        width = getattr(element, 'width', 0)
-        height = getattr(element, 'height', 0)
+        width = getattr(element, "width", 0)
+        height = getattr(element, "height", 0)
         if width and height:
             sizes.append(width * height)
         # Metadata keys
-        if hasattr(element, 'metadata') and element.metadata:
+        if hasattr(element, "metadata") and element.metadata:
             metadata_keys.update(element.metadata.keys())
     # Type breakdown
     if types:
-        result['types'] = dict(types.most_common())
+        result["types"] = dict(types.most_common())
     # Size statistics
     if sizes:
-        result['size_stats'] = {
-            'min_area': f"{min(sizes):.0f} sq pts",
-            'max_area': f"{max(sizes):.0f} sq pts",
-            'avg_area': f"{sum(sizes)/len(sizes):.0f} sq pts"
+        result["size_stats"] = {
+            "min_area": f"{min(sizes):.0f} sq pts",
+            "max_area": f"{max(sizes):.0f} sq pts",
+            "avg_area": f"{sum(sizes)/len(sizes):.0f} sq pts",
         }
     # Metadata
     if metadata_keys:
-        result['metadata_keys'] = sorted(list(metadata_keys))
+        result["metadata_keys"] = sorted(list(metadata_keys))
     return result
@@ -279,131 +279,131 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
     """Analyze typography patterns in text elements."""
     fonts = Counter()
     sizes = Counter()
-    styles = {'bold': 0, 'italic': 0, 'strikeout': 0, 'underline': 0, 'highlight': 0}
+    styles = {"bold": 0, "italic": 0, "strikeout": 0, "underline": 0, "highlight": 0}
     colors = Counter()
     for element in elements:
         # Font family - use TextElement's font_family property for cleaner names
-        font_family = getattr(element, 'font_family', None)
-        fontname = getattr(element, 'fontname', 'Unknown')
+        font_family = getattr(element, "font_family", None)
+        fontname = getattr(element, "fontname", "Unknown")
         display_font = font_family if font_family and font_family != fontname else fontname
         if display_font:
             fonts[display_font] += 1
         # Size
-        size = getattr(element, 'size', None)
+        size = getattr(element, "size", None)
         if size:
             # Round to nearest 0.5
             rounded_size = round(size * 2) / 2
             sizes[f"{rounded_size}pt"] += 1
         # Styles
-        if getattr(element, 'bold', False):
-            styles['bold'] += 1
-        if getattr(element, 'italic', False):
-            styles['italic'] += 1
-        if getattr(element, 'strikeout', False):
-            styles['strikeout'] += 1
-        if getattr(element, 'underline', False):
-            styles['underline'] += 1
-        if getattr(element, 'highlight', False):
-            styles['highlight'] += 1
+        if getattr(element, "bold", False):
+            styles["bold"] += 1
+        if getattr(element, "italic", False):
+            styles["italic"] += 1
+        if getattr(element, "strikeout", False):
+            styles["strikeout"] += 1
+        if getattr(element, "underline", False):
+            styles["underline"] += 1
+        if getattr(element, "highlight", False):
+            styles["highlight"] += 1
         # Color - use TextElement's color property
-        color = getattr(element, 'color', None)
+        color = getattr(element, "color", None)
         if color:
             if isinstance(color, (tuple, list)):
                 if color == (0, 0, 0) or color == (0.0, 0.0, 0.0):
-                    colors['black'] += 1
+                    colors["black"] += 1
                 elif color == (1, 1, 1) or color == (1.0, 1.0, 1.0):
-                    colors['white'] += 1
+                    colors["white"] += 1
                 else:
-                    colors['other'] += 1
+                    colors["other"] += 1
             else:
                 colors[str(color)] += 1
     result = {}
     # Fonts
     if fonts:
-        result['fonts'] = dict(fonts.most_common(10))
+        result["fonts"] = dict(fonts.most_common(10))
     # Sizes (as horizontal table)
     if sizes:
-        result['sizes'] = dict(sizes.most_common())
+        result["sizes"] = dict(sizes.most_common())
     # Styles
     style_list = []
     for style, count in styles.items():
         if count > 0:
             style_list.append(f"{count} {style}")
     if style_list:
-        result['styles'] = ", ".join(style_list)
+        result["styles"] = ", ".join(style_list)
     # Colors
     if colors and len(colors) > 1:  # Only show if there are multiple colors
-        result['colors'] = dict(colors.most_common())
+        result["colors"] = dict(colors.most_common())
     return result
 def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
     """Analyze OCR quality metrics."""
     confidences = []
     for element in elements:
-        confidence = getattr(element, 'confidence', None)
+        confidence = getattr(element, "confidence", None)
         if confidence is not None:
             confidences.append(confidence)
     if not confidences:
         return {}
     result = {}
     # Basic stats
-    result['confidence_stats'] = {
-        'mean': f"{sum(confidences)/len(confidences):.2f}",
-        'min': f"{min(confidences):.2f}",
-        'max': f"{max(confidences):.2f}"
+    result["confidence_stats"] = {
+        "mean": f"{sum(confidences)/len(confidences):.2f}",
+        "min": f"{min(confidences):.2f}",
+        "max": f"{max(confidences):.2f}",
     }
     # Threshold analysis with ASCII bars
     thresholds = [
-        ('99%+', 0.99),
-        ('95%+', 0.95),
-        ('90%+', 0.90),
+        ("99%+", 0.99),
+        ("95%+", 0.95),
+        ("90%+", 0.90),
     ]
     element_count = len(elements)
     threshold_bars = {}
     for label, threshold in thresholds:
         count = sum(1 for c in confidences if c >= threshold)
         percentage = count / element_count
         # Create ASCII bar (40 characters wide)
         filled_chars = int(percentage * 40)
         empty_chars = 40 - filled_chars
-        bar = '█' * filled_chars + '░' * empty_chars
+        bar = "█" * filled_chars + "░" * empty_chars
         # Format: "95%+ (32/43) 74%: `████████████████████████████████░░░░░░░░`"
         threshold_bars[f"{label} ({count}/{element_count}) {percentage:.0%}"] = f"`{bar}`"
-    result['quality_distribution'] = threshold_bars
+    result["quality_distribution"] = threshold_bars
     # Show lowest quality items
     element_confidences = []
     for element in elements:
-        confidence = getattr(element, 'confidence', None)
+        confidence = getattr(element, "confidence", None)
         if confidence is not None:
             # Get text content for display
-            text = getattr(element, 'text', '').strip()
+            text = getattr(element, "text", "").strip()
             if text:
                 # Truncate long text
                 display_text = text[:60] + "..." if len(text) > 60 else text
                 element_confidences.append((confidence, display_text))
     if element_confidences:
         # Sort by confidence (lowest first) and take bottom 10
         lowest_quality = sorted(element_confidences, key=lambda x: x[0])[:10]
@@ -411,6 +411,6 @@ def _analyze_ocr_quality(elements: List["Element"]) -> Dict[str, Any]:
             lowest_items = {}
             for i, (confidence, text) in enumerate(lowest_quality, 1):
                 lowest_items[f"#{i}"] = f"**{confidence:.2f}**: {text}"
-            result['lowest_scoring'] = lowest_items
-    return result
+            result["lowest_scoring"] = lowest_items
+    return result

natural_pdf/describe/mixin.py CHANGED Viewed

@@ -11,52 +11,59 @@ if TYPE_CHECKING:
 class DescribeMixin:
     """
     Mixin providing describe functionality for pages, collections, and regions.
     Classes that inherit from this mixin get:
     - .describe() method for high-level summaries
     - .inspect() method for detailed tabular views (collections only)
     """
     def describe(self) -> "ElementSummary":
         """
         Describe this object with type-specific analysis.
         Returns:
             ElementSummary with analysis appropriate for the object type
         """
-        from natural_pdf.describe import describe_page, describe_collection, describe_region, describe_element
+        from natural_pdf.describe import (
+            describe_collection,
+            describe_element,
+            describe_page,
+            describe_region,
+        )
         # Determine the appropriate describe function based on class type
         class_name = self.__class__.__name__
-        if class_name == 'Page':
+        if class_name == "Page":
             return describe_page(self)
-        elif class_name == 'ElementCollection':
+        elif class_name == "ElementCollection":
             return describe_collection(self)
-        elif class_name == 'Region':
+        elif class_name == "Region":
             return describe_region(self)
         else:
             # Check if it's an individual element (inherits from Element base class)
             from natural_pdf.elements.base import Element
             if isinstance(self, Element):
                 return describe_element(self)
             # Fallback - try to determine based on available methods/attributes
-            if hasattr(self, 'get_elements') and hasattr(self, 'width') and hasattr(self, 'height'):
+            if hasattr(self, "get_elements") and hasattr(self, "width") and hasattr(self, "height"):
                 # Looks like a page or region
-                if hasattr(self, 'number'):
+                if hasattr(self, "number"):
                     return describe_page(self)  # Page
                 else:
                     return describe_region(self)  # Region
-            elif hasattr(self, '__iter__') and hasattr(self, '__len__'):
+            elif hasattr(self, "__iter__") and hasattr(self, "__len__"):
                 # Looks like a collection
                 return describe_collection(self)
             else:
                 # Unknown type - create a basic summary
                 from natural_pdf.describe.summary import ElementSummary
                 data = {
                     "object_type": class_name,
-                    "message": f"Describe not fully implemented for {class_name}"
+                    "message": f"Describe not fully implemented for {class_name}",
                 }
                 return ElementSummary(data, f"{class_name} Summary")
@@ -64,21 +71,22 @@ class DescribeMixin:
 class InspectMixin:
     """
     Mixin providing inspect functionality for collections.
     Classes that inherit from this mixin get:
     - .inspect() method for detailed tabular element views
     """
     def inspect(self, limit: int = 30) -> "InspectionSummary":
         """
         Inspect elements with detailed tabular view.
         Args:
             limit: Maximum elements per type to show (default: 30)
         Returns:
             InspectionSummary with element tables showing coordinates,
             properties, and other details for each element
         """
         from natural_pdf.describe import inspect_collection
-        return inspect_collection(self, limit=limit)
+        return inspect_collection(self, limit=limit)

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl