PyPI - natural-pdf - Versions diffs - 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl - Mend

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

natural_pdf/analyzers/__init__.py +2 -2
natural_pdf/analyzers/guides.py +751 -607
natural_pdf/analyzers/layout/base.py +53 -6
natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
natural_pdf/analyzers/layout/layout_manager.py +18 -14
natural_pdf/analyzers/layout/layout_options.py +1 -0
natural_pdf/analyzers/layout/paddle.py +102 -64
natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
natural_pdf/analyzers/layout/yolo.py +2 -6
natural_pdf/analyzers/shape_detection_mixin.py +15 -6
natural_pdf/classification/manager.py +92 -77
natural_pdf/classification/mixin.py +49 -5
natural_pdf/classification/results.py +1 -1
natural_pdf/cli.py +7 -3
natural_pdf/collections/pdf_collection.py +96 -101
natural_pdf/core/element_manager.py +131 -45
natural_pdf/core/highlighting_service.py +5 -6
natural_pdf/core/page.py +120 -23
natural_pdf/core/pdf.py +477 -75
natural_pdf/describe/__init__.py +18 -12
natural_pdf/describe/base.py +179 -172
natural_pdf/describe/elements.py +155 -155
natural_pdf/describe/mixin.py +27 -19
natural_pdf/describe/summary.py +44 -55
natural_pdf/elements/base.py +134 -18
natural_pdf/elements/collections.py +90 -18
natural_pdf/elements/image.py +2 -1
natural_pdf/elements/line.py +0 -31
natural_pdf/elements/rect.py +0 -14
natural_pdf/elements/region.py +222 -108
natural_pdf/elements/text.py +18 -12
natural_pdf/exporters/__init__.py +4 -1
natural_pdf/exporters/original_pdf.py +12 -4
natural_pdf/extraction/mixin.py +66 -10
natural_pdf/extraction/result.py +1 -1
natural_pdf/flows/flow.py +63 -4
natural_pdf/flows/region.py +4 -4
natural_pdf/ocr/engine.py +83 -2
natural_pdf/ocr/engine_paddle.py +5 -5
natural_pdf/ocr/ocr_factory.py +2 -1
natural_pdf/ocr/ocr_manager.py +24 -13
natural_pdf/ocr/ocr_options.py +3 -10
natural_pdf/qa/document_qa.py +21 -8
natural_pdf/qa/qa_result.py +3 -7
natural_pdf/search/__init__.py +3 -2
natural_pdf/search/lancedb_search_service.py +5 -6
natural_pdf/search/numpy_search_service.py +5 -2
natural_pdf/selectors/parser.py +51 -6
natural_pdf/tables/__init__.py +2 -2
natural_pdf/tables/result.py +7 -6
natural_pdf/utils/bidi_mirror.py +2 -1
natural_pdf/utils/reading_order.py +3 -2
natural_pdf/utils/visualization.py +3 -3
natural_pdf/widgets/viewer.py +0 -1
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
natural_pdf-0.1.35.dist-info/RECORD +121 -0
optimization/memory_comparison.py +73 -58
optimization/pdf_analyzer.py +141 -96
optimization/performance_analysis.py +111 -110
optimization/test_cleanup_methods.py +47 -36
optimization/test_memory_fix.py +40 -39
tools/bad_pdf_eval/__init__.py +0 -1
tools/bad_pdf_eval/analyser.py +35 -18
tools/bad_pdf_eval/collate_summaries.py +22 -18
tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
tools/bad_pdf_eval/eval_suite.py +21 -9
tools/bad_pdf_eval/evaluate_quality.py +198 -0
tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
tools/bad_pdf_eval/llm_enrich.py +71 -39
tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
tools/bad_pdf_eval/reporter.py +1 -1
tools/bad_pdf_eval/utils.py +7 -4
natural_pdf-0.1.33.dist-info/RECORD +0 -118
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
{natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0

natural_pdf/describe/base.py CHANGED Viewed

@@ -26,41 +26,41 @@ logger = logging.getLogger(__name__)
 def describe_page(page: "Page") -> ElementSummary:
     """
     Describe what's on a page with high-level summary.
     Args:
         page: Page to describe
     Returns:
         ElementSummary with page overview
     """
     data = {}
     # Get all elements
     all_elements = page.get_elements()
     if not all_elements:
         data["message"] = "No elements found on page"
         return ElementSummary(data, f"Page {page.number} Summary")
     # Element counts by type (exclude chars - too granular)
     type_counts = Counter()
     for element in all_elements:
-        element_type = getattr(element, 'type', 'unknown')
-        if element_type != 'char':  # Skip character elements
+        element_type = getattr(element, "type", "unknown")
+        if element_type != "char":  # Skip character elements
             type_counts[element_type] += 1
     # Format element counts as dictionary for proper list formatting
     element_summary = {}
     for element_type, count in type_counts.most_common():
-        type_display = element_type.replace('_', ' ').title()
-        if element_type == 'word':
+        type_display = element_type.replace("_", " ").title()
+        if element_type == "word":
             # Add source breakdown for text
-            text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']
+            text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
             sources = Counter()
             for elem in text_elements:
-                source = getattr(elem, 'source', 'unknown')
+                source = getattr(elem, "source", "unknown")
                 sources[source] += 1
             if len(sources) > 1:
                 source_parts = []
                 for source, source_count in sources.most_common():
@@ -70,86 +70,83 @@ def describe_page(page: "Page") -> ElementSummary:
                 element_summary["text"] = f"{count} elements"
         else:
             element_summary[element_type] = f"{count} elements"
     data["elements"] = element_summary
     # Text analysis if we have text elements (exclude chars - too granular)
-    text_elements = [e for e in all_elements if getattr(e, 'type', '') == 'word']
+    text_elements = [e for e in all_elements if getattr(e, "type", "") == "word"]
     if text_elements:
         text_analysis = describe_text_elements(text_elements)
-        if text_analysis and 'message' not in text_analysis:
+        if text_analysis and "message" not in text_analysis:
             data["text_analysis"] = text_analysis
     return ElementSummary(data, f"Page {page.number} Summary")
 def describe_collection(collection: "ElementCollection") -> ElementSummary:
     """
     Describe an element collection with type-specific analysis.
     Args:
         collection: ElementCollection to describe
     Returns:
         ElementSummary with collection analysis
     """
     elements = list(collection)
     if not elements:
         data = {"message": "Empty collection"}
         return ElementSummary(data, "Collection Summary")
     data = {}
     # Group elements by type
     by_type = {}
     for element in elements:
-        element_type = getattr(element, 'type', 'unknown')
+        element_type = getattr(element, "type", "unknown")
         by_type.setdefault(element_type, []).append(element)
     # Overall summary for mixed collections (exclude chars from overview)
     if len(by_type) > 1:
-        type_counts = {k: len(v) for k, v in by_type.items() if k != 'char'}
+        type_counts = {k: len(v) for k, v in by_type.items() if k != "char"}
         total = sum(type_counts.values())
         summary_parts = []
         for element_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
-            type_display = element_type.replace('_', ' ').title()
+            type_display = element_type.replace("_", " ").title()
             summary_parts.append(f"**{type_display}**: {count}")
         if summary_parts:  # Only add overview if we have non-char elements
-            data["overview"] = {
-                "total_elements": total,
-                "type_breakdown": summary_parts
-            }
+            data["overview"] = {"total_elements": total, "type_breakdown": summary_parts}
     # Type-specific analysis (exclude chars - too granular)
     for element_type, type_elements in by_type.items():
-        if element_type == 'char':
+        if element_type == "char":
             # Skip character elements - too granular for useful analysis
             continue
-        elif element_type == 'word':
+        elif element_type == "word":
             analysis = describe_text_elements(type_elements)
-        elif element_type == 'rect':
+        elif element_type == "rect":
             analysis = describe_rect_elements(type_elements)
-        elif element_type == 'line':
+        elif element_type == "line":
             analysis = describe_line_elements(type_elements)
-        elif element_type == 'region':
+        elif element_type == "region":
             analysis = describe_region_elements(type_elements)
         else:
             analysis = {"count": len(type_elements)}
-        if analysis and 'message' not in analysis:
-            section_name = element_type.replace('_', ' ').title()
+        if analysis and "message" not in analysis:
+            section_name = element_type.replace("_", " ").title()
             if len(by_type) == 1:
                 # Single type collection - flatten the structure
                 data.update(analysis)
             else:
                 # Mixed collection - keep sections separate
                 data[section_name] = analysis
     # Count non-char elements for title
-    non_char_count = len([e for e in elements if getattr(e, 'type', 'unknown') != 'char'])
+    non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
     title = f"Collection Summary ({non_char_count} elements)"
     return ElementSummary(data, title)
@@ -157,29 +154,29 @@ def describe_collection(collection: "ElementCollection") -> ElementSummary:
 def describe_region(region: "Region") -> ElementSummary:
     """
     Describe a region with its properties and contents.
     Args:
         region: Region to describe
     Returns:
         ElementSummary with region analysis
     """
     data = {}
     # Region info
     region_info = {
         "page": region.page.number,
         "dimensions": f"{region.width:.0f}×{region.height:.0f} pts",
         "area": f"{region.width * region.height:.0f} sq pts",
-        "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})"
+        "position": f"({region.x0:.0f}, {region.top:.0f}) to ({region.x1:.0f}, {region.bottom:.0f})",
     }
     # Add metadata if available
-    if hasattr(region, 'metadata') and region.metadata:
+    if hasattr(region, "metadata") and region.metadata:
         region_info["metadata"] = region.metadata
     data["region_info"] = region_info
     # Content analysis
     content_elements = region.find_all("*")
     if content_elements:
@@ -188,54 +185,54 @@ def describe_region(region: "Region") -> ElementSummary:
         data["content"] = content_analysis.to_dict()
     else:
         data["content"] = {"message": "No elements found in region"}
     return ElementSummary(data, "Region Summary")
 def inspect_collection(collection: "ElementCollection", limit: int = 30) -> InspectionSummary:
     """
     Inspect elements in a collection with detailed tabular view.
     Args:
         collection: ElementCollection to inspect
         limit: Maximum elements per type to show (default: 30)
     Returns:
         InspectionSummary with element tables
     """
     elements = list(collection)
     if not elements:
         data = {"message": "Empty collection"}
         return InspectionSummary(data, "Collection Inspection")
     data = {}
     # Check if multi-page
     pages = set()
     for element in elements:
-        if hasattr(element, 'page') and hasattr(element.page, 'number'):
+        if hasattr(element, "page") and hasattr(element.page, "number"):
             pages.add(element.page.number)
     show_page_column = len(pages) > 1
     # Group by type
     by_type = {}
     for element in elements:
-        element_type = getattr(element, 'type', 'unknown')
+        element_type = getattr(element, "type", "unknown")
         by_type.setdefault(element_type, []).append(element)
     # Create tables for each type (exclude chars - too granular)
     for element_type, type_elements in by_type.items():
-        if element_type == 'char':
+        if element_type == "char":
             # Skip character elements - too granular for useful inspection
             continue
         # Limit elements shown
         display_elements = type_elements[:limit]
         # Get appropriate columns for this type
         columns = _get_columns_for_type(element_type, show_page_column)
         # Extract data for each element
         element_data = []
         for element in display_elements:
@@ -244,110 +241,113 @@ def inspect_collection(collection: "ElementCollection", limit: int = 30) -> Insp
                 value = _extract_element_value(element, col)
                 row[col] = value
             element_data.append(row)
         # Create section
         section_name = f"{element_type}_elements"
-        section_data = {
-            "elements": element_data,
-            "columns": columns
-        }
+        section_data = {"elements": element_data, "columns": columns}
         # Add note if truncated
         if len(type_elements) > limit:
-            section_data["note"] = f"Showing {limit} of {len(type_elements)} elements (pass limit= to see more)"
+            section_data["note"] = (
+                f"Showing {limit} of {len(type_elements)} elements (pass limit= to see more)"
+            )
         data[section_name] = section_data
     # Count non-char elements for title
-    non_char_count = len([e for e in elements if getattr(e, 'type', 'unknown') != 'char'])
+    non_char_count = len([e for e in elements if getattr(e, "type", "unknown") != "char"])
     title = f"Collection Inspection ({non_char_count} elements)"
     return InspectionSummary(data, title)
 def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str]:
     """Get appropriate columns for element type."""
-    base_columns = ['x0', 'top', 'x1', 'bottom']
-    if element_type == 'word':
-        columns = ['text'] + base_columns + [
-            'font_family',
-            'font_variant',
-            'size',
-            'bold',
-            'italic',
-            'strike',
-            'underline',
-            'highlight',
-            'source',
-            'confidence',
-        ]
+    base_columns = ["x0", "top", "x1", "bottom"]
+    if element_type == "word":
+        columns = (
+            ["text"]
+            + base_columns
+            + [
+                "font_family",
+                "font_variant",
+                "size",
+                "bold",
+                "italic",
+                "strike",
+                "underline",
+                "highlight",
+                "source",
+                "confidence",
+            ]
+        )
         # Add foreground text colour too
-        columns.append('color')
-    elif element_type == 'rect':
-        columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
-    elif element_type == 'line':
-        columns = base_columns + ['width', 'is_horizontal', 'is_vertical']  # LineElement properties
-    elif element_type == 'region':
-        columns = base_columns + ['width', 'height', 'type', 'color']
-    elif element_type == 'blob':
-        columns = base_columns + ['width', 'height', 'color']
+        columns.append("color")
+    elif element_type == "rect":
+        columns = base_columns + ["width", "height", "stroke", "fill", "stroke_width"]
+    elif element_type == "line":
+        columns = base_columns + ["width", "is_horizontal", "is_vertical"]  # LineElement properties
+    elif element_type == "region":
+        columns = base_columns + ["width", "height", "type", "color"]
+    elif element_type == "blob":
+        columns = base_columns + ["width", "height", "color"]
     else:
-        columns = base_columns + ['type']
+        columns = base_columns + ["type"]
     if show_page_column:
-        columns.append('page')
+        columns.append("page")
     return columns
 def _extract_element_value(element: "Element", column: str) -> Any:
     """Extract value for a column from an element."""
     try:
-        if column == 'text':
-            text = getattr(element, 'text', '')
+        if column == "text":
+            text = getattr(element, "text", "")
             if text and len(text) > 60:
                 return text[:60] + "..."
             return text or ""
-        elif column == 'page':
-            if hasattr(element, 'page') and hasattr(element.page, 'number'):
+        elif column == "page":
+            if hasattr(element, "page") and hasattr(element.page, "number"):
                 return element.page.number
             return ""
-        elif column == 'confidence':
-            confidence = getattr(element, 'confidence', None)
+        elif column == "confidence":
+            confidence = getattr(element, "confidence", None)
             if confidence is not None and isinstance(confidence, (int, float)):
                 return f"{confidence:.2f}"
             return ""
-        elif column == 'font_family':
+        elif column == "font_family":
             # Use the cleaner font_family property from TextElement
-            font_family = getattr(element, 'font_family', None)
+            font_family = getattr(element, "font_family", None)
             if font_family:
                 return font_family
             # Fallback to fontname
-            return getattr(element, 'fontname', '')
-        elif column == 'font_variant':
-            variant = getattr(element, 'font_variant', None)
+            return getattr(element, "fontname", "")
+        elif column == "font_variant":
+            variant = getattr(element, "font_variant", None)
             if variant:
                 return variant
             # Fallback – try to derive from fontname if property missing
-            fontname = getattr(element, 'fontname', '')
+            fontname = getattr(element, "fontname", "")
             if "+" in fontname:
                 return fontname.split("+", 1)[0]
-            return ''
-        elif column in ['bold', 'italic', 'strike', 'underline']:
+            return ""
+        elif column in ["bold", "italic", "strike", "underline"]:
             value = getattr(element, column, False)
             return value if isinstance(value, bool) else False
-        elif column == 'highlight':
+        elif column == "highlight":
             # If element is highlighted, return its colour; otherwise blank
-            if getattr(element, 'highlight', False):
-                col_val = getattr(element, 'highlight_color', None)
+            if getattr(element, "highlight", False):
+                col_val = getattr(element, "highlight_color", None)
                 if col_val is None:
-                    return 'True'  # fallback if colour missing
+                    return "True"  # fallback if colour missing
                 # Convert tuple to hex
                 if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
                     try:
@@ -356,9 +356,9 @@ def _extract_element_value(element: "Element", column: str) -> Any:
                     except Exception:
                         return str(col_val)
                 return str(col_val)
-            return ''
-        elif column in ['stroke', 'fill', 'color']:
+            return ""
+        elif column in ["stroke", "fill", "color"]:
             value = getattr(element, column, None)
             # If already a string (e.g. '#ff00aa' or 'red') return as is
             if isinstance(value, str):
@@ -371,24 +371,24 @@ def _extract_element_value(element: "Element", column: str) -> Any:
                 except Exception:
                     return str(value)
             return ""
-        elif column in ['x0', 'top', 'x1', 'bottom', 'width', 'height', 'size', 'stroke_width']:
+        elif column in ["x0", "top", "x1", "bottom", "width", "height", "size", "stroke_width"]:
             value = getattr(element, column, 0)
             if isinstance(value, (int, float)) and not isinstance(value, bool):
                 return int(round(value))
             return 0
-        elif column in ['is_horizontal', 'is_vertical']:
+        elif column in ["is_horizontal", "is_vertical"]:
             value = getattr(element, column, False)
             return value if isinstance(value, bool) else False
         else:
             # Generic attribute access
-            value = getattr(element, column, '')
+            value = getattr(element, column, "")
             if value is None:
                 return ""
             return str(value)
     except Exception as e:
         # Fallback for any unexpected errors
         logger.warning(f"Error extracting {column} from element: {e}")
@@ -398,64 +398,71 @@ def _extract_element_value(element: "Element", column: str) -> Any:
 def describe_element(element: "Element") -> "ElementSummary":
     """
     Describe an individual element with its properties and attributes.
     Args:
         element: The element to describe
     Returns:
         ElementSummary with formatted element properties
     """
     from natural_pdf.describe.summary import ElementSummary
     # Get basic element info
-    element_type = getattr(element, 'type', element.__class__.__name__)
+    element_type = getattr(element, "type", element.__class__.__name__)
     # Build the description data - use dict structure for proper list formatting
     data = {
         "info": {
             "object_type": "element",
             "element_type": element_type,
-            "class_name": element.__class__.__name__
+            "class_name": element.__class__.__name__,
         }
     }
     # Add geometric properties - use dict structure for proper list formatting
-    if hasattr(element, 'bbox'):
+    if hasattr(element, "bbox"):
         data["geometry"] = {
             "position": f"({round(element.x0, 1)}, {round(element.top, 1)}, {round(element.x1, 1)}, {round(element.bottom, 1)})",
-            "size": f"({round(element.width, 1)}, {round(element.height, 1)})"
+            "size": f"({round(element.width, 1)}, {round(element.height, 1)})",
         }
     # Add text content if available - use dict structure for proper list formatting
-    if hasattr(element, 'text') and element.text:
+    if hasattr(element, "text") and element.text:
         text = str(element.text).strip()
         display_text = text[:50] + "..." if len(text) > 50 else text
-        data["content"] = {
-            "text": f"'{display_text}'",
-            "length": f"{len(text)} chars"
-        }
+        data["content"] = {"text": f"'{display_text}'", "length": f"{len(text)} chars"}
     # Add common text properties - use dict structure for proper list formatting
     text_props = {}
-    for prop in ['font_family', 'size', 'bold', 'italic', 'strike', 'underline', 'highlight', 'source', 'confidence']:
+    for prop in [
+        "font_family",
+        "size",
+        "bold",
+        "italic",
+        "strike",
+        "underline",
+        "highlight",
+        "source",
+        "confidence",
+    ]:
         if hasattr(element, prop):
             value = getattr(element, prop)
             if value is not None:
-                if prop == 'confidence' and isinstance(value, (int, float)):
+                if prop == "confidence" and isinstance(value, (int, float)):
                     text_props[prop] = round(value, 3)
-                elif prop == 'size' and isinstance(value, (int, float)):
+                elif prop == "size" and isinstance(value, (int, float)):
                     text_props[prop] = round(value, 1)
-                elif prop in ['bold', 'italic', 'strike', 'underline']:
+                elif prop in ["bold", "italic", "strike", "underline"]:
                     text_props[prop] = value
                 else:
                     text_props[prop] = value
     if text_props:
         data["properties"] = text_props
     # Add color information - use dict structure for proper list formatting
     color_info = {}
-    for prop in ['color', 'fill', 'stroke']:
+    for prop in ["color", "fill", "stroke"]:
         if hasattr(element, prop):
             value = getattr(element, prop)
             if value is not None:
@@ -471,28 +478,28 @@ def describe_element(element: "Element") -> "ElementSummary":
                         color_info[prop] = str(value)
                 else:
                     color_info[prop] = str(value)
     if color_info:
         data["colors"] = color_info
     # Add page information - use dict structure for proper list formatting
-    if hasattr(element, 'page') and element.page:
-        page_num = getattr(element.page, 'number', None)
+    if hasattr(element, "page") and element.page:
+        page_num = getattr(element.page, "number", None)
         if page_num is not None:
             data["page"] = {"number": page_num}
     # Add polygon information if available - use dict structure for proper list formatting
-    if hasattr(element, 'has_polygon') and element.has_polygon:
-        if hasattr(element, 'polygon'):
+    if hasattr(element, "has_polygon") and element.has_polygon:
+        if hasattr(element, "polygon"):
             polygon = element.polygon
             if polygon and len(polygon) > 0:
                 data["shape"] = {"polygon_points": len(polygon)}
     # Create title
     title = f"{element_type.title()} Element"
-    if hasattr(element, 'text') and element.text:
+    if hasattr(element, "text") and element.text:
         preview = str(element.text).strip()[:30]
         if preview:
             title += f": '{preview}'"
-    return ElementSummary(data, title)
+    return ElementSummary(data, title)

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl