PyPI - natural-pdf - Versions diffs - 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl - Mend

natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

bad_pdf_analysis/analyze_10_more.py +300 -0
bad_pdf_analysis/analyze_final_10.py +552 -0
bad_pdf_analysis/analyze_specific_pages.py +394 -0
bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
natural_pdf/analyzers/layout/layout_manager.py +44 -0
natural_pdf/analyzers/layout/surya.py +1 -1
natural_pdf/analyzers/shape_detection_mixin.py +228 -0
natural_pdf/classification/manager.py +67 -0
natural_pdf/core/element_manager.py +578 -27
natural_pdf/core/highlighting_service.py +98 -43
natural_pdf/core/page.py +86 -20
natural_pdf/core/pdf.py +0 -2
natural_pdf/describe/base.py +40 -9
natural_pdf/describe/elements.py +11 -6
natural_pdf/elements/base.py +134 -20
natural_pdf/elements/collections.py +43 -11
natural_pdf/elements/image.py +43 -0
natural_pdf/elements/region.py +64 -19
natural_pdf/elements/text.py +118 -11
natural_pdf/flows/collections.py +4 -4
natural_pdf/flows/region.py +17 -2
natural_pdf/ocr/ocr_manager.py +50 -0
natural_pdf/selectors/parser.py +27 -7
natural_pdf/tables/__init__.py +5 -0
natural_pdf/tables/result.py +101 -0
natural_pdf/utils/bidi_mirror.py +36 -0
natural_pdf/utils/visualization.py +15 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
optimization/memory_comparison.py +172 -0
optimization/pdf_analyzer.py +410 -0
optimization/performance_analysis.py +397 -0
optimization/test_cleanup_methods.py +155 -0
optimization/test_memory_fix.py +162 -0
tools/bad_pdf_eval/__init__.py +1 -0
tools/bad_pdf_eval/analyser.py +302 -0
tools/bad_pdf_eval/collate_summaries.py +130 -0
tools/bad_pdf_eval/eval_suite.py +116 -0
tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
tools/bad_pdf_eval/llm_enrich.py +273 -0
tools/bad_pdf_eval/reporter.py +17 -0
tools/bad_pdf_eval/utils.py +127 -0
tools/rtl_smoke_test.py +80 -0
natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
{natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0

optimization/performance_analysis.py ADDED Viewed

@@ -0,0 +1,397 @@
+#!/usr/bin/env python3
+"""
+Natural PDF Performance Analysis Micro-Suite
+This script analyzes memory usage and performance characteristics of Natural PDF
+operations using real large PDFs to inform memory management decisions.
+"""
+import gc
+import json
+import os
+import psutil
+import sys
+import time
+import tracemalloc
+from dataclasses import dataclass, asdict
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Callable
+import pandas as pd
+import matplotlib.pyplot as plt
+import natural_pdf as npdf
+@dataclass
+class MemorySnapshot:
+    """Snapshot of memory usage at a point in time.
+
+    Instances are plain records: they are built by the profiler's
+    ``take_snapshot`` and later flattened with ``dataclasses.asdict`` for
+    JSON/CSV export.
+    """
+    # Seconds elapsed since the profiler that took the snapshot was created.
+    timestamp: float
+    rss_mb: float  # Resident Set Size, bytes divided by 1024 twice
+    vms_mb: float  # Virtual Memory Size, bytes divided by 1024 twice
+    # Number of objects currently tracked by the garbage collector.
+    python_objects: int
+    # Label of the step that was just measured, e.g. "after_load".
+    operation: str
+    # 1-based number of the page being processed; 0 when not page-specific.
+    page_count: int
+    # Stem of the PDF file name, or "" when no PDF is associated.
+    pdf_name: str
+    # Free-form extra keyword arguments supplied by the caller of take_snapshot.
+    additional_info: Dict[str, Any]
+class PerformanceProfiler:
+    """Profiles memory usage and performance of Natural PDF operations"""
+    def __init__(self, output_dir: str = "performance_results"):
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(exist_ok=True)
+        self.snapshots: List[MemorySnapshot] = []
+        self.process = psutil.Process()
+        self.start_time = time.time()
+        # Start tracemalloc for detailed Python memory tracking
+        tracemalloc.start()
+    def take_snapshot(self, operation: str, page_count: int = 0,
+                     pdf_name: str = "", **additional_info):
+        """Take a memory usage snapshot"""
+        gc.collect()  # Force garbage collection for accurate measurement
+        memory_info = self.process.memory_info()
+        python_objects = len(gc.get_objects())
+        snapshot = MemorySnapshot(
+            timestamp=time.time() - self.start_time,
+            rss_mb=memory_info.rss / 1024 / 1024,
+            vms_mb=memory_info.vms / 1024 / 1024,
+            python_objects=python_objects,
+            operation=operation,
+            page_count=page_count,
+            pdf_name=pdf_name,
+            additional_info=additional_info
+        )
+        self.snapshots.append(snapshot)
+        print(f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects")
+    def save_results(self, test_name: str):
+        """Save results to JSON and CSV"""
+        # Convert to list of dicts for JSON serialization
+        data = [asdict(s) for s in self.snapshots]
+        # Save JSON
+        json_path = self.output_dir / f"{test_name}_snapshots.json"
+        with open(json_path, 'w') as f:
+            json.dump(data, f, indent=2)
+        # Save CSV for easy analysis
+        df = pd.DataFrame(data)
+        csv_path = self.output_dir / f"{test_name}_snapshots.csv"
+        df.to_csv(csv_path, index=False)
+        print(f"Results saved to {json_path} and {csv_path}")
+        return df
+class PDFPerformanceTester:
+    """Tests specific PDF operations and measures their performance"""
+    def __init__(self, pdf_path: str, profiler: PerformanceProfiler):
+        self.pdf_path = Path(pdf_path)
+        self.pdf_name = self.pdf_path.stem
+        self.profiler = profiler
+        self.pdf = None
+    def test_load_pdf(self):
+        """Test just loading the PDF"""
+        self.profiler.take_snapshot("before_load", pdf_name=self.pdf_name)
+        self.pdf = npdf.PDF(str(self.pdf_path))
+        self.profiler.take_snapshot("after_load", pdf_name=self.pdf_name,
+                                  total_pages=len(self.pdf.pages))
+    def test_page_access(self, max_pages: int = 10):
+        """Test accessing pages sequentially"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            # Just access the page to trigger lazy loading
+            _ = page.width, page.height
+            self.profiler.take_snapshot(
+                f"page_access_{i+1}",
+                page_count=i+1,
+                pdf_name=self.pdf_name,
+                page_width=page.width,
+                page_height=page.height
+            )
+    def test_describe_pages(self, max_pages: int = 5):
+        """Test using .describe() on pages"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            # Use describe to understand page content
+            try:
+                description = page.describe()
+                self.profiler.take_snapshot(
+                    f"describe_{i+1}",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    description_length=len(description) if description else 0
+                )
+            except Exception as e:
+                self.profiler.take_snapshot(
+                    f"describe_{i+1}_error",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    error=str(e)
+                )
+    def test_element_collections(self, max_pages: int = 5):
+        """Test find_all operations that create element collections"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            # Test different element collection operations
+            operations = [
+                ("words", lambda p: p.find_all("words")),
+                ("text_elements", lambda p: p.find_all("text")),
+                ("rects", lambda p: p.find_all("rect")),
+                ("large_text", lambda p: p.find_all("text[size>12]")),
+            ]
+            for op_name, operation in operations:
+                try:
+                    elements = operation(page)
+                    element_count = len(elements) if elements else 0
+                    self.profiler.take_snapshot(
+                        f"{op_name}_{i+1}",
+                        page_count=i+1,
+                        pdf_name=self.pdf_name,
+                        operation_type=op_name,
+                        element_count=element_count
+                    )
+                except Exception as e:
+                    self.profiler.take_snapshot(
+                        f"{op_name}_{i+1}_error",
+                        page_count=i+1,
+                        pdf_name=self.pdf_name,
+                        operation_type=op_name,
+                        error=str(e)
+                    )
+    def test_image_generation(self, max_pages: int = 3, resolutions: List[int] = [72, 144, 216]):
+        """Test image generation at different resolutions"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            for resolution in resolutions:
+                try:
+                    img = page.to_image(resolution=resolution)
+                    self.profiler.take_snapshot(
+                        f"image_{resolution}dpi_{i+1}",
+                        page_count=i+1,
+                        pdf_name=self.pdf_name,
+                        resolution=resolution,
+                        image_size=f"{img.width}x{img.height}" if img else "None"
+                    )
+                    # Clean up image immediately to test memory release
+                    del img
+                except Exception as e:
+                    self.profiler.take_snapshot(
+                        f"image_{resolution}dpi_{i+1}_error",
+                        page_count=i+1,
+                        pdf_name=self.pdf_name,
+                        resolution=resolution,
+                        error=str(e)
+                    )
+    def test_ocr(self, max_pages: int = 2):
+        """Test OCR operations (expensive!)"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            try:
+                # Run OCR
+                page.apply_ocr(engine="easyocr")  # Default engine
+                self.profiler.take_snapshot(
+                    f"ocr_{i+1}",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    operation_type="ocr"
+                )
+            except Exception as e:
+                self.profiler.take_snapshot(
+                    f"ocr_{i+1}_error",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    operation_type="ocr",
+                    error=str(e)
+                )
+    def test_layout_analysis(self, max_pages: int = 3):
+        """Test layout analysis operations"""
+        if not self.pdf:
+            self.test_load_pdf()
+        pages_to_test = min(max_pages, len(self.pdf.pages))
+        for i in range(pages_to_test):
+            page = self.pdf.pages[i]
+            try:
+                # Run layout analysis
+                layout_result = page.analyze_layout()
+                self.profiler.take_snapshot(
+                    f"layout_{i+1}",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    operation_type="layout",
+                    layout_regions=len(layout_result) if layout_result else 0
+                )
+            except Exception as e:
+                self.profiler.take_snapshot(
+                    f"layout_{i+1}_error",
+                    page_count=i+1,
+                    pdf_name=self.pdf_name,
+                    operation_type="layout",
+                    error=str(e)
+                )
+def run_comprehensive_test(pdf_path: str, test_name: str):
+    """Run a comprehensive test suite on a PDF"""
+    print(f"\n{'='*60}")
+    print(f"COMPREHENSIVE TEST: {test_name}")
+    print(f"PDF: {pdf_path}")
+    print(f"{'='*60}")
+    profiler = PerformanceProfiler()
+    tester = PDFPerformanceTester(pdf_path, profiler)
+    # Initial baseline
+    profiler.take_snapshot("baseline_start", pdf_name=Path(pdf_path).stem)
+    # Test sequence
+    print("\n1. Testing PDF Load...")
+    tester.test_load_pdf()
+    print("\n2. Testing Page Access...")
+    tester.test_page_access(max_pages=10)
+    print("\n3. Testing Describe Operations...")
+    tester.test_describe_pages(max_pages=5)
+    print("\n4. Testing Element Collections...")
+    tester.test_element_collections(max_pages=5)
+    print("\n5. Testing Image Generation...")
+    tester.test_image_generation(max_pages=3)
+    print("\n6. Testing Layout Analysis...")
+    tester.test_layout_analysis(max_pages=3)
+    # OCR test (only for image-heavy PDFs)
+    if "OCR" in pdf_path or "image" in test_name.lower():
+        print("\n7. Testing OCR (Image-heavy PDF)...")
+        tester.test_ocr(max_pages=2)
+    # Final snapshot
+    profiler.take_snapshot("test_complete", pdf_name=Path(pdf_path).stem)
+    # Save results
+    df = profiler.save_results(test_name)
+    # Quick analysis
+    print(f"\n{'-'*40}")
+    print("QUICK ANALYSIS:")
+    print(f"Peak Memory: {df['rss_mb'].max():.1f} MB")
+    print(f"Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
+    print(f"Peak Objects: {df['python_objects'].max():,}")
+    print(f"Total Time: {df['timestamp'].iloc[-1]:.1f} seconds")
+    return df
+def main():
+    """Main test runner"""
+    print("Natural PDF Performance Analysis Micro-Suite")
+    print("=" * 50)
+    # Find test PDFs
+    large_pdfs_dir = Path("pdfs/hidden/large")
+    if not large_pdfs_dir.exists():
+        print(f"Error: {large_pdfs_dir} not found")
+        print("Please ensure large test PDFs are available")
+        return
+    # Expected test PDFs
+    test_pdfs = {
+        "text_heavy": large_pdfs_dir / "appendix_fy2026.pdf",
+        "image_heavy": large_pdfs_dir / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf"
+    }
+    results = {}
+    for test_name, pdf_path in test_pdfs.items():
+        if pdf_path.exists():
+            try:
+                results[test_name] = run_comprehensive_test(str(pdf_path), test_name)
+            except Exception as e:
+                print(f"Error testing {test_name}: {e}")
+                traceback.print_exc()
+        else:
+            print(f"Warning: {pdf_path} not found, skipping {test_name} test")
+    # Generate comparison report
+    if results:
+        print(f"\n{'='*60}")
+        print("COMPARISON SUMMARY")
+        print(f"{'='*60}")
+        for test_name, df in results.items():
+            print(f"\n{test_name.upper()}:")
+            print(f"  Peak Memory: {df['rss_mb'].max():.1f} MB")
+            print(f"  Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
+            print(f"  Peak Objects: {df['python_objects'].max():,}")
+            print(f"  Duration: {df['timestamp'].iloc[-1]:.1f}s")
+        print(f"\nResults saved to performance_results/ directory")
+        print("Use the CSV files for detailed analysis")
+if __name__ == "__main__":
+    main()

optimization/test_cleanup_methods.py ADDED Viewed

@@ -0,0 +1,155 @@
+#!/usr/bin/env python3
+"""
+Test script to verify the new cleanup methods work correctly.
+This test verifies that:
+1. Cleanup methods exist and are callable
+2. They handle edge cases gracefully (empty caches, missing engines)
+3. They actually clean up loaded models/engines
+"""
+import gc
+import os
+import sys
+from pathlib import Path
+import pytest
+import natural_pdf as npdf
+from natural_pdf.ocr.ocr_manager import OCRManager
+from natural_pdf.analyzers.layout.layout_manager import LayoutManager
+from natural_pdf.classification.manager import ClassificationManager
+class TestCleanupMethods:
+    """Test suite for manager cleanup methods"""
+    def test_ocr_manager_cleanup_empty(self):
+        """Test OCR manager cleanup when no engines are loaded"""
+        manager = OCRManager()
+        # Test cleanup when nothing is loaded
+        count = manager.cleanup_engine()
+        assert count == 0, "Should return 0 when no engines loaded"
+        # Test cleanup of specific non-existent engine
+        count = manager.cleanup_engine("nonexistent")
+        assert count == 0, "Should return 0 when engine doesn't exist"
+    def test_layout_manager_cleanup_empty(self):
+        """Test Layout manager cleanup when no detectors are loaded"""
+        manager = LayoutManager()
+        # Test cleanup when nothing is loaded
+        count = manager.cleanup_detector()
+        assert count == 0, "Should return 0 when no detectors loaded"
+        # Test cleanup of specific non-existent detector
+        count = manager.cleanup_detector("nonexistent")
+        assert count == 0, "Should return 0 when detector doesn't exist"
+    def test_classification_manager_cleanup_empty(self):
+        """Test Classification manager cleanup when no models are loaded"""
+        try:
+            manager = ClassificationManager()
+            # Test cleanup when nothing is loaded
+            count = manager.cleanup_models()
+            assert count == 0, "Should return 0 when no models loaded"
+            # Test cleanup of specific non-existent model
+            count = manager.cleanup_models("nonexistent/model")
+            assert count == 0, "Should return 0 when model doesn't exist"
+        except ImportError:
+            pytest.skip("Classification dependencies not available")
+    def test_ocr_manager_cleanup_with_engine(self):
+        """Test OCR manager cleanup after loading an engine"""
+        manager = OCRManager()
+        # Check if any OCR engines are available
+        available_engines = manager.get_available_engines()
+        if not available_engines:
+            pytest.skip("No OCR engines available for testing")
+        engine_name = available_engines[0]
+        print(f"Testing with OCR engine: {engine_name}")
+        # Load an engine by accessing it
+        try:
+            engine_instance = manager._get_engine_instance(engine_name)
+            assert engine_name in manager._engine_instances, "Engine should be cached"
+            # Test cleanup of specific engine
+            count = manager.cleanup_engine(engine_name)
+            assert count == 1, f"Should return 1 after cleaning up {engine_name}"
+            assert engine_name not in manager._engine_instances, "Engine should be removed from cache"
+        except Exception as e:
+            pytest.skip(f"Could not load {engine_name} engine: {e}")
+    def test_layout_manager_cleanup_with_detector(self):
+        """Test Layout manager cleanup after loading a detector"""
+        manager = LayoutManager()
+        # Check if any layout engines are available
+        available_engines = manager.get_available_engines()
+        if not available_engines:
+            pytest.skip("No layout engines available for testing")
+        engine_name = available_engines[0]
+        print(f"Testing with layout engine: {engine_name}")
+        # Load a detector by accessing it
+        try:
+            detector_instance = manager._get_engine_instance(engine_name)
+            assert engine_name in manager._detector_instances, "Detector should be cached"
+            # Test cleanup of specific detector
+            count = manager.cleanup_detector(engine_name)
+            assert count == 1, f"Should return 1 after cleaning up {engine_name}"
+            assert engine_name not in manager._detector_instances, "Detector should be removed from cache"
+        except Exception as e:
+            pytest.skip(f"Could not load {engine_name} detector: {e}")
+    def test_methods_exist(self):
+        """Test that all cleanup methods exist and are callable"""
+        # Test OCRManager
+        manager = OCRManager()
+        assert hasattr(manager, 'cleanup_engine'), "OCRManager should have cleanup_engine method"
+        assert callable(manager.cleanup_engine), "cleanup_engine should be callable"
+        # Test LayoutManager
+        layout_manager = LayoutManager()
+        assert hasattr(layout_manager, 'cleanup_detector'), "LayoutManager should have cleanup_detector method"
+        assert callable(layout_manager.cleanup_detector), "cleanup_detector should be callable"
+        # Test ClassificationManager (if available)
+        try:
+            classification_manager = ClassificationManager()
+            assert hasattr(classification_manager, 'cleanup_models'), "ClassificationManager should have cleanup_models method"
+            assert callable(classification_manager.cleanup_models), "cleanup_models should be callable"
+        except ImportError:
+            print("Classification dependencies not available, skipping ClassificationManager test")
+def main():
+    """Run the cleanup method tests"""
+    print("Testing manager cleanup methods...")
+    # Run pytest on just this file
+    exit_code = pytest.main([__file__, "-v", "-s"])
+    if exit_code == 0:
+        print("\n✅ All cleanup method tests passed!")
+        print("The memory management methods are working correctly.")
+    else:
+        print("\n❌ Some tests failed!")
+        print("The cleanup methods need investigation.")
+    return exit_code
+if __name__ == "__main__":
+    exit(main())

natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl