natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
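
For orientation, a listing like the one below can be reproduced locally by unpacking both wheels and diffing their members. A minimal sketch, assuming both wheels have already been downloaded (for example with `pip download natural-pdf==0.1.32 --no-deps -d wheels/`, and the same for 0.1.34); the local paths are illustrative, not part of the package:

```python
# Minimal sketch: diff the members of two locally downloaded wheels.
import difflib
import zipfile

OLD = "wheels/natural_pdf-0.1.32-py3-none-any.whl"  # illustrative paths
NEW = "wheels/natural_pdf-0.1.34-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())
    for name in sorted(old_names | new_names):
        old_text = old_whl.read(name).decode("utf-8", "replace").splitlines() if name in old_names else []
        new_text = new_whl.read(name).decode("utf-8", "replace").splitlines() if name in new_names else []
        diff = list(difflib.unified_diff(old_text, new_text, f"a/{name}", f"b/{name}", lineterm=""))
        if diff:
            print("\n".join(diff))
```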
@@ -9,15 +9,16 @@ operations using real large PDFs to inform memory management decisions.
  import gc
  import json
  import os
- import psutil
  import sys
  import time
  import tracemalloc
- from dataclasses import dataclass, asdict
+ from dataclasses import asdict, dataclass
  from pathlib import Path
- from typing import Dict, List, Optional, Any, Callable
- import pandas as pd
+ from typing import Any, Callable, Dict, List, Optional
+
  import matplotlib.pyplot as plt
+ import pandas as pd
+ import psutil

  import natural_pdf as npdf

@@ -25,6 +26,7 @@ import natural_pdf as npdf
  @dataclass
  class MemorySnapshot:
  """Snapshot of memory usage at a point in time"""
+
  timestamp: float
  rss_mb: float # Resident Set Size
  vms_mb: float # Virtual Memory Size
@@ -37,26 +39,27 @@ class MemorySnapshot:

  class PerformanceProfiler:
  """Profiles memory usage and performance of Natural PDF operations"""
-
+
  def __init__(self, output_dir: str = "performance_results"):
  self.output_dir = Path(output_dir)
  self.output_dir.mkdir(exist_ok=True)
-
+
  self.snapshots: List[MemorySnapshot] = []
  self.process = psutil.Process()
  self.start_time = time.time()
-
+
  # Start tracemalloc for detailed Python memory tracking
  tracemalloc.start()
-
- def take_snapshot(self, operation: str, page_count: int = 0,
- pdf_name: str = "", **additional_info):
+
+ def take_snapshot(
+ self, operation: str, page_count: int = 0, pdf_name: str = "", **additional_info
+ ):
  """Take a memory usage snapshot"""
  gc.collect() # Force garbage collection for accurate measurement
-
+
  memory_info = self.process.memory_info()
  python_objects = len(gc.get_objects())
-
+
  snapshot = MemorySnapshot(
  timestamp=time.time() - self.start_time,
  rss_mb=memory_info.rss / 1024 / 1024,
@@ -65,108 +68,108 @@ class PerformanceProfiler:
  operation=operation,
  page_count=page_count,
  pdf_name=pdf_name,
- additional_info=additional_info
+ additional_info=additional_info,
  )
-
+
  self.snapshots.append(snapshot)
- print(f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects")
-
+ print(
+ f"[{snapshot.timestamp:.1f}s] {operation}: {snapshot.rss_mb:.1f}MB RSS, {python_objects} objects"
+ )
+
  def save_results(self, test_name: str):
  """Save results to JSON and CSV"""
  # Convert to list of dicts for JSON serialization
  data = [asdict(s) for s in self.snapshots]
-
+
  # Save JSON
  json_path = self.output_dir / f"{test_name}_snapshots.json"
- with open(json_path, 'w') as f:
+ with open(json_path, "w") as f:
  json.dump(data, f, indent=2)
-
+
  # Save CSV for easy analysis
  df = pd.DataFrame(data)
  csv_path = self.output_dir / f"{test_name}_snapshots.csv"
  df.to_csv(csv_path, index=False)
-
+
  print(f"Results saved to {json_path} and {csv_path}")
  return df


  class PDFPerformanceTester:
  """Tests specific PDF operations and measures their performance"""
-
+
  def __init__(self, pdf_path: str, profiler: PerformanceProfiler):
  self.pdf_path = Path(pdf_path)
  self.pdf_name = self.pdf_path.stem
  self.profiler = profiler
  self.pdf = None
-
+
  def test_load_pdf(self):
  """Test just loading the PDF"""
  self.profiler.take_snapshot("before_load", pdf_name=self.pdf_name)
-
+
  self.pdf = npdf.PDF(str(self.pdf_path))
-
- self.profiler.take_snapshot("after_load", pdf_name=self.pdf_name,
- total_pages=len(self.pdf.pages))
-
+
+ self.profiler.take_snapshot(
+ "after_load", pdf_name=self.pdf_name, total_pages=len(self.pdf.pages)
+ )
+
  def test_page_access(self, max_pages: int = 10):
  """Test accessing pages sequentially"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Just access the page to trigger lazy loading
  _ = page.width, page.height
-
+
  self.profiler.take_snapshot(
- f"page_access_{i+1}",
- page_count=i+1,
+ f"page_access_{i+1}",
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  page_width=page.width,
- page_height=page.height
+ page_height=page.height,
  )
-
+
  def test_describe_pages(self, max_pages: int = 5):
  """Test using .describe() on pages"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Use describe to understand page content
  try:
  description = page.describe()
-
+
  self.profiler.take_snapshot(
  f"describe_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
- description_length=len(description) if description else 0
+ description_length=len(description) if description else 0,
  )
  except Exception as e:
  self.profiler.take_snapshot(
- f"describe_{i+1}_error",
- page_count=i+1,
- pdf_name=self.pdf_name,
- error=str(e)
+ f"describe_{i+1}_error", page_count=i + 1, pdf_name=self.pdf_name, error=str(e)
  )
-
+
  def test_element_collections(self, max_pages: int = 5):
  """Test find_all operations that create element collections"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  # Test different element collection operations
  operations = [
  ("words", lambda p: p.find_all("words")),
@@ -174,121 +177,118 @@ class PDFPerformanceTester:
  ("rects", lambda p: p.find_all("rect")),
  ("large_text", lambda p: p.find_all("text[size>12]")),
  ]
-
+
  for op_name, operation in operations:
  try:
  elements = operation(page)
  element_count = len(elements) if elements else 0
-
+
  self.profiler.take_snapshot(
  f"{op_name}_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type=op_name,
- element_count=element_count
+ element_count=element_count,
  )
  except Exception as e:
  self.profiler.take_snapshot(
  f"{op_name}_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type=op_name,
- error=str(e)
+ error=str(e),
  )
-
+
  def test_image_generation(self, max_pages: int = 3, resolutions: List[int] = [72, 144, 216]):
  """Test image generation at different resolutions"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  for resolution in resolutions:
  try:
  img = page.to_image(resolution=resolution)
-
+
  self.profiler.take_snapshot(
  f"image_{resolution}dpi_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  resolution=resolution,
- image_size=f"{img.width}x{img.height}" if img else "None"
+ image_size=f"{img.width}x{img.height}" if img else "None",
  )
-
+
  # Clean up image immediately to test memory release
  del img
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"image_{resolution}dpi_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  resolution=resolution,
- error=str(e)
+ error=str(e),
  )
-
+
  def test_ocr(self, max_pages: int = 2):
  """Test OCR operations (expensive!)"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  try:
  # Run OCR
  page.apply_ocr(engine="easyocr") # Default engine
-
+
  self.profiler.take_snapshot(
- f"ocr_{i+1}",
- page_count=i+1,
- pdf_name=self.pdf_name,
- operation_type="ocr"
+ f"ocr_{i+1}", page_count=i + 1, pdf_name=self.pdf_name, operation_type="ocr"
  )
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"ocr_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="ocr",
- error=str(e)
+ error=str(e),
  )
-
+
  def test_layout_analysis(self, max_pages: int = 3):
  """Test layout analysis operations"""
  if not self.pdf:
  self.test_load_pdf()
-
+
  pages_to_test = min(max_pages, len(self.pdf.pages))
-
+
  for i in range(pages_to_test):
  page = self.pdf.pages[i]
-
+
  try:
  # Run layout analysis
  layout_result = page.analyze_layout()
-
+
  self.profiler.take_snapshot(
  f"layout_{i+1}",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="layout",
- layout_regions=len(layout_result) if layout_result else 0
+ layout_regions=len(layout_result) if layout_result else 0,
  )
-
+
  except Exception as e:
  self.profiler.take_snapshot(
  f"layout_{i+1}_error",
- page_count=i+1,
+ page_count=i + 1,
  pdf_name=self.pdf_name,
  operation_type="layout",
- error=str(e)
+ error=str(e),
  )

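
The methods in the hunks above exercise the page-level rendering, OCR, and layout APIs; the remaining hunks of this file cover the test driver. A short usage sketch of those page-level calls, using only the calls and argument values visible in the diff (resolutions 72/144/216 dpi and the "easyocr" engine); the PDF path is hypothetical:

```python
# Sketch of the page-level operations profiled above; illustrative only.
import natural_pdf as npdf

pdf = npdf.PDF("some_large_scanned.pdf")  # hypothetical input
page = pdf.pages[0]

for resolution in (72, 144, 216):
    img = page.to_image(resolution=resolution)
    print(f"{resolution} dpi -> {img.width}x{img.height}")
    del img  # dropped immediately, as the memory test does

page.apply_ocr(engine="easyocr")   # expensive; the profiler limits OCR to 2 pages
regions = page.analyze_layout()    # layout analysis, returns detected regions
print(f"layout regions: {len(regions) if regions else 0}")
```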
@@ -298,43 +298,43 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
  print(f"COMPREHENSIVE TEST: {test_name}")
  print(f"PDF: {pdf_path}")
  print(f"{'='*60}")
-
+
  profiler = PerformanceProfiler()
  tester = PDFPerformanceTester(pdf_path, profiler)
-
+
  # Initial baseline
  profiler.take_snapshot("baseline_start", pdf_name=Path(pdf_path).stem)
-
+
  # Test sequence
  print("\n1. Testing PDF Load...")
  tester.test_load_pdf()
-
+
  print("\n2. Testing Page Access...")
  tester.test_page_access(max_pages=10)
-
+
  print("\n3. Testing Describe Operations...")
  tester.test_describe_pages(max_pages=5)
-
+
  print("\n4. Testing Element Collections...")
  tester.test_element_collections(max_pages=5)
-
+
  print("\n5. Testing Image Generation...")
  tester.test_image_generation(max_pages=3)
-
+
  print("\n6. Testing Layout Analysis...")
  tester.test_layout_analysis(max_pages=3)
-
+
  # OCR test (only for image-heavy PDFs)
  if "OCR" in pdf_path or "image" in test_name.lower():
  print("\n7. Testing OCR (Image-heavy PDF)...")
  tester.test_ocr(max_pages=2)
-
+
  # Final snapshot
  profiler.take_snapshot("test_complete", pdf_name=Path(pdf_path).stem)
-
+
  # Save results
  df = profiler.save_results(test_name)
-
+
  # Quick analysis
  print(f"\n{'-'*40}")
  print("QUICK ANALYSIS:")
@@ -342,7 +342,7 @@ def run_comprehensive_test(pdf_path: str, test_name: str):
  print(f"Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
  print(f"Peak Objects: {df['python_objects'].max():,}")
  print(f"Total Time: {df['timestamp'].iloc[-1]:.1f} seconds")
-
+
  return df

@@ -350,22 +350,23 @@ def main():
  """Main test runner"""
  print("Natural PDF Performance Analysis Micro-Suite")
  print("=" * 50)
-
+
  # Find test PDFs
  large_pdfs_dir = Path("pdfs/hidden/large")
  if not large_pdfs_dir.exists():
  print(f"Error: {large_pdfs_dir} not found")
  print("Please ensure large test PDFs are available")
  return
-
+
  # Expected test PDFs
  test_pdfs = {
  "text_heavy": large_pdfs_dir / "appendix_fy2026.pdf",
- "image_heavy": large_pdfs_dir / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf"
+ "image_heavy": large_pdfs_dir
+ / "OCR 0802030-56.2022.8.14.0060_Cópia integral_Fazenda Marrocos.pdf",
  }
-
+
  results = {}
-
+
  for test_name, pdf_path in test_pdfs.items():
  if pdf_path.exists():
  try:
@@ -375,23 +376,23 @@ def main():
  traceback.print_exc()
  else:
  print(f"Warning: {pdf_path} not found, skipping {test_name} test")
-
+
  # Generate comparison report
  if results:
  print(f"\n{'='*60}")
  print("COMPARISON SUMMARY")
  print(f"{'='*60}")
-
+
  for test_name, df in results.items():
  print(f"\n{test_name.upper()}:")
  print(f" Peak Memory: {df['rss_mb'].max():.1f} MB")
  print(f" Memory Growth: {df['rss_mb'].iloc[-1] - df['rss_mb'].iloc[0]:.1f} MB")
  print(f" Peak Objects: {df['python_objects'].max():,}")
  print(f" Duration: {df['timestamp'].iloc[-1]:.1f}s")
-
+
  print(f"\nResults saved to performance_results/ directory")
  print("Use the CSV files for detailed analysis")


  if __name__ == "__main__":
- main()
+ main()
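
The hunks above for the performance-analysis script are largely mechanical reformatting (import ordering, quote style, trailing commas, whitespace stripping); the measurement pattern they wrap is unchanged. A condensed, hedged sketch of that pattern, using only calls that appear in the diff (the PDF path is the "text_heavy" test file named above; the real script wraps this in PerformanceProfiler.take_snapshot). The remaining hunks below belong to the manager-cleanup test module.

```python
# Condensed sketch of the profiling loop shown in the diff above (illustrative,
# not the exact PerformanceProfiler API).
import gc
import time
import tracemalloc

import psutil

import natural_pdf as npdf

tracemalloc.start()
process = psutil.Process()
start = time.time()

pdf = npdf.PDF("pdfs/hidden/large/appendix_fy2026.pdf")  # "text_heavy" test PDF from the diff
for i in range(min(5, len(pdf.pages))):
    page = pdf.pages[i]
    _ = page.width, page.height     # touch the page to trigger lazy loading
    words = page.find_all("words")  # one of the element-collection operations profiled
    gc.collect()                    # force collection before measuring, as take_snapshot() does
    rss_mb = process.memory_info().rss / 1024 / 1024
    print(
        f"[{time.time() - start:.1f}s] page {i + 1}: "
        f"{rss_mb:.1f}MB RSS, {len(words)} words, {len(gc.get_objects())} objects"
    )
```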
@@ -12,124 +12,135 @@ import gc
  import os
  import sys
  from pathlib import Path
+
  import pytest

  import natural_pdf as npdf
- from natural_pdf.ocr.ocr_manager import OCRManager
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
  from natural_pdf.classification.manager import ClassificationManager
+ from natural_pdf.ocr.ocr_manager import OCRManager


  class TestCleanupMethods:
  """Test suite for manager cleanup methods"""
-
+
  def test_ocr_manager_cleanup_empty(self):
  """Test OCR manager cleanup when no engines are loaded"""
  manager = OCRManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_engine()
  assert count == 0, "Should return 0 when no engines loaded"
-
+
  # Test cleanup of specific non-existent engine
  count = manager.cleanup_engine("nonexistent")
  assert count == 0, "Should return 0 when engine doesn't exist"
-
+
  def test_layout_manager_cleanup_empty(self):
  """Test Layout manager cleanup when no detectors are loaded"""
  manager = LayoutManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_detector()
  assert count == 0, "Should return 0 when no detectors loaded"
-
+
  # Test cleanup of specific non-existent detector
  count = manager.cleanup_detector("nonexistent")
  assert count == 0, "Should return 0 when detector doesn't exist"
-
+
  def test_classification_manager_cleanup_empty(self):
  """Test Classification manager cleanup when no models are loaded"""
  try:
  manager = ClassificationManager()
-
+
  # Test cleanup when nothing is loaded
  count = manager.cleanup_models()
  assert count == 0, "Should return 0 when no models loaded"
-
+
  # Test cleanup of specific non-existent model
  count = manager.cleanup_models("nonexistent/model")
  assert count == 0, "Should return 0 when model doesn't exist"
-
+
  except ImportError:
  pytest.skip("Classification dependencies not available")
-
+
  def test_ocr_manager_cleanup_with_engine(self):
  """Test OCR manager cleanup after loading an engine"""
  manager = OCRManager()
-
+
  # Check if any OCR engines are available
  available_engines = manager.get_available_engines()
  if not available_engines:
  pytest.skip("No OCR engines available for testing")
-
+
  engine_name = available_engines[0]
  print(f"Testing with OCR engine: {engine_name}")
-
+
  # Load an engine by accessing it
  try:
  engine_instance = manager._get_engine_instance(engine_name)
  assert engine_name in manager._engine_instances, "Engine should be cached"
-
+
  # Test cleanup of specific engine
  count = manager.cleanup_engine(engine_name)
  assert count == 1, f"Should return 1 after cleaning up {engine_name}"
- assert engine_name not in manager._engine_instances, "Engine should be removed from cache"
-
+ assert (
+ engine_name not in manager._engine_instances
+ ), "Engine should be removed from cache"
+
  except Exception as e:
  pytest.skip(f"Could not load {engine_name} engine: {e}")
-
+
  def test_layout_manager_cleanup_with_detector(self):
  """Test Layout manager cleanup after loading a detector"""
  manager = LayoutManager()
-
+
  # Check if any layout engines are available
  available_engines = manager.get_available_engines()
  if not available_engines:
  pytest.skip("No layout engines available for testing")
-
+
  engine_name = available_engines[0]
  print(f"Testing with layout engine: {engine_name}")
-
+
  # Load a detector by accessing it
  try:
  detector_instance = manager._get_engine_instance(engine_name)
  assert engine_name in manager._detector_instances, "Detector should be cached"
-
+
  # Test cleanup of specific detector
  count = manager.cleanup_detector(engine_name)
  assert count == 1, f"Should return 1 after cleaning up {engine_name}"
- assert engine_name not in manager._detector_instances, "Detector should be removed from cache"
-
+ assert (
+ engine_name not in manager._detector_instances
+ ), "Detector should be removed from cache"
+
  except Exception as e:
  pytest.skip(f"Could not load {engine_name} detector: {e}")
-
+
  def test_methods_exist(self):
  """Test that all cleanup methods exist and are callable"""
  # Test OCRManager
  manager = OCRManager()
- assert hasattr(manager, 'cleanup_engine'), "OCRManager should have cleanup_engine method"
+ assert hasattr(manager, "cleanup_engine"), "OCRManager should have cleanup_engine method"
  assert callable(manager.cleanup_engine), "cleanup_engine should be callable"
-
+
  # Test LayoutManager
  layout_manager = LayoutManager()
- assert hasattr(layout_manager, 'cleanup_detector'), "LayoutManager should have cleanup_detector method"
+ assert hasattr(
+ layout_manager, "cleanup_detector"
+ ), "LayoutManager should have cleanup_detector method"
  assert callable(layout_manager.cleanup_detector), "cleanup_detector should be callable"
-
+
  # Test ClassificationManager (if available)
  try:
  classification_manager = ClassificationManager()
- assert hasattr(classification_manager, 'cleanup_models'), "ClassificationManager should have cleanup_models method"
- assert callable(classification_manager.cleanup_models), "cleanup_models should be callable"
+ assert hasattr(
+ classification_manager, "cleanup_models"
+ ), "ClassificationManager should have cleanup_models method"
+ assert callable(
+ classification_manager.cleanup_models
+ ), "cleanup_models should be callable"
  except ImportError:
  print("Classification dependencies not available, skipping ClassificationManager test")

@@ -137,19 +148,19 @@ class TestCleanupMethods:
  def main():
  """Run the cleanup method tests"""
  print("Testing manager cleanup methods...")
-
+
  # Run pytest on just this file
  exit_code = pytest.main([__file__, "-v", "-s"])
-
+
  if exit_code == 0:
  print("\n✅ All cleanup method tests passed!")
  print("The memory management methods are working correctly.")
  else:
  print("\n❌ Some tests failed!")
  print("The cleanup methods need investigation.")
-
+
  return exit_code


  if __name__ == "__main__":
- exit(main())
+ exit(main())
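
The cleanup methods exercised by these tests can also be called directly when reclaiming memory between jobs. A short sketch based only on the calls visible in the tests above (per the assertions, each call returns a count of released engines, detectors, or models); the ImportError handling mirrors the tests' skip path:

```python
# Sketch of the manager cleanup calls exercised by the tests above; illustrative only.
from natural_pdf.analyzers.layout.layout_manager import LayoutManager
from natural_pdf.ocr.ocr_manager import OCRManager

ocr = OCRManager()
layout = LayoutManager()

released_engines = ocr.cleanup_engine()         # all cached OCR engines; returns a count
released_detectors = layout.cleanup_detector()  # all cached layout detectors; returns a count
print(f"released {released_engines} OCR engine(s), {released_detectors} layout detector(s)")

try:
    from natural_pdf.classification.manager import ClassificationManager

    released_models = ClassificationManager().cleanup_models()
    print(f"released {released_models} classification model(s)")
except ImportError:
    print("classification dependencies not installed; nothing to clean up")
```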