natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. The information is provided for informational purposes only.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ b/bad_pdf_analysis/analyze_specific_pages_direct.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+"""
+Direct Natural PDF analysis targeting specific pages.
+"""
+
+import json
+import os
+import sys
+from pathlib import Path
+import natural_pdf as npdf
+import re
+from datetime import datetime
+
+def analyze_specific_pages_direct(pdf_path, target_pages, output_folder):
+    """Directly analyze specific pages using Natural PDF"""
+
+    print(f"🔍 Analyzing {pdf_path}")
+    print(f"📍 Target pages: {target_pages}")
+
+    pdf = npdf.PDF(pdf_path)
+    results = {}
+
+    for page_num in target_pages:
+        if page_num > len(pdf.pages):
+            print(f"❌ Page {page_num} not found - document only has {len(pdf.pages)} pages")
+            continue
+
+        print(f"\n📄 Analyzing page {page_num}...")
+        page = pdf.pages[page_num - 1]  # Convert to 0-based index
+
+        page_data = {
+            "page_number": page_num,
+            "dimensions": {
+                "width": page.width,
+                "height": page.height
+            }
+        }
+
+        # Get page description
+        try:
+            description = page.describe()
+            page_data["describe"] = description
+            print(f"✅ Page description: {len(description)} characters")
+        except Exception as e:
+            print(f"❌ Page description failed: {e}")
+            page_data["describe"] = f"ERROR: {e}"
+
+        # Extract text
+        try:
+            text = page.extract_text()
+            page_data["extract_text"] = {
+                "length": len(text),
+                "preview": text[:200] + "..." if len(text) > 200 else text,
+                "full_text": text
+            }
+            print(f"✅ Text extraction: {len(text)} characters")
+        except Exception as e:
+            print(f"❌ Text extraction failed: {e}")
+            page_data["extract_text"] = f"ERROR: {e}"
+
+        # Try table extraction
+        try:
+            table_data = page.extract_table()
+            if table_data and len(table_data) > 0:
+                page_data["extract_table"] = {
+                    "found": True,
+                    "rows": len(table_data),
+                    "columns": len(table_data[0]) if table_data else 0,
+                    "data": table_data[:5]  # First 5 rows only
+                }
+                print(f"✅ Table found: {len(table_data)} rows × {len(table_data[0]) if table_data else 0} columns")
+            else:
+                page_data["extract_table"] = {"found": False}
+                print("ℹ️ No table found with standard extraction")
+        except Exception as e:
+            print(f"❌ Table extraction failed: {e}")
+            page_data["extract_table"] = f"ERROR: {e}"
+
+        # Try layout analysis
+        try:
+            page.analyze_layout('yolo', existing='replace')
+            layout_regions = page.find_all('region')
+            if layout_regions and len(layout_regions) > 0:
+                page_data["analyze_layout"] = {
+                    "found": True,
+                    "count": len(layout_regions),
+                    "regions": []
+                }
+                for region in layout_regions[:10]:  # First 10 regions
+                    try:
+                        page_data["analyze_layout"]["regions"].append({
+                            "type": region.type if hasattr(region, 'type') else 'unknown',
+                            "bbox": [region.x0, region.y0, region.x1, region.y1],
+                            "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
+                        })
+                    except:
+                        pass
+                print(f"✅ Layout analysis: {len(layout_regions)} regions")
+            else:
+                page_data["analyze_layout"] = {"found": False}
+                print("ℹ️ No layout regions found")
+        except Exception as e:
+            print(f"❌ Layout analysis failed: {e}")
+            page_data["analyze_layout"] = f"ERROR: {e}"
+
+        # Try TATR analysis
+        try:
+            page.analyze_layout('tatr', existing='append')
+            tatr_regions = page.find_all('region')
+            tatr_count = len([r for r in tatr_regions if hasattr(r, 'type') and 'table' in str(r.type).lower()])
+            if tatr_count > 0:
+                page_data["analyze_layout_tatr"] = {
+                    "found": True,
+                    "count": tatr_count,
+                    "regions": []
+                }
+                for region in tatr_regions[:25]:  # First 25 regions
+                    try:
+                        if hasattr(region, 'type') and 'table' in str(region.type).lower():
+                            page_data["analyze_layout_tatr"]["regions"].append({
+                                "type": str(region.type),
+                                "bbox": [region.x0, region.y0, region.x1, region.y1],
+                                "confidence": region.confidence if hasattr(region, 'confidence') else 1.0
+                            })
+                    except:
+                        pass
+                print(f"✅ TATR analysis: {tatr_count} table regions")
+            else:
+                page_data["analyze_layout_tatr"] = {"found": False}
+                print("ℹ️ No TATR table regions found")
+        except Exception as e:
+            print(f"❌ TATR analysis failed: {e}")
+            page_data["analyze_layout_tatr"] = f"ERROR: {e}"
+
+        # Save page image
+        try:
+            page_image_path = os.path.join(output_folder, f"page_{page_num}.png")
+            page.save_image(page_image_path, resolution=144)
+            page_data["image_path"] = page_image_path
+            print(f"✅ Page image saved: {page_image_path}")
+        except Exception as e:
+            print(f"❌ Page image save failed: {e}")
+            page_data["image_path"] = f"ERROR: {e}"
+
+        results[page_num] = page_data
+
+    return results
+
+def create_enhanced_analysis_report(pdf_path, target_pages, analysis_results, output_folder):
+    """Create enhanced analysis report"""
+
+    pdf_name = Path(pdf_path).name
+
+    # Determine what the user was looking for
+    user_goal = f"Analysis of pages {target_pages}"
+    if len(target_pages) == 1:
+        user_goal = f"Analysis of page {target_pages[0]}"
+
+    report = f"""# Enhanced PDF Analysis Report - {pdf_name.replace('.pdf', '')}
+
+## Analysis Overview
+
+**PDF File:** {pdf_name}
+**Target Pages:** {target_pages}
+**Pages Successfully Analyzed:** {list(analysis_results.keys())}
+**Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+
+---
+
+## Page-by-Page Analysis Results
+
+"""
+
+    for page_num in sorted(analysis_results.keys()):
+        page_data = analysis_results[page_num]
+
+        report += f"""### Page {page_num}
+
+**Dimensions:** {page_data.get('dimensions', {}).get('width', 'Unknown')} × {page_data.get('dimensions', {}).get('height', 'Unknown')} points
+
+**Content Analysis:**
+"""
+
+        # Text analysis
+        if isinstance(page_data.get('extract_text'), dict):
+            text_info = page_data['extract_text']
+            report += f"- **Text Content:** {text_info.get('length', 0)} characters extracted\n"
+            if text_info.get('preview'):
+                report += f"- **Content Preview:** {text_info['preview']}\n"
+
+        # Table analysis
+        if isinstance(page_data.get('extract_table'), dict):
+            table_info = page_data['extract_table']
+            if table_info.get('found'):
+                report += f"- **Table Found:** {table_info.get('rows', 0)} rows × {table_info.get('columns', 0)} columns\n"
+            else:
+                report += "- **Table Status:** No standard table structure detected\n"
+
+        # Layout analysis
+        if isinstance(page_data.get('analyze_layout'), dict):
+            layout_info = page_data['analyze_layout']
+            if layout_info.get('found'):
+                report += f"- **Layout Regions:** {layout_info.get('count', 0)} regions detected\n"
+
+                # Show region types
+                region_types = {}
+                for region in layout_info.get('regions', []):
+                    region_type = region.get('type', 'unknown')
+                    region_types[region_type] = region_types.get(region_type, 0) + 1
+
+                if region_types:
+                    report += f"- **Region Types:** {dict(region_types)}\n"
+
+        # TATR analysis
+        if isinstance(page_data.get('analyze_layout_tatr'), dict):
+            tatr_info = page_data['analyze_layout_tatr']
+            if tatr_info.get('found'):
+                report += f"- **TATR Table Analysis:** {tatr_info.get('count', 0)} table regions detected\n"
+
+        # Image
+        if page_data.get('image_path') and not page_data['image_path'].startswith('ERROR'):
+            report += f"- **Visual:** Page image saved as `page_{page_num}.png`\n"
+
+        report += "\n"
+
+    # Analysis summary
+    report += """---
+
+## Analysis Summary
+
+### What We Found
+"""
+
+    # Summarize findings across all pages
+    total_text_chars = 0
+    pages_with_tables = 0
+    total_layout_regions = 0
+    total_tatr_regions = 0
+
+    for page_data in analysis_results.values():
+        if isinstance(page_data.get('extract_text'), dict):
+            total_text_chars += page_data['extract_text'].get('length', 0)
+
+        if isinstance(page_data.get('extract_table'), dict) and page_data['extract_table'].get('found'):
+            pages_with_tables += 1
+
+        if isinstance(page_data.get('analyze_layout'), dict) and page_data['analyze_layout'].get('found'):
+            total_layout_regions += page_data['analyze_layout'].get('count', 0)
+
+        if isinstance(page_data.get('analyze_layout_tatr'), dict) and page_data['analyze_layout_tatr'].get('found'):
+            total_tatr_regions += page_data['analyze_layout_tatr'].get('count', 0)
+
+    report += f"""
+- **Total Text Content:** {total_text_chars:,} characters across {len(analysis_results)} pages
+- **Table Detection:** {pages_with_tables} out of {len(analysis_results)} pages have detectable tables
+- **Layout Analysis:** {total_layout_regions} total layout regions detected
+- **TATR Analysis:** {total_tatr_regions} table-specific regions detected
+"""
+
+    # Add recommendations
+    report += """
+### Natural PDF Extraction Approach
+
+Based on the actual content found on these pages:
+
+```python
+import natural_pdf as npdf
+
+def extract_from_specific_pages(pdf_path, target_pages):
+    \"\"\"Extract data from specific pages with targeted approach\"\"\"
+    pdf = npdf.PDF(pdf_path)
+    results = []
+
+    for page_num in target_pages:
+        if page_num <= len(pdf.pages):
+            page = pdf.pages[page_num - 1]
+
+            # Use layout analysis for better structure detection
+            page.analyze_layout('tatr', existing='append')
+
+            # Try table extraction first
+            table_data = page.extract_table()
+            if table_data:
+                results.append({
+                    'page': page_num,
+                    'type': 'table',
+                    'data': table_data
+                })
+            else:
+                # Use spatial navigation for complex layouts
+                all_text = page.find_all('text')
+                results.append({
+                    'page': page_num,
+                    'type': 'text_elements',
+                    'elements': all_text
+                })
+
+    return results
+
+# Extract from your specific pages
+"""
+
+    if len(target_pages) == 1:
+        report += f"results = extract_from_specific_pages('{pdf_name}', [{target_pages[0]}])\n"
+    else:
+        report += f"results = extract_from_specific_pages('{pdf_name}', {target_pages})\n"
+
+    report += "```\n"
+
+    # Save the report
+    report_path = os.path.join(output_folder, f"{pdf_name.replace('.pdf', '')}_enhanced_analysis.md")
+    with open(report_path, 'w', encoding='utf-8') as f:
+        f.write(report)
+
+    print(f"✅ Enhanced analysis report saved: {report_path}")
+    return report_path
+
+def main():
+    """Re-analyze specific documents with page targeting"""
+
+    # Documents that need re-analysis with specific pages
+    documents_to_reanalyze = [
+        {
+            'folder': 'ODX1DW8_The large table on page 179',
+            'file': 'ODX1DW8.pdf',
+            'pages': [178, 179, 180],  # Page 179 ± 1 for safety
+            'reason': 'User requested page 179, original analysis used page 1'
+        },
+        {
+            'folder': 'eqrZ5yq_The long table _Annex 6_ spanning across pages fro',
+            'file': 'eqrZ5yq.pdf',
+            'pages': [89, 90, 91, 92],  # Multi-page table range
+            'reason': 'User requested pages 89-92, original analysis used page 1'
+        }
+    ]
+
+    base_path = "/Users/soma/Development/natural-pdf/bad_pdf_analysis"
+
+    for doc in documents_to_reanalyze:
+        print(f"\n{'='*80}")
+        print(f"🔄 Re-analyzing {doc['file']}")
+        print(f"📋 Reason: {doc['reason']}")
+        print(f"{'='*80}")
+
+        folder_path = os.path.join(base_path, doc['folder'])
+        pdf_path = os.path.join(folder_path, doc['file'])
+        output_folder = os.path.join(folder_path, 'enhanced_analysis')
+
+        if not os.path.exists(pdf_path):
+            print(f"❌ PDF not found: {pdf_path}")
+            continue
+
+        # Create output folder
+        os.makedirs(output_folder, exist_ok=True)
+
+        # Run direct analysis on specific pages
+        try:
+            analysis_results = analyze_specific_pages_direct(pdf_path, doc['pages'], output_folder)
+
+            if analysis_results:
+                # Save analysis results as JSON
+                results_file = os.path.join(output_folder, "enhanced_analysis_results.json")
+                with open(results_file, 'w') as f:
+                    json.dump({
+                        "pdf_path": pdf_path,
+                        "target_pages": doc['pages'],
+                        "analysis_timestamp": datetime.now().isoformat(),
+                        "results": analysis_results
+                    }, f, indent=2)
+
+                # Create enhanced report
+                create_enhanced_analysis_report(pdf_path, doc['pages'], analysis_results, output_folder)
+
+                print(f"\n✅ Successfully analyzed {len(analysis_results)} pages from {doc['file']}")
+            else:
+                print(f"❌ No results obtained for {doc['file']}")
+
+        except Exception as e:
+            print(f"❌ Analysis failed for {doc['file']}: {e}")
+
+if __name__ == "__main__":
+    main()
--- a/natural_pdf/analyzers/layout/layout_analyzer.py
+++ b/natural_pdf/analyzers/layout/layout_analyzer.py
@@ -83,10 +83,9 @@ class LayoutAnalyzer:
             f" Rendering page {self._page.number} to image for initial layout detection..."
         )
         try:
-
-            layout_resolution = layout_scale * 72
+            layout_resolution = getattr(self._page._parent, "_config", {}).get("layout_image_resolution", 72)
             std_res_page_image = self._page.to_image(
-                resolution=layout_resolution, include_highlights=False
+                resolution=layout_resolution, include_highlights=False
             )
             if not std_res_page_image:
                 raise ValueError("Initial page rendering returned None")
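The changed line above makes the initial layout-rendering resolution configurable: instead of deriving it from `layout_scale`, the analyzer now reads `layout_image_resolution` from the parent PDF's `_config` mapping, defaulting to 72 DPI. A minimal sketch of how that knob might be set, assuming the `_config` dict targeted by the `getattr(self._page._parent, "_config", {})` lookup is reachable on the `PDF` object (the attribute path is an assumption, not a documented API):

```python
import natural_pdf as npdf

pdf = npdf.PDF("document.pdf")

# Assumed: pdf._config is the same mapping the layout analyzer consults.
# 72 DPI remains the default; a higher value feeds the detector a sharper page image.
pdf._config["layout_image_resolution"] = 144

pdf.pages[0].analyze_layout("yolo")
```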
--- a/natural_pdf/analyzers/layout/layout_manager.py
+++ b/natural_pdf/analyzers/layout/layout_manager.py
@@ -131,7 +131,7 @@ class LayoutManager:
         # Construct helpful error message with install hint
         install_hint = ""
         if engine_name in {"yolo", "paddle", "surya", "docling"}:
-            install_hint = f"
+            install_hint = f"npdf install {engine_name}"
         elif engine_name == "tatr":
             install_hint = "(should be installed with natural-pdf core dependencies)"
         elif engine_name == "gemini":
@@ -220,3 +220,47 @@ class LayoutManager:
                 logger.debug(f"Layout engine '{name}' check failed: {e}")
                 pass
         return available
+
+    def cleanup_detector(self, detector_name: Optional[str] = None) -> int:
+        """
+        Cleanup layout detector instances to free memory.
+
+        Args:
+            detector_name: Specific detector to cleanup, or None to cleanup all detectors
+
+        Returns:
+            Number of detectors cleaned up
+        """
+        cleaned_count = 0
+
+        if detector_name:
+            # Cleanup specific detector
+            detector_name = detector_name.lower()
+            if detector_name in self._detector_instances:
+                detector = self._detector_instances.pop(detector_name)
+                if hasattr(detector, 'cleanup'):
+                    try:
+                        detector.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Detector {detector_name} cleanup method failed: {e}")
+
+                logger.info(f"Cleaned up layout detector: {detector_name}")
+                cleaned_count = 1
+        else:
+            # Cleanup all detectors
+            for name, detector in list(self._detector_instances.items()):
+                if hasattr(detector, 'cleanup'):
+                    try:
+                        detector.cleanup()
+                    except Exception as e:
+                        logger.debug(f"Detector {name} cleanup method failed: {e}")
+
+            # Clear all caches
+            detector_count = len(self._detector_instances)
+            self._detector_instances.clear()
+
+            if detector_count > 0:
+                logger.info(f"Cleaned up {detector_count} layout detectors")
+                cleaned_count = detector_count
+
+        return cleaned_count
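A hedged sketch of how the new `cleanup_detector` API might be exercised. Only `LayoutManager.cleanup_detector` itself appears in this diff; constructing a `LayoutManager` directly, and any constructor arguments it may require, is an assumption made purely for illustration:

```python
from natural_pdf.analyzers.layout.layout_manager import LayoutManager

manager = LayoutManager()  # assumed no-arg construction, for illustration only

# ... layout detection runs would populate the manager's detector cache ...

# Release one cached detector; returns 1 if "yolo" was cached, 0 otherwise.
freed = manager.cleanup_detector("yolo")

# Or release every cached detector; returns how many instances were dropped.
freed_all = manager.cleanup_detector()
```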
--- a/natural_pdf/analyzers/layout/surya.py
+++ b/natural_pdf/analyzers/layout/surya.py
@@ -189,7 +189,7 @@ class SuryaLayoutDetector(LayoutDetector):
             "surya_table_rec_dpi", 192
         )
         high_res_page_image = self._page_ref.to_image(
-            resolution=high_res_dpi, include_highlights=False
+            resolution=high_res_dpi, include_highlights=False
         )

         # Render high-res page ONCE
--- a/natural_pdf/analyzers/layout/yolo.py
+++ b/natural_pdf/analyzers/layout/yolo.py
@@ -92,7 +92,7 @@ class YOLODocLayoutDetector(LayoutDetector):
         """Load the YOLOv10 model based on options."""
         if not self.is_available():
             raise RuntimeError(
-                "YOLO dependencies not installed. Please run:
+                "YOLO dependencies not installed. Please run: npdf install yolo"
             )
         self.logger.info(f"Loading YOLO model: {options.model_repo}/{options.model_file}")
         try:
@@ -108,7 +108,7 @@ class YOLODocLayoutDetector(LayoutDetector):
         """Detect layout elements in an image using YOLO."""
         if not self.is_available():
             raise RuntimeError(
-                "YOLO dependencies not installed. Please run:
+                "YOLO dependencies not installed. Please run: npdf install yolo"
             )

         # Ensure options are the correct type, falling back to defaults if base type passed
|