natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
optimization/pdf_analyzer.py
CHANGED
@@ -9,65 +9,69 @@ Usage:
     python pdf_analyzer.py path/to/document.pdf [num_pages] [output_folder]
 """
 
-import sys
 import json
+import sys
 from pathlib import Path
+
 import natural_pdf as npdf
 from natural_pdf.elements.collections import ElementCollection
 
 
-def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True):
+def analyze_pdf(
+    pdf_path, num_pages=1, output_folder="analysis_results", create_timestamp_folder=True
+):
     """Analyze a PDF using Natural PDF's full capabilities"""
-
+
     pdf_file = Path(pdf_path)
     if not pdf_file.exists():
         print(f"❌ File not found: {pdf_path}")
         return
-
+
     # Create output folder structure
     base_output_dir = Path(output_folder)
     base_output_dir.mkdir(exist_ok=True)
-
+
     # If create_timestamp_folder=True, create a timestamped run folder for batch analysis
     if create_timestamp_folder:
         import datetime
+
         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
         run_output_dir = base_output_dir / f"run_{timestamp}"
         run_output_dir.mkdir(exist_ok=True)
     else:
         run_output_dir = base_output_dir
-
+
     # Create subfolder for this specific PDF within the run folder
     pdf_output_dir = run_output_dir / pdf_file.stem
     pdf_output_dir.mkdir(exist_ok=True)
-
+
     print(f"🔍 ANALYZING: {pdf_file.name}")
     print(f"📁 Output folder: {pdf_output_dir}")
     print("=" * 80)
-
+
     analysis_data = {
         "pdf_name": pdf_file.name,
         "pdf_path": str(pdf_file),
         "analysis_timestamp": None,
-        "pages": []
+        "pages": [],
     }
-
+
     try:
         # Load PDF
         pdf = npdf.PDF(str(pdf_file))
         total_pages = len(pdf.pages)
         pages_to_analyze = min(num_pages, total_pages)
-
+
         analysis_data["total_pages"] = total_pages
         analysis_data["pages_analyzed"] = pages_to_analyze
-
+
         print(f"📄 Total pages: {total_pages}")
         print(f"🔍 Analyzing first {pages_to_analyze} page(s)")
         print()
-
+
         for page_num in range(pages_to_analyze):
             page = pdf.pages[page_num]
-
+
             page_data = {
                 "page_number": page_num + 1,
                 "dimensions": {"width": page.width, "height": page.height},
@@ -77,30 +81,30 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 "analyze_layout": None,
                 "regions": None,
                 "elements_sample": None,
-                "image_path": None
+                "image_path": None,
             }
-
+
             print(f"📄 PAGE {page_num + 1}")
             print("-" * 60)
-
+
             # Basic page info
             print(f"📐 Dimensions: {page.width:.1f} x {page.height:.1f}")
-
+
             # 1. .describe() - Overview of elements
             print(f"\n🤖 PAGE.DESCRIBE():")
             try:
                 description = page.describe()
                 print(description)
                 page_data["describe"] = str(description)
-
+
                 # Save describe output to file
                 with open(pdf_output_dir / f"page_{page_num + 1}_describe.txt", "w") as f:
                     f.write(str(description))
-
+
             except Exception as e:
                 print(f"❌ describe() failed: {e}")
                 page_data["describe"] = f"ERROR: {e}"
-
+
             # 2. .extract_text() - Raw text extraction
             print(f"\n📝 PAGE.EXTRACT_TEXT():")
             try:
@@ -108,21 +112,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 if text:
                     print(f"Length: {len(text)} characters")
                     # Show first 300 chars
-                    preview = text[:300].replace('\n', '\\n')
+                    preview = text[:300].replace("\n", "\\n")
                     print(f"Preview: {preview}...")
-                    page_data["extract_text"] = {'length': len(text), 'preview': preview, 'full_text': text}
-
+                    page_data["extract_text"] = {
+                        "length": len(text),
+                        "preview": preview,
+                        "full_text": text,
+                    }
+
                     # Save full text to file
                     with open(pdf_output_dir / f"page_{page_num + 1}_text.txt", "w") as f:
                         f.write(text)
-
+
                 else:
                     print("No text extracted")
                     page_data["extract_text"] = {"length": 0, "preview": "", "full_text": ""}
             except Exception as e:
                 print(f"❌ extract_text() failed: {e}")
                 page_data["extract_text"] = f"ERROR: {e}"
-
+
             # 3. .extract_table() - Table extraction (returns List[List[str]])
             print(f"\n📊 PAGE.EXTRACT_TABLE():")
             try:
@@ -134,25 +142,25 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                     print("Sample data (first 3 rows):")
                     for i, row in enumerate(table_data[:3]):
                         print(f" Row {i+1}: {row}")
-
+
                     page_data["extract_table"] = {
                         "found": True,
                         "rows": rows,
                         "columns": cols,
-                        "data": table_data
+                        "data": table_data,
                     }
-
+
                     # Save table data as JSON
                     with open(pdf_output_dir / f"page_{page_num + 1}_table.json", "w") as f:
                         json.dump(table_data, f, indent=2)
-
+
                 else:
                     print("No table extracted")
                     page_data["extract_table"] = {"found": False}
             except Exception as e:
                 print(f"❌ extract_table() failed: {e}")
                 page_data["extract_table"] = f"ERROR: {e}"
-
+
             # 4. .analyze_layout() - Layout analysis
             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT():")
             try:
@@ -162,17 +170,19 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                     layout_info = []
                     for i, region in enumerate(layout[:5]):  # Show first 5
                         region_info = {
-                            "type": getattr(region, 'type', 'unknown'),
+                            "type": getattr(region, "type", "unknown"),
                             "bbox": [region.x0, region.top, region.x1, region.bottom],
-                            "confidence": getattr(region, 'confidence', 0),
+                            "confidence": getattr(region, "confidence", 0),
                         }
                         layout_info.append(region_info)
-                        print(
-                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+                        print(
+                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
+                        )
+
                     page_data["analyze_layout"] = {
                         "found": True,
                         "count": len(layout),
-                        "regions": layout_info
+                        "regions": layout_info,
                     }
                 else:
                     print("No layout regions found")
@@ -180,38 +190,40 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
             except Exception as e:
                 print(f"❌ analyze_layout() failed: {e}")
                 page_data["analyze_layout"] = f"ERROR: {e}"
-
+
             # 4b. .analyze_layout('tatr') - Table structure analysis (append to preserve YOLO results)
             print(f"\n🏗️ PAGE.ANALYZE_LAYOUT('TATR') - Table Structure:")
             try:
-                tatr_layout = page.analyze_layout('tatr', existing='append')
+                tatr_layout = page.analyze_layout("tatr", existing="append")
                 if tatr_layout and len(tatr_layout) > 0:
                     print(f"TATR layout regions found: {len(tatr_layout)}")
                     tatr_info = []
                     for i, region in enumerate(tatr_layout[:5]):  # Show first 5
                         region_info = {
-                            "type": getattr(region, 'type', 'unknown'),
+                            "type": getattr(region, "type", "unknown"),
                             "bbox": [region.x0, region.top, region.x1, region.bottom],
-                            "confidence": getattr(region, 'confidence', 0),
+                            "confidence": getattr(region, "confidence", 0),
                         }
                         tatr_info.append(region_info)
-                        print(
-                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})")
+                        print(
+                            f" {i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})"
+                        )
+
                     page_data["analyze_layout_tatr"] = {
                         "found": True,
                         "count": len(tatr_layout),
-                        "regions": tatr_info
+                        "regions": tatr_info,
                     }
-
+
                     # Save TATR layout analysis to file
                     tatr_summary = f"TATR Layout Analysis\n{'='*50}\n"
                     tatr_summary += f"Found {len(tatr_layout)} regions:\n\n"
                     for i, region_info in enumerate(tatr_info):
                         tatr_summary += f"{i+1}. {region_info['type']} at {region_info['bbox']} (conf: {region_info['confidence']:.2f})\n"
-
+
                     with open(pdf_output_dir / f"page_{page_num + 1}_tatr_layout.txt", "w") as f:
                         f.write(tatr_summary)
-
+
                     # Try to get detailed table structure
                     try:
                         table_structure = page.find_table_structure()
@@ -221,11 +233,14 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                             page_data["table_structure"] = {
                                 "found": True,
                                 "count": len(table_structure),
-                                "details": table_details[:1000] + ("..." if len(table_details) > 1000 else "")
+                                "details": table_details[:1000]
+                                + ("..." if len(table_details) > 1000 else ""),
                             }
-
+
                             # Save table structure to file
-                            with open(pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w") as f:
+                            with open(
+                                pdf_output_dir / f"page_{page_num + 1}_table_structure.txt", "w"
+                            ) as f:
                                 f.write(table_details)
                         else:
                             page_data["table_structure"] = {"found": False}
@@ -240,171 +255,201 @@ def analyze_pdf(pdf_path, num_pages=1, output_folder="analysis_results", create_
                 print(f"❌ analyze_layout('tatr') failed: {e}")
                 page_data["analyze_layout_tatr"] = f"ERROR: {e}"
                 page_data["table_structure"] = f"ERROR: {e}"
-
+
             # 5. Find regions by model and save separate + combined files
             print(f"\n📍 REGION ANALYSIS - By Model:")
             try:
-                all_regions = page.find_all('region')
+                all_regions = page.find_all("region")
                 if all_regions and len(all_regions) > 0:
                     print(f"Total regions found: {len(all_regions)}")
-
+
                     # Group regions by model/source
-                    yolo_regions = [r for r in all_regions if getattr(r, 'model', '') == '' or getattr(r, 'model', '') == 'yolo']
-                    tatr_regions = [r for r in all_regions if getattr(r, 'model', '') == 'tatr']
-                    other_regions = [r for r in all_regions if getattr(r, 'model', '') not in ['', 'yolo', 'tatr']]
-
+                    yolo_regions = [
+                        r
+                        for r in all_regions
+                        if getattr(r, "model", "") == "" or getattr(r, "model", "") == "yolo"
+                    ]
+                    tatr_regions = [r for r in all_regions if getattr(r, "model", "") == "tatr"]
+                    other_regions = [
+                        r
+                        for r in all_regions
+                        if getattr(r, "model", "") not in ["", "yolo", "tatr"]
+                    ]
+
                     print(f" YOLO regions: {len(yolo_regions)}")
                     print(f" TATR regions: {len(tatr_regions)}")
                     print(f" Other regions: {len(other_regions)}")
-
+
                     # Save separate files for each model
                     if yolo_regions:
                         yolo_inspect = str(ElementCollection(yolo_regions).inspect(limit=1000))
-                        with open(pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w") as f:
-                            f.write(f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}")
-
+                        with open(
+                            pdf_output_dir / f"page_{page_num + 1}_yolo_regions.txt", "w"
+                        ) as f:
+                            f.write(
+                                f"YOLO Layout Regions ({len(yolo_regions)} found)\n{'='*50}\n\n{yolo_inspect}"
+                            )
+
                     if tatr_regions:
                         tatr_inspect = str(ElementCollection(tatr_regions).inspect(limit=1000))
-                        with open(pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w") as f:
-                            f.write(f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}")
-
-
+                        with open(
+                            pdf_output_dir / f"page_{page_num + 1}_tatr_regions.txt", "w"
+                        ) as f:
+                            f.write(
+                                f"TATR Layout Regions ({len(tatr_regions)} found)\n{'='*50}\n\n{tatr_inspect}"
+                            )
+
+                    # Combined regions inspect
                     all_inspect = str(all_regions.inspect(limit=1000))
                     print(f"Combined regions preview (first 500 chars):\n{all_inspect[:500]}...")
-
+
                     # Save combined regions file
                     with open(pdf_output_dir / f"page_{page_num + 1}_all_regions.txt", "w") as f:
                         f.write(f"All Layout Regions ({len(all_regions)} found)\n{'='*50}\n")
-                        f.write(f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n")
+                        f.write(
+                            f"YOLO: {len(yolo_regions)}, TATR: {len(tatr_regions)}, Other: {len(other_regions)}\n\n"
+                        )
                         f.write(all_inspect)
-
+
                     page_data["regions"] = {
                         "found": True,
                         "total_count": len(all_regions),
                         "yolo_count": len(yolo_regions),
                         "tatr_count": len(tatr_regions),
                         "other_count": len(other_regions),
-                        "inspect_preview": all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
+                        "inspect_preview": (
+                            all_inspect[:500] + "..." if len(all_inspect) > 500 else all_inspect
+                        ),
                     }
-
+
                 else:
                     print("No regions found")
                     page_data["regions"] = {"found": False}
             except Exception as e:
                 print(f"❌ region analysis failed: {e}")
                 page_data["regions"] = f"ERROR: {e}"
-
+
             # 6. General element inspection
             print(f"\n🔍 GENERAL ELEMENT INSPECTION:")
             try:
                 # Count different element types
-                all_elements = page.find_all('*')
+                all_elements = page.find_all("*")
                 if all_elements and len(all_elements) > 0:
                     print(f"Total elements: {len(all_elements)}")
-
+
                     # Full inspect output - shows complete breakdown
                     print(f"\nFull element breakdown (.inspect()):")
                     # Get string representation of inspect result (increased limit)
                     inspect_result = all_elements.inspect(limit=1000)
                     inspect_text = str(inspect_result)
                     print(inspect_text)
-
+
                     # Sample some elements for detailed inspection
                     sample_elements = all_elements[:10]  # First 10 elements
                     print(f"Sample of first 10 elements:")
                     elements_sample = []
                     for i, elem in enumerate(sample_elements):
-                        elem_type = getattr(elem, 'object_type', 'unknown')
-                        text_preview = getattr(elem, 'text', '')[:30] if hasattr(elem, 'text') else ''
+                        elem_type = getattr(elem, "object_type", "unknown")
+                        text_preview = (
+                            getattr(elem, "text", "")[:30] if hasattr(elem, "text") else ""
+                        )
                         elem_info = {
                             "type": elem_type,
                             "text": text_preview,
                             "x0": elem.x0,
-                            "top": elem.top
+                            "top": elem.top,
                         }
                         elements_sample.append(elem_info)
-                        print(
-                            f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})")
+                        print(
+                            f" {i+1}. {elem_type}: '{text_preview}' at ({elem.x0:.0f}, {elem.top:.0f})"
+                        )
+
                     page_data["elements_sample"] = {
                         "total_count": len(all_elements),
                         "full_inspect": inspect_text,
-                        "sample": elements_sample
+                        "sample": elements_sample,
                     }
-
+
                     # Save full inspect to file
-                    with open(pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w") as f:
+                    with open(
+                        pdf_output_dir / f"page_{page_num + 1}_all_elements_inspect.txt", "w"
+                    ) as f:
                         f.write(inspect_text)
-
+
                 else:
                     print("No elements found")
                     page_data["elements_sample"] = {"total_count": 0, "sample": []}
             except Exception as e:
                 print(f"❌ element inspection failed: {e}")
                 page_data["elements_sample"] = f"ERROR: {e}"
-
+
             # 7. Render page as image
             print(f"\n🖼️ RENDERING PAGE AS IMAGE:")
             try:
                 img = page.to_image(resolution=144)
                 print(f"Image: {img.width}x{img.height} pixels")
-
+
                 # Save image in output folder
                 img_filename = f"page_{page_num + 1}.png"
                 img_path = pdf_output_dir / img_filename
                 img.save(str(img_path))
                 print(f"Saved: {img_path}")
                 page_data["image_path"] = str(img_path)
-
+
             except Exception as e:
                 print(f"❌ image rendering failed: {e}")
                 page_data["image_path"] = f"ERROR: {e}"
-
+
             analysis_data["pages"].append(page_data)
-
+
             if page_num < pages_to_analyze - 1:
                 print("\n" + "=" * 80 + "\n")
-
+
         # Save complete analysis data as JSON
         import datetime
+
         analysis_data["analysis_timestamp"] = datetime.datetime.now().isoformat()
-
+
         summary_file = pdf_output_dir / "analysis_summary.json"
         with open(summary_file, "w") as f:
             json.dump(analysis_data, f, indent=2)
-
+
         print(f"\n✅ ANALYSIS COMPLETE")
         print(f"📊 Summary: Analyzed {pages_to_analyze} page(s) of {pdf_file.name}")
         print(f"📁 All results saved to: {pdf_output_dir}")
         print(f"📋 Summary JSON: {summary_file}")
-
+
     except Exception as e:
         print(f"❌ CRITICAL ERROR: {e}")
         import traceback
+
         traceback.print_exc()
 
 
 def main():
     """Main function"""
     if len(sys.argv) < 2:
-        print("Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]")
+        print(
+            "Usage: python pdf_analyzer.py <pdf_path> [num_pages] [output_folder] [--no-timestamp]"
+        )
         print("Example: python pdf_analyzer.py bad-pdfs/submissions/Focus.pdf 2 analysis_results")
         print(" python pdf_analyzer.py Focus.pdf 1 my_analysis --no-timestamp")
         sys.exit(1)
-
+
     pdf_path = sys.argv[1]
     num_pages = int(sys.argv[2]) if len(sys.argv) > 2 else 1
     output_folder = "analysis_results"
     create_timestamp_folder = True
-
+
     # Parse remaining arguments
     for arg in sys.argv[3:]:
         if arg == "--no-timestamp":
             create_timestamp_folder = False
         elif not arg.startswith("--"):
             output_folder = arg
-
+
     analyze_pdf(pdf_path, num_pages, output_folder, create_timestamp_folder)
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()