natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,71 @@
1
+ """
2
+ Example demonstrating the use of color names in selectors.
3
+ """
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ # Add the parent directory to the path to import the local package
8
+ sys.path.insert(0, str(Path(__file__).parent.parent))
9
+
10
+ from natural_pdf import PDF
11
+
12
def main():
    """Run the example.

    Looks up text on the first page of a PDF by color, writing red three
    equivalent ways in the selector (RGB tuple, color name, hex code), and
    prints how many elements each selector matched. Red, blue and green text
    is then highlighted and the page is saved as ``output/color_names.png``
    under the parent of this script's directory.

    The PDF path is taken from the first command-line argument; without one,
    ``pdfs/01-practice.pdf`` under that same parent directory is used.
    """
    project_root = Path(__file__).parent.parent

    # Get the PDF file path from command line args or use default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Use a default sample PDF
        pdf_path = str(project_root / "pdfs" / "01-practice.pdf")

    # Create a PDF object
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    print("\n=== Using Color Names in Selectors ===\n")

    # Different ways to specify the same red color
    print("Finding red text using different color specifications:")

    # Traditional RGB tuple
    red_text1 = page.find_all('text[color~=(1,0,0)]')
    print(f"- Using RGB tuple (1,0,0): Found {len(red_text1)} elements")

    # Using named color
    red_text2 = page.find_all('text[color~=red]')
    print(f"- Using named color 'red': Found {len(red_text2)} elements")

    # Using hex color
    red_text3 = page.find_all('text[color~=#ff0000]')
    print(f"- Using hex color '#ff0000': Found {len(red_text3)} elements")

    # Compare results (by match count only, not element identity)
    print("\nAre the results the same?",
          len(red_text1) == len(red_text2) == len(red_text3))

    # Highlight the found elements
    page.clear_highlights()
    red_text1.highlight(label="Red (RGB tuple)")

    # Try a different color by name
    blue_text = page.find_all('text[color~=blue]')
    blue_text.highlight(label="Blue (named color)")

    green_text = page.find_all('text[color~=#00ff00]')
    green_text.highlight(label="Green (hex color)")

    print("\nHighlighting the found elements...")

    # Save the highlighted image. The output directory is created first so
    # the example also works on a fresh checkout where it does not exist yet.
    output_dir = project_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = str(output_dir / "color_names.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Image saved to {output_path}")

    # Show more information about the colors
    if red_text1:
        print("\nExample red text element:")
        print(f"- Text: {red_text1.first.text}")
        print(f"- Color: {red_text1.first.color}")


if __name__ == "__main__":
    main()
examples/color_test.py ADDED
@@ -0,0 +1,62 @@
1
+ """
2
+ Test script to verify color conversion in the highlight system.
3
+ """
4
+ import os
5
+ import sys
6
+ from typing import List, Dict, Tuple, Optional, Union, Any, Set
7
+
8
+ # Add the parent directory to the path to import the package
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
def test_color_conversion():
    """Test the color conversion logic directly without relying on the PDF.

    Runs a local copy of the color normalization over a fixed set of sample
    colors and prints each input next to its normalized output. Nothing is
    asserted; the printed output is meant to be inspected by eye.
    """
    print("Testing color conversion logic...")

    # Test the same logic we added to highlighting.py
    def normalize_color(color) -> Tuple[int, int, int, int]:
        """Normalize color tuple to 0-255 integer format.

        Floats up to 1.0 are scaled by 255 and truncated, larger floats are
        truncated as-is, and non-float components pass through unchanged.
        A three-component color gets an alpha of 100 appended. Anything that
        is not a tuple yields the default semi-transparent yellow.
        """
        if not isinstance(color, tuple):
            # Default if invalid color is provided
            return (255, 255, 0, 100)  # Yellow with semi-transparency

        # Convert values to integers in 0-255 range
        processed_color = []
        for c in color:
            if isinstance(c, float):
                if c <= 1.0:
                    # 0.0-1.0 float format
                    processed_color.append(int(c * 255))
                else:
                    # Already in 0-255 range but as float
                    processed_color.append(int(c))
            else:
                processed_color.append(c)

        # Default alpha value if needed
        if len(processed_color) == 3:
            processed_color.append(100)  # Default alpha

        return tuple(processed_color)

    # Test various color formats
    test_cases = [
        ((255, 0, 0, 128), "Integer RGB with alpha"),
        ((255, 0, 0), "Integer RGB without alpha"),
        ((0.0, 1.0, 0.0, 0.5), "Float RGB with alpha (0-1)"),
        ((0.0, 1.0, 0.0), "Float RGB without alpha (0-1)"),
        ((0.5, 0.5, 255, 0.7), "Mixed float and integer"),
        ((0.5, 0.5, 255), "Mixed without alpha"),
        ((128.5, 64.3, 200.7, 50.9), "Float values > 1"),
        (None, "None case"),
    ]

    for color, desc in test_cases:
        print(f"\nTesting: {desc}")
        print(f"Input: {color}")
        result = normalize_color(color)
        print(f"Output: {result}")

    print("\nTest complete!")


if __name__ == "__main__":
    test_color_conversion()
examples/debug_ocr.py ADDED
@@ -0,0 +1,91 @@
1
+ """
2
+ Debug OCR issues.
3
+ """
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ # Add parent directory to path for imports
9
+ sys.path.insert(0, str(Path(__file__).parent.parent))
10
+
11
+ from natural_pdf import PDF
12
+ from natural_pdf.ocr import EasyOCREngine
13
+
14
# Get current directory; the PDF and output folders are resolved against the
# parent of this script's directory so the script works from any cwd.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print("OCR Debug Test")
print("=============")

# Check if OCR engines are available
try:
    import easyocr
    print("EasyOCR is available.")
except ImportError:
    print("EasyOCR is not available.")

try:
    import paddleocr
    import paddle
    print("PaddleOCR is available.")
except ImportError:
    print("PaddleOCR is not available.")

# Test with EasyOCR directly (explicit configuration)
print("\n1. Testing with explicit EasyOCR engine and forced enabled")
ocr_settings = {
    "enabled": True,
    "languages": ["en"],
    "min_confidence": 0.3,
}

# Open the PDF as a context manager so it is closed even if a step below
# raises part-way through.
with PDF(default_pdf, ocr_engine="easyocr", ocr=ocr_settings) as pdf:
    # Get the page
    print("Getting page...")
    page = pdf.pages[0]

    # Print OCR config (reads private attributes; debugging aid only)
    print(f"PDF OCR config: {pdf._ocr_config}")
    print(f"OCR engine type: {type(pdf._ocr_engine)}")

    # Generate page image for debugging
    print("Generating debug image of the page...")
    img = page.to_image()
    img_path = os.path.join(output_dir, "debug_page_image.png")
    img.save(img_path)
    print(f"Saved page image to {img_path}")

    # Force OCR extraction
    print("Forcing OCR extraction...")
    ocr_elements = page.extract_ocr_elements()
    print(f"Extracted {len(ocr_elements)} OCR elements")

    # Print details of first few elements if any
    if ocr_elements:
        for i, elem in enumerate(ocr_elements[:3]):
            print(f"Element {i+1}: '{elem.text}' (conf: {elem.confidence:.2f})")
    else:
        print("No OCR elements found!")

    # Extract text with OCR
    print("Extracting text with OCR=True...")
    text = page.extract_text(ocr=True)
    print(f"Extracted {len(text)} characters of text")
    print(f"First 100 chars: {text[:100]}...")

    # Create a debug image
    print("Creating debug visualization...")
    page.clear_highlights()
    for elem in ocr_elements:
        elem.highlight(label=f"OCR ({elem.confidence:.2f})")

    output_path = os.path.join(output_dir, "ocr_debug.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Saved debug image to {output_path}")

print("\nTest complete!")
@@ -0,0 +1,148 @@
1
+ """
2
+ Direct OCR test script to debug OCR issues.
3
+ """
4
+ import os
5
+ import sys
6
+ from PIL import Image
7
+ import numpy as np
8
+
9
+ # Add the project directory to the path to import the library
10
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
11
+ from natural_pdf import PDF
12
+
13
# Select a PDF file to test. Paths are resolved against the project root (the
# parent of this script's directory, the same root used for sys.path and the
# output folder) so the script does not depend on the current working directory.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PDF_FILE = os.path.join(_PROJECT_ROOT, "pdfs", "HARRY ROQUE_redacted.pdf")
if not os.path.exists(PDF_FILE):
    # Fallback to another file if needed
    PDF_FILE = os.path.join(_PROJECT_ROOT, "pdfs", "01-practice.pdf")
17
+
18
def test_direct_ocr():
    """Test OCR engines directly.

    Renders the first page of ``PDF_FILE`` to an image, saves it under the
    project's ``output`` directory, and feeds it straight to EasyOCR and then
    PaddleOCR, printing the number of detections and the first five results.

    Each engine runs in its own ``try`` block: a missing engine is reported
    and skipped, and any other error is printed with a traceback so that one
    engine failing does not prevent the other from running.
    """
    # Create output directory
    output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "output")
    os.makedirs(output_dir, exist_ok=True)

    # Direct test with EasyOCR
    print("\n=== Direct test with EasyOCR ===")
    try:
        import easyocr
        # Use the provided PDF file
        with PDF(PDF_FILE) as pdf:
            # Get the first page
            page = pdf.pages[0]
            # Convert to image
            image = page.to_image()
            image_path = os.path.join(output_dir, "easyocr_test_input.png")
            image.save(image_path)
            print(f"Saved image to {image_path}")

            # Run EasyOCR directly
            reader = easyocr.Reader(['en'])
            results = reader.readtext(np.array(image))
            print(f"EasyOCR found {len(results)} text elements")

            # Print results
            for i, (_bbox, text, conf) in enumerate(results[:5]):
                print(f"Result {i+1}: '{text}' (Confidence: {conf:.2f})")

            print("EasyOCR direct test successful")
    except ImportError:
        print("EasyOCR not available")
    except Exception as e:
        print(f"Error in EasyOCR direct test: {e}")
        import traceback
        traceback.print_exc()

    # Direct test with PaddleOCR
    print("\n=== Direct test with PaddleOCR ===")
    try:
        import paddleocr
        # Use the provided PDF file
        with PDF(PDF_FILE) as pdf:
            # Get the first page
            page = pdf.pages[0]
            # Convert to image
            image = page.to_image()
            image_path = os.path.join(output_dir, "paddleocr_test_input.png")
            image.save(image_path)
            print(f"Saved image to {image_path}")

            # Run PaddleOCR directly
            reader = paddleocr.PaddleOCR(lang='en')
            results = reader.ocr(np.array(image), cls=False)

            if results is not None and len(results) > 0:
                page_result = results[0] if isinstance(results[0], list) else results
                # PaddleOCR can report a page without text as None; drop such
                # entries so they are neither counted nor passed to len().
                detections = [d for d in page_result if d is not None]
                print(f"PaddleOCR found {len(detections)} text elements")

                # Print results
                for i, detection in enumerate(detections[:5]):
                    if len(detection) >= 2:
                        # detection is (bbox, (text, confidence)); the pair
                        # may arrive as a tuple or a list.
                        text_conf = detection[1]
                        is_pair = isinstance(text_conf, (tuple, list)) and len(text_conf) >= 2
                        text = text_conf[0] if is_pair else str(text_conf)
                        conf = text_conf[1] if is_pair else 1.0
                        print(f"Result {i+1}: '{text}' (Confidence: {conf:.2f})")
            else:
                print(f"PaddleOCR returned no results: {results}")

            print("PaddleOCR direct test complete")
    except ImportError:
        print("PaddleOCR not available")
    except Exception as e:
        print(f"Error in PaddleOCR direct test: {e}")
        import traceback
        traceback.print_exc()
96
+
97
def test_library_ocr():
    """Test OCR integration with the library.

    Opens ``PDF_FILE`` once per engine (EasyOCR, then PaddleOCR) with OCR
    enabled for English, extracts OCR elements from the first page, and
    prints the element count and the first five results. Errors for one
    engine are printed with a traceback and do not stop the next engine.
    """
    engines = (("easyocr", "EasyOCR"), ("paddleocr", "PaddleOCR"))

    for engine_key, engine_label in engines:
        print(f"\n=== Test library integration with {engine_label} ===")
        try:
            # Create a PDF with explicit OCR config
            ocr_options = {"enabled": True, "languages": ["en"]}
            with PDF(PDF_FILE, ocr=ocr_options, ocr_engine=engine_key) as pdf:
                first_page = pdf.pages[0]

                # Extract text with OCR
                print("Running OCR through library...")
                found = first_page.extract_ocr_elements()

                print(f"Library OCR found {len(found)} text elements")

                # Print results
                for position, item in enumerate(found[:5], start=1):
                    print(f"Result {position}: '{item.text}' (Confidence: {item.confidence:.2f})")

                print(f"Library OCR with {engine_label} test complete")
        except Exception as e:
            print(f"Error in library OCR with {engine_label} test: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    test_direct_ocr()
    test_library_ocr()
@@ -0,0 +1,99 @@
1
+ """
2
+ Direct test of PaddlePaddle's PPStructure functionality.
3
+
4
+ This script bypasses our library and directly uses paddleocr to test layout detection.
5
+ """
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ import cv2
10
+
11
+ try:
12
+ from paddleocr import PPStructure
13
+ except ImportError:
14
+ print("PaddleOCR not installed. Run: pip install paddlepaddle paddleocr")
15
+ sys.exit(1)
16
+
17
# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Check command line args
if len(sys.argv) > 1:
    image_path = sys.argv[1]
else:
    # Convert first page of PDF to image since PPStructure needs an image
    import fitz  # PyMuPDF
    pdf_path = default_pdf
    print(f"Converting first page of {pdf_path} to image...")

    # Make sure the destination folder exists before writing the image
    output_dir = os.path.join(root_dir, "output")
    os.makedirs(output_dir, exist_ok=True)
    image_path = os.path.join(output_dir, "direct_paddle_test.png")

    # The document is only needed for rendering; close it right afterwards
    with fitz.open(pdf_path) as pdf_doc:
        page = pdf_doc[0]

        # Render page at higher resolution
        zoom = 2.0  # Increase resolution
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Save as image
        pix.save(image_path)
    print(f"Saved image to {image_path}")

# Ensure image exists
if not os.path.exists(image_path):
    print(f"Image doesn't exist: {image_path}")
    sys.exit(1)

print(f"Running PPStructure on {image_path}...")

# Initialize PP-Structure with minimal settings
table_engine = PPStructure(show_log=True)

try:
    # Run layout analysis
    result = table_engine(image_path)

    # Print results
    print(f"Found {len(result)} layout regions:")
    for i, region in enumerate(result):
        region_type = region.get('type', 'unknown')
        bbox = region.get('bbox', [])
        confidence = region.get('score', 0)
        print(f"{i+1}. Type: {region_type}, Confidence: {confidence:.4f}, BBox: {bbox}")

        # Check for OCR text inside the region
        if 'res' in region:
            if isinstance(region['res'], dict) and 'text' in region['res']:
                print(f" Text: {region['res']['text'][:50]}...")
            elif isinstance(region['res'], dict) and 'cells' in region['res']:
                print(f" Table with {len(region['res']['cells'])} cells")
            else:
                print(f" Has result data: {type(region['res'])}")

    # Try directly with PaddleOCR for layout analysis
    from paddleocr import PaddleOCR
    print("\nTrying with direct PaddleOCR...")

    ocr_engine = PaddleOCR(lang="en", show_log=True)
    layout_result = ocr_engine.ocr(image_path, det=True, rec=True, cls=False)

    # The per-page entry can be None when no text was detected, so check it
    # as well as the outer list before indexing into it.
    first_page = layout_result[0] if layout_result else None
    if first_page:
        print(f"PaddleOCR found text elements on page 1: {len(first_page)}")

        # Print first few elements; each line is (points, (text, confidence))
        for i, line in enumerate(first_page[:5]):
            text = line[1][0]  # Text content
            confidence = line[1][1]  # Confidence score
            print(f" {i+1}. Text: '{text}', Confidence: {confidence:.4f}")
    else:
        print("PaddleOCR found no elements")

except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
@@ -0,0 +1,165 @@
1
+ """
2
+ Direct Document QA example that closely mirrors the original pdfplumber implementation.
3
+
4
+ This example shows how to:
5
+ 1. Use pdfplumber directly to extract words and images
6
+ 2. Use transformers pipelines for document QA
7
+ 3. Compare with the Natural PDF implementation
8
+
9
+ It's intentionally similar to the original code provided by the user.
10
+ """
11
+
12
+ import os
13
+ import sys
14
+ import argparse
15
+ import pdfplumber
16
+ from PIL import Image, ImageDraw
17
+ import numpy as np
18
+
19
+ # Add parent directory to path to run without installing
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+
22
+ # For comparison
23
+ from natural_pdf import PDF, configure_logging
24
+ import logging
25
+
26
def pdfplumber_qa(pdf_path, question, debug=False):
    """Run QA using direct pdfplumber code similar to the original example.

    Renders the first page of the PDF, extracts its words with pdfplumber and
    passes image, question and word boxes to the transformers
    ``document-question-answering`` pipeline.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        question: Question to ask about the first page.
        debug: When true, save the page image and an overlay of the numbered
            word boxes into ``output/`` (relative to the working directory).

    Returns:
        A dict with ``answer``, ``confidence`` and ``found``. On success it
        also has ``start`` and ``end``; if the pipeline import or call fails
        it has ``error`` instead and ``found`` is False.
    """
    # Open PDF; the context manager closes the file once the image and words
    # have been pulled out, which is all the rest of the function needs.
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]

        # Get image
        image = page.to_image(resolution=300).original

        # Extract words
        words = page.extract_words()

    # Build word boxes in the expected format: [text, [x0, top, x1, bottom]]
    # NOTE(review): boxes use pdfplumber's word coordinates while the image is
    # rendered at resolution=300, so the debug overlay may not line up with
    # the rendered text - confirm before relying on it.
    word_boxes = [
        [
            word['text'],
            [int(word["x0"]), int(word["top"]), int(word["x1"]), int(word["bottom"])],
        ]
        for word in words
    ]

    # Debug visualization
    if debug:
        os.makedirs("output", exist_ok=True)

        # Save image
        image.save("output/direct_qa_image.png")

        # Save visualization
        vis_image = image.copy()
        draw = ImageDraw.Draw(vis_image)

        for i, (text, box) in enumerate(word_boxes):
            x0, y0, x1, y1 = box
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
            draw.text((x0, y0), str(i), fill=(255, 0, 0))

        vis_image.save("output/direct_qa_boxes.png")

    # Use transformers pipeline
    try:
        from transformers import pipeline

        pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

        # Run query
        query = {"image": image, "question": question, "word_boxes": word_boxes}

        result = pipe(query)[0]

        # Create result dictionary similar to Natural PDF's format
        return {
            "answer": result.get("answer", ""),
            "confidence": result.get("score", 0.0),
            "start": result.get("start", 0),
            "end": result.get("end", 0),
            "found": bool(result.get("answer")),
        }

    except Exception as e:
        # Best-effort example: report the failure in the result instead of raising
        print(f"Error in direct QA: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": str(e),
            "found": False,
        }
93
+
94
def main():
    """Command-line entry point for the direct Document QA example.

    Parses the PDF path, question, ``--debug`` and ``--compare`` options,
    runs the direct pdfplumber QA and prints its answer. With ``--compare``
    the same question is asked through Natural PDF, the answer's source
    elements are highlighted and saved to ``output/natural_pdf_answer.png``,
    and the two answers are compared for exact equality.
    """
    parser = argparse.ArgumentParser(description="Direct Document QA Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    parser.add_argument("--question", default="How many votes for Harris and Walz?",
                        help="Question to ask about the document")
    parser.add_argument("--debug", action="store_true",
                        help="Save debug information for troubleshooting")
    parser.add_argument("--compare", action="store_true",
                        help="Compare with Natural PDF implementation")

    args = parser.parse_args()

    # Configure logging for Natural PDF
    if args.debug:
        configure_logging(level=logging.DEBUG)
    else:
        configure_logging(level=logging.INFO)

    print(f"Document: {args.pdf_path}")
    print(f"Question: {args.question}")

    # Run direct pdfplumber QA
    print("\n=== Direct pdfplumber implementation ===")
    result = pdfplumber_qa(args.pdf_path, args.question, debug=args.debug)

    if result.get("found", False):
        print(f"Answer: {result['answer']}")
        print(f"Confidence: {result['confidence']:.2f}")
    else:
        print(f"No answer found: {result.get('error', '')}")

    # Compare with Natural PDF if requested
    if args.compare:
        print("\n=== Natural PDF implementation ===")

        # Use Natural PDF
        pdf = PDF(args.pdf_path)
        page = pdf.pages[0]

        # Ask the question
        natural_result = page.ask(args.question, debug=args.debug)

        if natural_result.get("found", False):
            print(f"Answer: {natural_result['answer']}")
            print(f"Confidence: {natural_result['confidence']:.2f}")

            # Highlight the answer
            if natural_result.get("source_elements"):
                for element in natural_result["source_elements"]:
                    element.highlight(color=(1, 0.5, 0, 0.5))

                # Save the image. pdfplumber_qa only creates output/ when
                # --debug is set, so make sure it exists here as well.
                os.makedirs("output", exist_ok=True)
                page.save_image("output/natural_pdf_answer.png")
                print("Saved highlighted answer to output/natural_pdf_answer.png")
        else:
            print(f"No answer found: {natural_result.get('error', '')}")

        # Compare results (kept inside the --compare branch, the only place
        # where natural_result is defined)
        if result.get("found", False) and natural_result.get("found", False):
            print("\n=== Comparison ===")
            print(f"Direct answer: '{result['answer']}' (confidence: {result['confidence']:.2f})")
            print(f"Natural PDF answer: '{natural_result['answer']}' (confidence: {natural_result['confidence']:.2f})")

            # Calculate similarity
            if result['answer'] == natural_result['answer']:
                print("Results match exactly!")
            else:
                print("Results differ.")


if __name__ == "__main__":
    main()