natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,71 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the use of color names in selectors.
|
3
|
+
"""
|
4
|
+
import sys
|
5
|
+
from pathlib import Path
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the local package
|
8
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def main():
    """Run the color-name selector example.

    The PDF path is taken from the first command-line argument when one is
    given; otherwise the sample ``pdfs/01-practice.pdf`` under the project
    root (the parent of this script's directory) is used.

    The first page is searched for red text using three color
    specifications (RGB tuple, color name, hex string), the match counts are
    printed and compared, red/blue/green matches are highlighted, and the
    annotated page is written to ``output/color_names.png`` under the
    project root.
    """
    project_root = Path(__file__).parent.parent

    # Get the PDF file path from command line args or use default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        # Use a default sample PDF
        pdf_path = str(project_root / "pdfs" / "01-practice.pdf")

    # Create a PDF object
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    print("\n=== Using Color Names in Selectors ===\n")

    # Different ways to specify the same red color
    print("Finding red text using different color specifications:")

    # Traditional RGB tuple
    red_text1 = page.find_all('text[color~=(1,0,0)]')
    print(f"- Using RGB tuple (1,0,0): Found {len(red_text1)} elements")

    # Using named color
    red_text2 = page.find_all('text[color~=red]')
    print(f"- Using named color 'red': Found {len(red_text2)} elements")

    # Using hex color
    red_text3 = page.find_all('text[color~=#ff0000]')
    print(f"- Using hex color '#ff0000': Found {len(red_text3)} elements")

    # Compare results (only the match counts are compared, not the elements)
    print("\nAre the results the same?",
          len(red_text1) == len(red_text2) == len(red_text3))

    # Highlight the found elements
    page.clear_highlights()
    red_text1.highlight(label="Red (RGB tuple)")

    # Try a different color by name
    blue_text = page.find_all('text[color~=blue]')
    blue_text.highlight(label="Blue (named color)")

    green_text = page.find_all('text[color~=#00ff00]')
    green_text.highlight(label="Green (hex color)")

    print("\nHighlighting the found elements...")

    # Save the highlighted image.  Nothing else in this script creates the
    # output directory, so make sure it exists before writing into it.
    output_dir = project_root / "output"
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = str(output_dir / "color_names.png")
    page.to_image(path=output_path, show_labels=True)
    print(f"Image saved to {output_path}")

    # Show more information about the colors
    if red_text1:
        print("\nExample red text element:")
        print(f"- Text: {red_text1.first.text}")
        print(f"- Color: {red_text1.first.color}")


if __name__ == "__main__":
    main()
|
examples/color_test.py
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
"""
|
2
|
+
Test script to verify color conversion in the highlight system.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
from typing import List, Dict, Tuple, Optional, Union, Any, Set
|
7
|
+
|
8
|
+
# Add the parent directory to the path to import the package
|
9
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
10
|
+
|
11
|
+
def test_color_conversion():
    """Print the result of normalizing a fixed set of sample colors.

    The nested ``normalize_color`` helper is a local copy of the conversion
    logic under test, so this runs without opening a PDF.  For each sample
    the description, the input and the normalized output are printed.
    """
    print("Testing color conversion logic...")

    # Test the same logic we added to highlighting.py
    def normalize_color(color) -> Tuple[int, int, int, int]:
        """Normalize color tuple to 0-255 integer format."""
        # Anything that is not a tuple falls back to the default color.
        if not isinstance(color, tuple):
            return (255, 255, 0, 100)  # Yellow with semi-transparency

        def to_channel(value):
            # Non-float components are passed through untouched.
            if not isinstance(value, float):
                return value
            # Floats up to 1.0 are treated as 0.0-1.0 fractions; larger
            # floats are taken to be 0-255 values and simply truncated.
            return int(value * 255) if value <= 1.0 else int(value)

        channels = [to_channel(component) for component in color]

        # Default alpha value if needed
        if len(channels) == 3:
            channels.append(100)

        return tuple(channels)

    # Test various color formats
    samples = [
        ((255, 0, 0, 128), "Integer RGB with alpha"),
        ((255, 0, 0), "Integer RGB without alpha"),
        ((0.0, 1.0, 0.0, 0.5), "Float RGB with alpha (0-1)"),
        ((0.0, 1.0, 0.0), "Float RGB without alpha (0-1)"),
        ((0.5, 0.5, 255, 0.7), "Mixed float and integer"),
        ((0.5, 0.5, 255), "Mixed without alpha"),
        ((128.5, 64.3, 200.7, 50.9), "Float values > 1"),
        (None, "None case"),
    ]

    for sample, description in samples:
        print(f"\nTesting: {description}")
        print(f"Input: {sample}")
        normalized = normalize_color(sample)
        print(f"Output: {normalized}")

    print("\nTest complete!")


if __name__ == "__main__":
    test_color_conversion()
|
examples/debug_ocr.py
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
"""
|
2
|
+
Debug OCR issues.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
# Add parent directory to path for imports
|
9
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
10
|
+
|
11
|
+
from natural_pdf import PDF
|
12
|
+
from natural_pdf.ocr import EasyOCREngine
|
13
|
+
|
14
|
+
# Resolve the project layout relative to this script: the sample PDF lives in
# <root>/pdfs and all artifacts are written to <root>/output.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

print("OCR Debug Test")
print("=============")

# Report which optional OCR backends can be imported
try:
    import easyocr
except ImportError:
    print("EasyOCR is not available.")
else:
    print("EasyOCR is available.")

try:
    import paddleocr
    import paddle
except ImportError:
    print("PaddleOCR is not available.")
else:
    print("PaddleOCR is available.")

# Open the document with an explicit EasyOCR configuration
print("\n1. Testing with explicit EasyOCR engine and forced enabled")
pdf = PDF(
    default_pdf,
    ocr_engine="easyocr",
    ocr={
        "enabled": True,
        "languages": ["en"],
        "min_confidence": 0.3,
    },
)

# Work on the first page only
print("Getting page...")
page = pdf.pages[0]

# Show what configuration the PDF object ended up with
print(f"PDF OCR config: {pdf._ocr_config}")
print(f"OCR engine type: {type(pdf._ocr_engine)}")

# Save a plain rendering of the page for reference
print("Generating debug image of the page...")
img = page.to_image()
img_path = os.path.join(output_dir, "debug_page_image.png")
img.save(img_path)
print(f"Saved page image to {img_path}")

# Run OCR explicitly and report how many elements came back
print("Forcing OCR extraction...")
ocr_elements = page.extract_ocr_elements()
print(f"Extracted {len(ocr_elements)} OCR elements")

# Show up to three sample elements, or say that nothing was found
if not ocr_elements:
    print("No OCR elements found!")
else:
    for i, elem in enumerate(ocr_elements[:3], start=1):
        print(f"Element {i}: '{elem.text}' (conf: {elem.confidence:.2f})")

# Text extraction with OCR switched on
print("Extracting text with OCR=True...")
text = page.extract_text(ocr=True)
print(f"Extracted {len(text)} characters of text")
print(f"First 100 chars: {text[:100]}...")

# Highlight every OCR element, labelled with its confidence, and save the page
print("Creating debug visualization...")
page.clear_highlights()
for elem in ocr_elements:
    elem.highlight(label=f"OCR ({elem.confidence:.2f})")

output_path = os.path.join(output_dir, "ocr_debug.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved debug image to {output_path}")

print("\nTest complete!")
|
@@ -0,0 +1,148 @@
|
|
1
|
+
"""
|
2
|
+
Direct OCR test script to debug OCR issues.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
from PIL import Image
|
7
|
+
import numpy as np
|
8
|
+
|
9
|
+
# Add the project directory to the path to import the library
|
10
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
11
|
+
from natural_pdf import PDF
|
12
|
+
|
13
|
+
# Select a PDF file to test.  The paths are anchored to the project root (the
# parent of this script's directory) rather than the current working
# directory, so the script behaves the same wherever it is launched from.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
PDF_FILE = os.path.join(_PROJECT_ROOT, "pdfs", "HARRY ROQUE_redacted.pdf")
if not os.path.exists(PDF_FILE):
    # Fallback to another file if needed
    PDF_FILE = os.path.join(_PROJECT_ROOT, "pdfs", "01-practice.pdf")
|
17
|
+
|
18
|
+
def test_direct_ocr():
    """Test OCR engines directly.

    Renders the first page of ``PDF_FILE`` to an image, saves it under the
    project's ``output`` directory, and feeds it straight to EasyOCR and then
    to PaddleOCR.  For each engine the number of detections and the first
    five results are printed.

    A missing engine is reported and skipped.  Any other failure is printed
    together with its traceback so that the remaining engine still runs.
    """
    import traceback

    # Create output directory
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    output_dir = os.path.join(project_root, "output")
    os.makedirs(output_dir, exist_ok=True)

    # Direct test with EasyOCR
    print("\n=== Direct test with EasyOCR ===")
    try:
        import easyocr

        # Use the provided PDF file
        with PDF(PDF_FILE) as pdf:
            # Render the first page and keep a copy of the OCR input on disk
            page = pdf.pages[0]
            image = page.to_image()
            image_path = os.path.join(output_dir, "easyocr_test_input.png")
            image.save(image_path)
            print(f"Saved image to {image_path}")

            # Run EasyOCR directly
            reader = easyocr.Reader(['en'])
            results = reader.readtext(np.array(image))
            print(f"EasyOCR found {len(results)} text elements")

            # Print results; each one is unpacked as (bbox, text, confidence)
            for i, (bbox, text, conf) in enumerate(results[:5], start=1):
                print(f"Result {i}: '{text}' (Confidence: {conf:.2f})")

            print("EasyOCR direct test successful")
    except ImportError:
        print("EasyOCR not available")
    except Exception as e:
        # Deliberately broad: this is a diagnostic script and the PaddleOCR
        # section below should still run after an EasyOCR failure.
        print(f"Error in EasyOCR direct test: {e}")
        traceback.print_exc()

    # Direct test with PaddleOCR
    print("\n=== Direct test with PaddleOCR ===")
    try:
        import paddleocr

        # Use the provided PDF file
        with PDF(PDF_FILE) as pdf:
            # Render the first page and keep a copy of the OCR input on disk
            page = pdf.pages[0]
            image = page.to_image()
            image_path = os.path.join(output_dir, "paddleocr_test_input.png")
            image.save(image_path)
            print(f"Saved image to {image_path}")

            # Run PaddleOCR directly
            reader = paddleocr.PaddleOCR(lang='en')
            results = reader.ocr(np.array(image), cls=False)

            # The per-image entry can be None when nothing was detected;
            # treat that the same as an empty result instead of counting
            # the None as a detection.
            has_detections = (
                results is not None
                and len(results) > 0
                and results[0] is not None
            )
            if has_detections:
                page_result = results[0] if isinstance(results[0], list) else results
                print(f"PaddleOCR found {len(page_result)} text elements")

                # Print results
                for i, detection in enumerate(page_result[:5], start=1):
                    if detection is None or len(detection) < 2:
                        continue
                    bbox = detection[0]
                    text_conf = detection[1]
                    # Accept the (text, score) pair as either tuple or list.
                    if isinstance(text_conf, (tuple, list)) and len(text_conf) >= 2:
                        text, conf = text_conf[0], text_conf[1]
                    else:
                        text, conf = str(text_conf), 1.0
                    print(f"Result {i}: '{text}' (Confidence: {conf:.2f})")
            else:
                print(f"PaddleOCR returned no results: {results}")

            print("PaddleOCR direct test complete")
    except ImportError:
        print("PaddleOCR not available")
    except Exception as e:
        print(f"Error in PaddleOCR direct test: {e}")
        traceback.print_exc()
|
96
|
+
|
97
|
+
def test_library_ocr():
    """Test OCR integration with the library.

    Opens ``PDF_FILE`` once per engine (EasyOCR, then PaddleOCR) with OCR
    enabled for English, extracts OCR elements from the first page, and
    prints the element count plus the first five results.  A failure with
    one engine is printed with its traceback and does not stop the other.
    """
    # (engine identifier passed to PDF, name used in the printed messages)
    engines = (("easyocr", "EasyOCR"), ("paddleocr", "PaddleOCR"))

    for engine_name, label in engines:
        print(f"\n=== Test library integration with {label} ===")
        try:
            # Create a PDF with explicit OCR config
            ocr_options = {"enabled": True, "languages": ["en"]}
            with PDF(PDF_FILE, ocr=ocr_options, ocr_engine=engine_name) as pdf:
                first_page = pdf.pages[0]

                # Extract text with OCR
                print("Running OCR through library...")
                found = first_page.extract_ocr_elements()

                print(f"Library OCR found {len(found)} text elements")

                # Print results
                for position, item in enumerate(found[:5], start=1):
                    print(f"Result {position}: '{item.text}' (Confidence: {item.confidence:.2f})")

                print(f"Library OCR with {label} test complete")
        except Exception as e:
            print(f"Error in library OCR with {label} test: {e}")
            import traceback
            traceback.print_exc()


if __name__ == "__main__":
    test_direct_ocr()
    test_library_ocr()
|
@@ -0,0 +1,99 @@
|
|
1
|
+
"""
|
2
|
+
Direct test of PaddlePaddle's PPStructure functionality.
|
3
|
+
|
4
|
+
This script bypasses our library and directly uses paddleocr to test layout detection.
|
5
|
+
"""
|
6
|
+
import os
|
7
|
+
import sys
|
8
|
+
from pathlib import Path
|
9
|
+
import cv2
|
10
|
+
|
11
|
+
try:
|
12
|
+
from paddleocr import PPStructure
|
13
|
+
except ImportError:
|
14
|
+
print("PaddleOCR not installed. Run: pip install paddlepaddle paddleocr")
|
15
|
+
sys.exit(1)
|
16
|
+
|
17
|
+
# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Check command line args: an explicit image path wins; otherwise the first
# page of the default PDF is rendered to an image for PPStructure to read.
if len(sys.argv) > 1:
    image_path = sys.argv[1]
else:
    # Convert first page of PDF to image since PPStructure needs an image
    import fitz  # PyMuPDF
    pdf_path = default_pdf
    print(f"Converting first page of {pdf_path} to image...")

    # Nothing else in this script creates the output directory, so make sure
    # it exists before saving the rendered page into it.
    output_dir = os.path.join(root_dir, "output")
    os.makedirs(output_dir, exist_ok=True)
    image_path = os.path.join(output_dir, "direct_paddle_test.png")

    pdf_doc = fitz.open(pdf_path)
    try:
        page = pdf_doc[0]

        # Render page at higher resolution
        zoom = 2.0  # Increase resolution
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        # Save as image
        pix.save(image_path)
    finally:
        # Release the document even if rendering or saving failed.
        pdf_doc.close()
    print(f"Saved image to {image_path}")

# Ensure image exists
if not os.path.exists(image_path):
    print(f"Image doesn't exist: {image_path}")
    sys.exit(1)

print(f"Running PPStructure on {image_path}...")

# Initialize PP-Structure with minimal settings
table_engine = PPStructure(show_log=True)

try:
    # Run layout analysis
    result = table_engine(image_path)

    # Print results
    print(f"Found {len(result)} layout regions:")
    for i, region in enumerate(result):
        region_type = region.get('type', 'unknown')
        bbox = region.get('bbox', [])
        confidence = region.get('score', 0)
        print(f"{i+1}. Type: {region_type}, Confidence: {confidence:.4f}, BBox: {bbox}")

        # Check for OCR text inside the region
        if 'res' in region:
            if isinstance(region['res'], dict) and 'text' in region['res']:
                print(f"   Text: {region['res']['text'][:50]}...")
            elif isinstance(region['res'], dict) and 'cells' in region['res']:
                print(f"   Table with {len(region['res']['cells'])} cells")
            else:
                print(f"   Has result data: {type(region['res'])}")

    # Try directly with PaddleOCR for layout analysis
    from paddleocr import PaddleOCR
    print("\nTrying with direct PaddleOCR...")

    ocr_engine = PaddleOCR(lang="en", show_log=True)
    layout_result = ocr_engine.ocr(image_path, det=True, rec=True, cls=False)

    # The per-image entry can be None/empty when nothing was detected, so
    # check it as well before taking its length.
    if layout_result and layout_result[0]:
        print(f"PaddleOCR found text elements on page 1: {len(layout_result[0])}")

        # Print first few elements
        for i, line in enumerate(layout_result[0][:5]):
            points = line[0]  # Coordinates: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
            text = line[1][0]  # Text content
            confidence = line[1][1]  # Confidence score
            print(f"  {i+1}. Text: '{text}', Confidence: {confidence:.4f}")
    else:
        print("PaddleOCR found no elements")

except Exception as e:
    # Deliberately broad: this is a diagnostic script, so report and dump the
    # traceback rather than crash without context.
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
|
@@ -0,0 +1,165 @@
|
|
1
|
+
"""
|
2
|
+
Direct Document QA example that closely mirrors the original pdfplumber implementation.
|
3
|
+
|
4
|
+
This example shows how to:
|
5
|
+
1. Use pdfplumber directly to extract words and images
|
6
|
+
2. Use transformers pipelines for document QA
|
7
|
+
3. Compare with the Natural PDF implementation
|
8
|
+
|
9
|
+
It's intentionally similar to the original code provided by the user.
|
10
|
+
"""
|
11
|
+
|
12
|
+
import os
|
13
|
+
import sys
|
14
|
+
import argparse
|
15
|
+
import pdfplumber
|
16
|
+
from PIL import Image, ImageDraw
|
17
|
+
import numpy as np
|
18
|
+
|
19
|
+
# Add parent directory to path to run without installing
|
20
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
21
|
+
|
22
|
+
# For comparison
|
23
|
+
from natural_pdf import PDF, configure_logging
|
24
|
+
import logging
|
25
|
+
|
26
|
+
def pdfplumber_qa(pdf_path, question, debug=False):
    """Run QA using direct pdfplumber code similar to the original example.

    Args:
        pdf_path: Path of the PDF to open; only its first page is used.
        question: Question passed to the document-question-answering
            pipeline.
        debug: When true, write the page image and a word-box overlay to
            ``output/direct_qa_image.png`` and ``output/direct_qa_boxes.png``
            (relative to the current working directory).

    Returns:
        On success, a dict with ``answer``, ``confidence``, ``start``,
        ``end`` and ``found``.  If loading or running the pipeline raises,
        the error is printed and a dict with an empty ``answer``,
        ``confidence`` 0.0, ``error`` (the exception text) and
        ``found`` False is returned.
    """
    # Open the PDF in a context manager so the file handle is released once
    # the page image and words have been extracted.
    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]

        # Get image
        image = page.to_image(resolution=300).original

        # Extract words
        words = page.extract_words()

    # Build word boxes in the expected format: [text, [x0, top, x1, bottom]]
    # NOTE(review): the coordinates come straight from extract_words() while
    # the image is rendered at resolution=300 - confirm whether the overlay
    # and the pipeline expect coordinates scaled to the image.
    def get_box(word):
        return [
            word['text'],
            [int(word["x0"]), int(word["top"]), int(word["x1"]), int(word["bottom"])]
        ]

    word_boxes = [get_box(word) for word in words]

    # Debug visualization
    if debug:
        os.makedirs("output", exist_ok=True)

        # Save image
        image.save("output/direct_qa_image.png")

        # Save visualization: one red rectangle per word, labelled by index
        vis_image = image.copy()
        draw = ImageDraw.Draw(vis_image)

        for i, (text, box) in enumerate(word_boxes):
            x0, y0, x1, y1 = box
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
            draw.text((x0, y0), str(i), fill=(255, 0, 0))

        vis_image.save("output/direct_qa_boxes.png")

    # Use transformers pipeline
    try:
        # Imported lazily so a missing transformers install is reported
        # through the error dict below instead of failing at import time.
        from transformers import pipeline

        pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

        # Run query
        query = {"image": image, "question": question, "word_boxes": word_boxes}

        result = pipe(query)[0]

        # Create result dictionary similar to Natural PDF's format
        return {
            "answer": result.get("answer", ""),
            "confidence": result.get("score", 0.0),
            "start": result.get("start", 0),
            "end": result.get("end", 0),
            "found": bool(result.get("answer"))
        }

    except Exception as e:
        # Deliberately broad: callers read the "found"/"error" keys rather
        # than handling exceptions themselves.
        print(f"Error in direct QA: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": str(e),
            "found": False
        }
|
93
|
+
|
94
|
+
def main():
    """Command-line entry point for the direct document QA example.

    Parses the PDF path, the question and the ``--debug`` / ``--compare``
    flags, runs the direct pdfplumber QA and prints its answer.  With
    ``--compare`` the same question is also asked through Natural PDF, any
    returned source elements are highlighted and saved to
    ``output/natural_pdf_answer.png``, and the two answers are compared for
    exact equality.
    """
    parser = argparse.ArgumentParser(description="Direct Document QA Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    parser.add_argument("--question", default="How many votes for Harris and Walz?",
                        help="Question to ask about the document")
    parser.add_argument("--debug", action="store_true",
                        help="Save debug information for troubleshooting")
    parser.add_argument("--compare", action="store_true",
                        help="Compare with Natural PDF implementation")

    args = parser.parse_args()

    # Configure logging for Natural PDF
    configure_logging(level=logging.DEBUG if args.debug else logging.INFO)

    print(f"Document: {args.pdf_path}")
    print(f"Question: {args.question}")

    # Run direct pdfplumber QA
    print("\n=== Direct pdfplumber implementation ===")
    result = pdfplumber_qa(args.pdf_path, args.question, debug=args.debug)

    if result.get("found", False):
        print(f"Answer: {result['answer']}")
        print(f"Confidence: {result['confidence']:.2f}")
    else:
        print(f"No answer found: {result.get('error', '')}")

    # Compare with Natural PDF if requested
    if args.compare:
        print("\n=== Natural PDF implementation ===")

        # Use Natural PDF
        pdf = PDF(args.pdf_path)
        page = pdf.pages[0]

        # Ask the question
        natural_result = page.ask(args.question, debug=args.debug)

        if natural_result.get("found", False):
            print(f"Answer: {natural_result['answer']}")
            print(f"Confidence: {natural_result['confidence']:.2f}")

            # Highlight the answer
            if natural_result.get("source_elements"):
                for element in natural_result["source_elements"]:
                    element.highlight(color=(1, 0.5, 0, 0.5))

                # The output directory is otherwise only created in debug
                # mode, so make sure it exists before saving into it.
                os.makedirs("output", exist_ok=True)

                # Save the image
                page.save_image("output/natural_pdf_answer.png")
                print("Saved highlighted answer to output/natural_pdf_answer.png")
        else:
            print(f"No answer found: {natural_result.get('error', '')}")

        # Compare results
        if result.get("found", False) and natural_result.get("found", False):
            print("\n=== Comparison ===")
            print(f"Direct answer: '{result['answer']}' (confidence: {result['confidence']:.2f})")
            print(f"Natural PDF answer: '{natural_result['answer']}' (confidence: {natural_result['confidence']:.2f})")

            # Only exact string equality is checked
            if result['answer'] == natural_result['answer']:
                print("Results match exactly!")
            else:
                print("Results differ.")


if __name__ == "__main__":
    main()
|