natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,82 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the highlight_all feature of natural-pdf.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def highlight_all_example(pdf_path):
    """Render the first page of a PDF with ``highlight_all`` in two variants.

    The first image highlights every element on the page; the second limits
    the highlighting to text and line elements. Both PNG files are written to
    the ``output`` directory located beside this script's directory.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    with PDF(pdf_path) as document:
        first_page = document.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(document)} pages")

        # Images are written to ../output relative to this script.
        here = os.path.dirname(__file__)
        output_dir = os.path.abspath(os.path.join(here, '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # EXAMPLE 1: every element on the page
        print("\nEXAMPLE 1: Highlighting all elements")
        print("-" * 60)

        # Tally each element kind before drawing anything.
        tallies = [
            ('Text', len(first_page.words)),
            ('Characters', len(first_page.chars)),
            ('Lines', len(first_page.lines)),
            ('Rectangles', len(first_page.rects)),
        ]
        for kind, total in tallies:
            print(f"Found {total} {kind.lower()}")

        first_page.highlight_all()

        # show_labels=True adds the legend to the rendered image.
        all_png = os.path.join(output_dir, "highlight_all.png")
        first_page.to_image(path=all_png, show_labels=True)
        print(f"Saved all highlighted elements to: {all_png}")

        # Start the second example from a clean page.
        first_page.clear_highlights()

        # EXAMPLE 2: only selected element types
        print("\nEXAMPLE 2: Highlighting only specific element types")
        print("-" * 60)

        first_page.highlight_all(include_types=['text', 'line'])

        subset_png = os.path.join(output_dir, "highlight_specific_types.png")
        first_page.to_image(path=subset_png, show_labels=True)
        print(f"Saved with only text and lines highlighted to: {subset_png}")

        print("\nEnd of highlight_all demonstration.")
|
64
|
+
|
65
|
+
if __name__ == "__main__":
    cli_args = sys.argv[1:]

    if cli_args:
        # An explicit path was supplied; it has to point at an existing file.
        pdf_path = cli_args[0]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument given: fall back to the sample PDF in the pdfs directory.
        pdf_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python highlight_all_example.py [path/to/file.pdf]")
            sys.exit(1)

    highlight_all_example(pdf_path)
|
@@ -0,0 +1,114 @@
|
|
1
|
+
"""
|
2
|
+
Demonstrate highlighting with attributes displayed.
|
3
|
+
|
4
|
+
This example shows how to display element attributes like confidence scores
|
5
|
+
directly on the highlighting, using the include_attrs parameter.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import argparse
|
10
|
+
from typing import List
|
11
|
+
|
12
|
+
# Add the parent directory to the Python path
|
13
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
14
|
+
from natural_pdf import PDF
|
15
|
+
|
16
|
+
# Locate the project root: the parent of the directory holding this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Set up argument parser
parser = argparse.ArgumentParser(description="Highlight attributes example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

print(f"Testing attribute display on: {args.pdf_path}")
print(f"Page: {args.page}")

# Create the output directory up front rather than assuming to_image()
# creates missing directories itself.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Test 1: Standard highlight without attributes
print("\nTest 1: Standard layout highlighting (no attributes)")
page.clear_highlights()
page.analyze_layout(model="yolo", confidence=0.2)
# existing="append" presumably keeps the regions from the first model and
# adds the second model's detections to them - confirm against analyze_layout.
page.analyze_layout(model="tatr", confidence=0.2, existing="append")
page.highlight_layout()
output_path = os.path.join(output_dir, "highlight_no_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 2: Highlight with confidence and model attributes
print("\nTest 2: Layout highlighting with explicit confidence and model attributes")
page.clear_highlights()
for region in page.detected_layout_regions:
    # Use a simplified label since details will be shown on the highlight
    label = f"{region.region_type}"
    # Explicitly show confidence and model directly on the highlight
    region.highlight(
        label=label,
        include_attrs=['confidence', 'model']
    )
output_path = os.path.join(output_dir, "highlight_with_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 3: Use highlight_all with include_layout_regions=True (no attributes by default)
print("\nTest 3: Using highlight_all with include_layout_regions=True (no attributes)")
page.clear_highlights()
page.highlight_all(
    include_layout_regions=True,
    include_types=['text'],
    layout_confidence=0.2
)
output_path = os.path.join(output_dir, "highlight_all_with_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 4: Create a collection of regions and highlight with custom attributes
print("\nTest 4: Highlight a collection with custom attributes")
page.clear_highlights()

# Create collections by region type
from natural_pdf.elements.collections import ElementCollection

# Each tier is (legend label, RGBA color, inclusive lower bound, exclusive
# upper bound); together the tiers cover every confidence value.
confidence_tiers = [
    ("High Confidence", (0, 1, 0, 0.3), 0.8, float("inf")),      # Green
    ("Medium Confidence", (1, 1, 0, 0.3), 0.5, 0.8),             # Yellow
    ("Low Confidence", (1, 0, 0, 0.3), float("-inf"), 0.5),      # Red
]

# Regions without a confidence attribute are left out of every tier.
scored_regions = [r for r in page.detected_layout_regions if hasattr(r, 'confidence')]

for tier_label, tier_color, lower, upper in confidence_tiers:
    tier_regions = [r for r in scored_regions if lower <= r.confidence < upper]
    if not tier_regions:
        # Empty tiers are skipped, so no collection is built for them.
        continue
    ElementCollection(tier_regions).highlight(
        label=tier_label,
        color=tier_color,
        include_attrs=['region_type', 'confidence', 'model']
    )

output_path = os.path.join(output_dir, "highlight_by_confidence.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

print("\nDone!")
|
@@ -0,0 +1,122 @@
|
|
1
|
+
"""
|
2
|
+
Demonstrate the enhanced confidence display feature.
|
3
|
+
|
4
|
+
This example shows how confidence scores are displayed by default
|
5
|
+
and also demonstrates customizing the attributes displayed.
|
6
|
+
"""
|
7
|
+
import os
|
8
|
+
import sys
|
9
|
+
import argparse
|
10
|
+
|
11
|
+
# Add the parent directory to the Python path
|
12
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
13
|
+
from natural_pdf import PDF
|
14
|
+
|
15
|
+
# Locate the project root: the parent of the directory holding this script.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Set up argument parser
parser = argparse.ArgumentParser(description="Confidence display example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

print(f"Demonstrating confidence display on: {args.pdf_path}")
print(f"Page: {args.page}")

# Create the output directory up front rather than assuming to_image()
# creates missing directories itself.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Run layout analysis
print("\nRunning layout analysis...")
page.analyze_layout(model="yolo", confidence=0.1)  # Use low confidence to show a range of values
regions = page.detected_layout_regions
print(f"Found {len(regions)} layout regions")

# Example 1: Basic highlighting without attributes
print("\nExample 1: Basic highlighting (no attributes)")
page.clear_highlights()
# Regular highlighting without showing confidence
for region in regions:
    region.highlight(label=region.region_type)

output_path = os.path.join(output_dir, "basic_highlighting.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 2: Explicitly adding confidence
print("\nExample 2: Explicitly showing confidence")
page.clear_highlights()
for region in regions:
    region.highlight(
        label=region.region_type,
        include_attrs=['confidence']
    )
output_path = os.path.join(output_dir, "explicit_confidence_display.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 3: Show confidence values with different colors based on confidence level
print("\nExample 3: Color-coded by confidence level")
page.clear_highlights()

# Group regions by confidence
high_conf = [r for r in regions if r.confidence >= 0.8]
med_conf = [r for r in regions if 0.5 <= r.confidence < 0.8]
low_conf = [r for r in regions if 0.2 <= r.confidence < 0.5]
very_low_conf = [r for r in regions if r.confidence < 0.2]

print(f" High confidence (>=0.8): {len(high_conf)} regions")
print(f" Medium confidence (0.5-0.8): {len(med_conf)} regions")
print(f" Low confidence (0.2-0.5): {len(low_conf)} regions")
print(f" Very low confidence (<0.2): {len(very_low_conf)} regions")

# Highlight each group with appropriate color
from natural_pdf.elements.collections import ElementCollection

# One row per group: (regions, legend label, RGBA color).
color_coded_groups = [
    (high_conf, "High Confidence", (0, 0.8, 0, 0.3)),           # Green
    (med_conf, "Medium Confidence", (0.8, 0.8, 0, 0.3)),        # Yellow
    (low_conf, "Low Confidence", (0.8, 0.4, 0, 0.3)),           # Orange
    (very_low_conf, "Very Low Confidence", (0.8, 0, 0, 0.3)),   # Red
]
for group_regions, group_label, group_color in color_coded_groups:
    if not group_regions:
        # Empty groups are skipped, so no collection is built for them.
        continue
    ElementCollection(group_regions).highlight(
        label=group_label,
        color=group_color,
        include_attrs=['confidence']  # Show the confidence values
    )

output_path = os.path.join(output_dir, "confidence_color_coded.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 4: Show multiple attributes (confidence + type)
print("\nExample 4: Showing multiple attributes (confidence, region_type)")
page.clear_highlights()
for region in regions:
    region.highlight(
        include_attrs=['confidence', 'region_type'],
        color=(0, 0.5, 0.8, 0.3)  # Blue
    )
output_path = os.path.join(output_dir, "multiple_attributes_display.png")
page.to_image(path=output_path, show_labels=False)  # No legend needed
print(f"Saved to {output_path}")

print("\nDone!")
|
@@ -0,0 +1,110 @@
|
|
1
|
+
"""
|
2
|
+
Demo script to show highlight color cycling behavior.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def highlight_demo():
    """Render six images showing how highlight colors are assigned.

    A PDF is looked up next to this script first and, failing that, in the
    sibling ``pdfs`` directory; the process exits with status 1 when neither
    location holds one. The images are written to ``highlight_demo_output``
    beside this script.
    """
    example_dir = Path(__file__).parent

    # Path.glob yields entries in arbitrary order; sorting makes the demo pick
    # the same PDF on every run.
    pdf_files = sorted(example_dir.glob("*.pdf"))
    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = sorted(pdfs_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)

    pdf_path = str(pdf_files[0])
    print(f"Using PDF: {pdf_path}")

    # Create output directory
    output_dir = example_dir / "highlight_demo_output"
    output_dir.mkdir(exist_ok=True)

    # Use the context manager so the PDF is released even if a demo step raises.
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Demo 1: Default behavior - consistent color without label
        print("Demo 1: Default behavior - consistent color without label")
        texts = page.find_all('text')[:5]  # Get first 5 text elements for demo

        for text in texts:
            text.highlight()  # No label - should use consistent color (yellow)
        _save_and_reset(page, output_dir / "demo1_default_no_label.png")

        # Demo 2: With cycle_colors=True - different colors without label
        print("Demo 2: With cycle_colors=True - different colors without label")
        for text in texts:
            text.highlight(cycle_colors=True)  # No label but with cycling
        _save_and_reset(page, output_dir / "demo2_cycling_no_label.png")

        # Demo 3: With labels - different colors for different labels
        print("Demo 3: With labels - different colors for different labels")
        for number, text in enumerate(texts, start=1):
            text.highlight(label=f"Element {number}")  # Different labels
        _save_and_reset(page, output_dir / "demo3_with_labels.png")

        # Demo 4: With same label - same color
        print("Demo 4: With same label - same color")
        for text in texts:
            text.highlight(label="Group A")  # Same label - should use same color
        _save_and_reset(page, output_dir / "demo4_same_label.png")

        # Demo 5: Using highlight_all with default settings
        print("Demo 5: Using highlight_all with default settings")
        page.highlight_all()  # Default: cycle_colors=True
        _save_and_reset(page, output_dir / "demo5_highlight_all_default.png")

        # Demo 6: Using highlight_all with cycle_colors=False
        print("Demo 6: Using highlight_all with cycle_colors=False")
        page.highlight_all(cycle_colors=False)
        _save_and_reset(page, output_dir / "demo6_highlight_all_no_cycling.png")

    print(f"Results saved to {output_dir}/")


def _save_and_reset(page, image_path):
    """Save the page with its legend to ``image_path``, then clear its highlights."""
    page.save(str(image_path), labels=True)
    page.clear_highlights()


if __name__ == "__main__":
    highlight_demo()
|
@@ -0,0 +1,71 @@
|
|
1
|
+
"""
|
2
|
+
Test script to verify highlighting with float colors.
|
3
|
+
This is a simplified version of the test without OCR to test just the color handling.
|
4
|
+
"""
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
|
8
|
+
# Add the parent directory to the path to import the package
|
9
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
10
|
+
|
11
|
+
from natural_pdf import PDF
|
12
|
+
|
13
|
+
def main():
    """Highlight four text elements, each with a differently formatted color.

    Uses the sample PDF from the sibling ``pdfs`` directory and writes
    ``highlight_float_test.png`` into the sibling ``output`` directory.
    Returns early, after printing a message, when the sample PDF is missing
    or the first page yields fewer than four text elements.
    """
    base_dir = os.path.dirname(__file__)

    # Default to example PDF
    pdf_path = os.path.abspath(os.path.join(base_dir, '..', 'pdfs', '01-practice.pdf'))
    if not os.path.exists(pdf_path):
        print(f"Example PDF not found: {pdf_path}")
        return

    # Directory that receives the rendered image
    output_dir = os.path.abspath(os.path.join(base_dir, '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    print("Testing highlighting with float colors...")

    # One (color, label) pair per element, applied in order.
    color_cases = (
        ((0.0, 1.0, 0.0, 0.5), "Green Float"),   # RGB float 0-1 with alpha
        ((1.0, 0.0, 0.0), "Red Float"),          # RGB float 0-1 without alpha
        ((0.5, 0.5, 255, 0.7), "Mixed"),         # Mixed integer and float
        ((0, 0, 255, 100), "Blue Integer"),      # Integer RGB with alpha
    )

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Get some text elements
        elements = page.find_all('text')[:4]
        if len(elements) < 4:
            print("Not enough text elements found in the PDF")
            return

        for position, (rgba, legend) in enumerate(color_cases):
            elements[position].highlight(color=rgba, label=legend)

        # Save the highlighted image
        highlight_file = os.path.join(output_dir, "highlight_float_test.png")
        page.to_image(path=highlight_file, show_labels=True)
        print(f"Saved to: {highlight_file}")


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,147 @@
|
|
1
|
+
"""
|
2
|
+
Test script to verify highlighting with the same label uses the same color.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def highlight_label_test(pdf_path):
    """Test that highlighting colors are consistent for the same label.

    First highlights all bold text on page one as a single collection, then
    highlights three individually located fields under one shared label.
    Both renderings are saved as PNG files in the sibling ``output`` directory.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")

        # Create an output directory for saving images
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # Find bold text elements
        headings = page.find_all('text:bold')
        print(f"Found {len(headings)} bold headings")

        # Display the first few headings
        for number, heading in enumerate(headings[:5], start=1):
            print(f" {number}. '{heading.text}' at {heading.bbox}")

        # Apply highlighting with a label
        print("\nHighlighting bold headings...")
        headings.highlight(label="Bold Headings")

        # Save the image
        output_file = os.path.join(output_dir, "highlight_test.png")
        page.save(output_file, labels=True)
        print(f"Saved to: {output_file}")

        # Now test the case where elements are added individually
        page.clear_highlights()

        print("\nTesting individual elements with same label...")

        for field_text in ("Summary:", "Site:", "Date:"):
            element = page.find(f'text:contains("{field_text}")')
            if element is None:
                # NOTE(review): assumes find() returns None when nothing
                # matches - confirm. The script accepts arbitrary PDFs, so a
                # missing field should be skipped rather than crash the run.
                print(f"No text containing '{field_text}' found; skipping")
                continue
            print(f"Highlighting '{field_text}' with label 'Key Fields'")
            element.highlight(label="Key Fields")

        # Save the image
        output_file = os.path.join(output_dir, "highlight_test_individual.png")
        page.save(output_file, labels=True)
        print(f"Saved to: {output_file}")
|
65
|
+
|
66
|
+
def highlight_color_test(pdf_path):
    """Test highlighting with float and integer color values.

    Highlights four text elements of page one (the first match plus the
    elements at indexes 5, 10 and 15), each with a differently formatted
    color. It saves one combined image and then one image per color format
    into the sibling ``output`` directory. Returns early, after printing a
    message, when the page has too few text elements.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    print("\n=== Testing highlight with different color formats ===")

    # Create output directory
    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Clear any existing highlights
        page.clear_highlights()

        # Query the text elements once and make sure the highest index used
        # below (15) exists, instead of failing with IndexError on short pages.
        all_text = page.find_all('text')
        required = 16
        if len(all_text) < required:
            print(f"Not enough text elements found in the PDF "
                  f"(need {required}, found {len(all_text)})")
            return

        text1 = page.find('text')
        text2 = all_text[5]
        text3 = all_text[10]
        text4 = all_text[15]

        # Test with integer colors (0-255)
        print(f"1. Using integer color (255, 0, 0, 128) for '{text1.text}'")
        text1.highlight(color=(255, 0, 0, 128), label="Red (Integer)")

        # Test with float colors (0.0-1.0)
        print(f"2. Using float color (0.0, 1.0, 0.0, 0.5) for '{text2.text}'")
        text2.highlight(color=(0.0, 1.0, 0.0, 0.5), label="Green (Float)")

        # Test with partial float colors
        print(f"3. Using mixed color (0.5, 0.5, 255, 0.7) for '{text3.text}'")
        text3.highlight(color=(0.5, 0.5, 255, 0.7), label="Mixed")

        # Test with RGB only (no alpha)
        print(f"4. Using RGB-only color (0.0, 0.0, 1.0) for '{text4.text}'")
        text4.highlight(color=(0.0, 0.0, 1.0), label="Blue (No Alpha)")

        # Save the highlighted page
        highlight_path = os.path.join(output_dir, "highlight_test_colors.png")
        page.to_image(path=highlight_path, show_labels=True)
        print(f"Saved highlighted image to {highlight_path}")

        # Also render each color format on its own, clearing the page before
        # each one so every image shows a single highlight.
        individual_cases = [
            (text1, (255, 0, 0, 128), "Red"),
            (text2, (0.0, 1.0, 0.0, 0.5), "Green"),
            (text3, (0.5, 0.5, 255, 0.7), "Mixed"),
            (text4, (0.0, 0.0, 1.0), "Blue"),
        ]
        for number, (text, color, label) in enumerate(individual_cases, start=1):
            page.clear_highlights()
            text.highlight(color=color, label=label)
            individual_path = os.path.join(output_dir, f"highlight_color_test_{number}.png")
            page.to_image(path=individual_path, show_labels=True)
            print(f"Saved individual highlight {number} to {individual_path}")

        print("Color highlight test complete")
|
120
|
+
|
121
|
+
if __name__ == "__main__":
    cli_args = sys.argv[1:]

    if cli_args:
        # An explicit path was supplied; it has to point at an existing file.
        pdf_path = cli_args[0]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument given: fall back to the sample PDF in the pdfs directory.
        pdf_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python highlight_test.py [path/to/file.pdf]")
            sys.exit(1)

    # Optional second argument selects which test to run; default is both.
    test_name = cli_args[1].lower() if len(cli_args) >= 2 else "all"

    if test_name in ("labels", "all"):
        highlight_label_test(pdf_path)

    if test_name in ("colors", "all"):
        highlight_color_test(pdf_path)
|