natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,82 @@
1
+ """
2
+ Example demonstrating the highlight_all feature of natural-pdf.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def highlight_all_example(pdf_path):
    """Render two highlight_all() variants of a PDF's first page to PNG files.

    The first image highlights every element type on the page; the second
    restricts highlighting to text and line elements. Both images are written
    to an ``output`` directory that sits next to the examples directory.

    Args:
        pdf_path: Path to the PDF file to open.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages")

        # Images go to <examples>/../output, created on demand.
        examples_dir = os.path.dirname(__file__)
        output_dir = os.path.abspath(os.path.join(examples_dir, '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        rule = "-" * 60

        # EXAMPLE 1: every element on the page
        print("\nEXAMPLE 1: Highlighting all elements")
        print(rule)

        # Gather all counts up front, then report them in a fixed order.
        tallies = [
            ('Text', len(page.words)),
            ('Characters', len(page.chars)),
            ('Lines', len(page.lines)),
            ('Rectangles', len(page.rects)),
        ]
        for kind, total in tallies:
            print(f"Found {total} {kind.lower()}")

        page.highlight_all()

        # show_labels=True asks to_image to include the legend.
        output_file = os.path.join(output_dir, "highlight_all.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved all highlighted elements to: {output_file}")

        # Start the second example from a clean page.
        page.clear_highlights()

        # EXAMPLE 2: only selected element types
        print("\nEXAMPLE 2: Highlighting only specific element types")
        print(rule)

        page.highlight_all(include_types=['text', 'line'])

        output_file = os.path.join(output_dir, "highlight_specific_types.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved with only text and lines highlighted to: {output_file}")

        print("\nEnd of highlight_all demonstration.")
64
+
65
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if cli_args:
        # An explicit path was supplied on the command line; it must exist.
        pdf_path = cli_args[0]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument given: fall back to the sample PDF under ../pdfs.
        examples_dir = os.path.dirname(__file__)
        pdf_path = os.path.abspath(
            os.path.join(examples_dir, '..', 'pdfs', '01-practice.pdf'))
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python highlight_all_example.py [path/to/file.pdf]")
            sys.exit(1)

    highlight_all_example(pdf_path)
@@ -0,0 +1,114 @@
1
+ """
2
+ Demonstrate highlighting with attributes displayed.
3
+
4
+ This example shows how to display element attributes like confidence scores
5
+ directly on the highlighting, using the include_attrs parameter.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+ from typing import List
11
+
12
+ # Add the parent directory to the Python path
13
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
14
+ from natural_pdf import PDF
15
+
16
# Script body: run layout analysis on one page and save four highlighted
# renderings that show (or omit) region attributes on the highlights.

# Directory containing this script, and its parent (the project root).
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Sample PDF used when no path is given on the command line.
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Command-line interface: optional PDF path and 0-based page index.
parser = argparse.ArgumentParser(description="Highlight attributes example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

print(f"Testing attribute display on: {args.pdf_path}")
print(f"Page: {args.page}")

# Load the PDF and pick the requested page.
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Make sure the image destination exists before anything is saved. The other
# examples in this package create their output directory explicitly, so do the
# same here rather than rely on to_image() to create it.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Test 1: Standard highlight without attributes
print("\nTest 1: Standard layout highlighting (no attributes)")
page.clear_highlights()
page.analyze_layout(model="yolo", confidence=0.2)
# existing="append" is passed so the second model's call is combined with the
# first - presumably its regions are added rather than replacing them.
page.analyze_layout(model="tatr", confidence=0.2, existing="append")
page.highlight_layout()
output_path = os.path.join(output_dir, "highlight_no_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 2: Highlight with confidence and model attributes
print("\nTest 2: Layout highlighting with explicit confidence and model attributes")
page.clear_highlights()
for region in page.detected_layout_regions:
    # Use a simplified label since details will be shown on the highlight.
    label = f"{region.region_type}"
    region.highlight(
        label=label,
        include_attrs=['confidence', 'model']
    )
output_path = os.path.join(output_dir, "highlight_with_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 3: highlight_all with layout regions included and no include_attrs
print("\nTest 3: Using highlight_all with include_layout_regions=True (no attributes)")
page.clear_highlights()
page.highlight_all(
    include_layout_regions=True,
    include_types=['text'],
    layout_confidence=0.2
)
output_path = os.path.join(output_dir, "highlight_all_with_attrs.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Test 4: Group regions into confidence bands and highlight each band
print("\nTest 4: Highlight a collection with custom attributes")
page.clear_highlights()

from natural_pdf.elements.collections import ElementCollection

# High confidence: >= 0.8. Regions lacking a confidence attribute are skipped.
high_conf_regions = [r for r in page.detected_layout_regions if hasattr(r, 'confidence') and r.confidence >= 0.8]
if high_conf_regions:
    high_conf_collection = ElementCollection(high_conf_regions)
    high_conf_collection.highlight(
        label="High Confidence",
        color=(0, 1, 0, 0.3),  # Green for high confidence
        include_attrs=['region_type', 'confidence', 'model']
    )

# Medium confidence: 0.5 <= c < 0.8.
med_conf_regions = [r for r in page.detected_layout_regions if hasattr(r, 'confidence') and 0.5 <= r.confidence < 0.8]
if med_conf_regions:
    med_conf_collection = ElementCollection(med_conf_regions)
    med_conf_collection.highlight(
        label="Medium Confidence",
        color=(1, 1, 0, 0.3),  # Yellow for medium confidence
        include_attrs=['region_type', 'confidence', 'model']
    )

# Low confidence: < 0.5.
low_conf_regions = [r for r in page.detected_layout_regions if hasattr(r, 'confidence') and r.confidence < 0.5]
if low_conf_regions:
    low_conf_collection = ElementCollection(low_conf_regions)
    low_conf_collection.highlight(
        label="Low Confidence",
        color=(1, 0, 0, 0.3),  # Red for low confidence
        include_attrs=['region_type', 'confidence', 'model']
    )

output_path = os.path.join(output_dir, "highlight_by_confidence.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

print("\nDone!")
@@ -0,0 +1,122 @@
1
+ """
2
+ Demonstrate the enhanced confidence display feature.
3
+
4
+ This example shows how confidence scores are displayed by default
5
+ and also demonstrates customizing the attributes displayed.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+
11
+ # Add the parent directory to the Python path
12
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from natural_pdf import PDF
14
+
15
# Script body: run YOLO layout analysis on one page and save four renderings
# that display region confidence in different ways.

# Directory containing this script, and its parent (the project root).
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# Sample PDF used when no path is given on the command line.
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Command-line interface: optional PDF path and 0-based page index.
parser = argparse.ArgumentParser(description="Confidence display example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
args = parser.parse_args()

print(f"Demonstrating confidence display on: {args.pdf_path}")
print(f"Page: {args.page}")

# Load the PDF and pick the requested page.
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Make sure the image destination exists before anything is saved. The other
# examples in this package create their output directory explicitly, so do the
# same here rather than rely on to_image() to create it.
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Run layout analysis with a low threshold so a range of confidences shows up.
print("\nRunning layout analysis...")
page.analyze_layout(model="yolo", confidence=0.1)
regions = page.detected_layout_regions
print(f"Found {len(regions)} layout regions")

# Example 1: Basic highlighting without attributes
print("\nExample 1: Basic highlighting (no attributes)")
page.clear_highlights()
for region in regions:
    region.highlight(label=region.region_type)

output_path = os.path.join(output_dir, "basic_highlighting.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 2: Explicitly adding confidence
print("\nExample 2: Explicitly showing confidence")
page.clear_highlights()
for region in regions:
    region.highlight(
        label=region.region_type,
        include_attrs=['confidence']
    )
output_path = os.path.join(output_dir, "explicit_confidence_display.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 3: One colour per confidence band
print("\nExample 3: Color-coded by confidence level")
page.clear_highlights()

# Partition the regions into four half-open confidence bands.
high_conf = [r for r in regions if r.confidence >= 0.8]
med_conf = [r for r in regions if 0.5 <= r.confidence < 0.8]
low_conf = [r for r in regions if 0.2 <= r.confidence < 0.5]
very_low_conf = [r for r in regions if r.confidence < 0.2]

print(f" High confidence (>=0.8): {len(high_conf)} regions")
print(f" Medium confidence (0.5-0.8): {len(med_conf)} regions")
print(f" Low confidence (0.2-0.5): {len(low_conf)} regions")
print(f" Very low confidence (<0.2): {len(very_low_conf)} regions")

from natural_pdf.elements.collections import ElementCollection

# (regions, legend label, RGBA colour) for each band, highest first.
confidence_bands = [
    (high_conf, "High Confidence", (0, 0.8, 0, 0.3)),          # Green
    (med_conf, "Medium Confidence", (0.8, 0.8, 0, 0.3)),       # Yellow
    (low_conf, "Low Confidence", (0.8, 0.4, 0, 0.3)),          # Orange
    (very_low_conf, "Very Low Confidence", (0.8, 0, 0, 0.3)),  # Red
]
for band_regions, band_label, band_color in confidence_bands:
    # Empty bands are skipped so no collection is built from an empty list.
    if band_regions:
        ElementCollection(band_regions).highlight(
            label=band_label,
            color=band_color,
            include_attrs=['confidence']  # Show the confidence values
        )

output_path = os.path.join(output_dir, "confidence_color_coded.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# Example 4: Show multiple attributes (confidence + type)
print("\nExample 4: Showing multiple attributes (confidence, region_type)")
page.clear_highlights()
for region in regions:
    region.highlight(
        include_attrs=['confidence', 'region_type'],
        color=(0, 0.5, 0.8, 0.3)  # Blue
    )
output_path = os.path.join(output_dir, "multiple_attributes_display.png")
page.to_image(path=output_path, show_labels=False)  # No legend needed
print(f"Saved to {output_path}")

print("\nDone!")
@@ -0,0 +1,110 @@
1
+ """
2
+ Demo script to show highlight color cycling behavior.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def highlight_demo():
    """Save six images showing how highlight colors are chosen.

    A PDF is picked from the examples directory, or from the sibling ``pdfs``
    directory when the examples directory holds none. Each demo highlights
    elements on the first page, saves a PNG into ``highlight_demo_output``
    and then clears the highlights. Exits with status 1 when no PDF is found.
    """
    here = Path(__file__).parent

    # Look next to this script first, then in ../pdfs.
    candidates = list(here.glob("*.pdf"))
    fallback_dir = here.parent / "pdfs"
    if not candidates and fallback_dir.exists():
        candidates = list(fallback_dir.glob("*.pdf"))

    if not candidates:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)
    pdf_path = str(candidates[0])

    print(f"Using PDF: {pdf_path}")

    # Create output directory
    output_dir = Path(__file__).parent / "highlight_demo_output"
    output_dir.mkdir(exist_ok=True)

    # Load PDF
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    def save_and_reset(filename):
        """Save the page with its legend, then wipe highlights for the next demo."""
        page.save(str(output_dir / filename), labels=True)
        page.clear_highlights()

    # Demo 1: no label, no cycling
    print("Demo 1: Default behavior - consistent color without label")
    texts = page.find_all('text')[:5]  # first 5 text elements are enough
    for text in texts:
        text.highlight()
    save_and_reset("demo1_default_no_label.png")

    # Demo 2: no label, cycling requested per element
    print("Demo 2: With cycle_colors=True - different colors without label")
    for text in texts:
        text.highlight(cycle_colors=True)
    save_and_reset("demo2_cycling_no_label.png")

    # Demo 3: a distinct label per element
    print("Demo 3: With labels - different colors for different labels")
    for i, text in enumerate(texts):
        text.highlight(label=f"Element {i+1}")
    save_and_reset("demo3_with_labels.png")

    # Demo 4: one shared label for every element
    print("Demo 4: With same label - same color")
    for text in texts:
        text.highlight(label="Group A")
    save_and_reset("demo4_same_label.png")

    # Demo 5: page-level highlight_all with its defaults
    print("Demo 5: Using highlight_all with default settings")
    page.highlight_all()
    save_and_reset("demo5_highlight_all_default.png")

    # Demo 6: page-level highlight_all with cycling switched off
    print("Demo 6: Using highlight_all with cycle_colors=False")
    page.highlight_all(cycle_colors=False)
    save_and_reset("demo6_highlight_all_no_cycling.png")

    print(f"Results saved to {output_dir}/")


# Run the demo only when executed as a script.
if __name__ == "__main__":
    highlight_demo()
@@ -0,0 +1,71 @@
1
+ """
2
+ Test script to verify highlighting with float colors.
3
+ This is a simplified version of the test without OCR to test just the color handling.
4
+ """
5
+ import os
6
+ import sys
7
+
8
+ # Add the parent directory to the path to import the package
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ from natural_pdf import PDF
12
+
13
def main():
    """Highlight four text elements, each with a differently formatted color.

    Uses the bundled sample PDF. Returns early, after printing a message, when
    the sample PDF is missing or the first page has fewer than four text
    elements. The result is saved as ``highlight_float_test.png`` in the
    ``output`` directory next to the examples directory.
    """
    examples_dir = os.path.dirname(__file__)

    # Default to example PDF
    pdf_path = os.path.abspath(
        os.path.join(examples_dir, '..', 'pdfs', '01-practice.pdf'))
    if not os.path.exists(pdf_path):
        print(f"Example PDF not found: {pdf_path}")
        return

    # Create an output directory for saving images
    output_dir = os.path.abspath(os.path.join(examples_dir, '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    print("Testing highlighting with float colors...")

    # One (color, label) pair per element, covering each color format.
    swatches = [
        ((0.0, 1.0, 0.0, 0.5), "Green Float"),  # RGB float 0-1 with alpha
        ((1.0, 0.0, 0.0), "Red Float"),         # RGB float 0-1 without alpha
        ((0.5, 0.5, 255, 0.7), "Mixed"),        # mixed integer and float
        ((0, 0, 255, 100), "Blue Integer"),     # integer RGB with alpha
    ]

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Get some text elements
        elements = page.find_all('text')[:4]
        if len(elements) < 4:
            print("Not enough text elements found in the PDF")
            return

        for position, (color, label) in enumerate(swatches):
            elements[position].highlight(color=color, label=label)

        # Save the highlighted image
        highlight_file = os.path.join(output_dir, "highlight_float_test.png")
        page.to_image(path=highlight_file, show_labels=True)
        print(f"Saved to: {highlight_file}")


# Run the test only when executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,147 @@
1
+ """
2
+ Test script to verify highlighting with the same label uses the same color.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def highlight_label_test(pdf_path):
    """Check that highlights sharing a label are drawn consistently.

    Two images are written to the ``output`` directory next to the examples
    directory: one with every bold text element highlighted as a collection,
    and one where individual field captions are highlighted one at a time
    under a single shared label.

    Args:
        pdf_path: Path to the PDF file to open.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")

        # Create an output directory for saving images
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # Find bold text elements
        headings = page.find_all('text:bold')
        print(f"Found {len(headings)} bold headings")

        # Display the first few headings
        for number, heading in enumerate(headings[:5], start=1):
            print(f" {number}. '{heading.text}' at {heading.bbox}")

        # Apply highlighting with a label
        print("\nHighlighting bold headings...")
        headings.highlight(label="Bold Headings")

        # Save the image
        output_file = os.path.join(output_dir, "highlight_test.png")
        page.save(output_file, labels=True)
        print(f"Saved to: {output_file}")

        # Second case: add elements one by one under the same label.
        page.clear_highlights()

        print("\nTesting individual elements with same label...")

        # The captions below come from the bundled sample PDF. This function
        # also accepts arbitrary PDFs from the command line, so a caption may
        # be absent; skip it instead of crashing on a missing match.
        # NOTE(review): assumes page.find() returns None when nothing
        # matches - confirm against the natural_pdf API.
        for caption in ("Summary:", "Site:", "Date:"):
            match = page.find(f'text:contains("{caption}")')
            if match is None:
                print(f"Skipping '{caption}': no matching text on this page")
                continue
            print(f"Highlighting '{caption}' with label 'Key Fields'")
            match.highlight(label="Key Fields")

        # Save the image
        output_file = os.path.join(output_dir, "highlight_test_individual.png")
        page.save(output_file, labels=True)
        print(f"Saved to: {output_file}")
65
+
66
def highlight_color_test(pdf_path):
    """Highlight text elements using integer, float, mixed and RGB-only colors.

    Saves one combined image plus one image per color format into the
    ``output`` directory next to the examples directory. Returns early, after
    printing a message, when the first page has too few text elements for the
    fixed positions used below.

    Args:
        pdf_path: Path to the PDF file to open.
    """
    print("\n=== Testing highlight with different color formats ===")

    # Create output directory
    output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
    os.makedirs(output_dir, exist_ok=True)

    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Clear any existing highlights
        page.clear_highlights()

        # Query the text elements once. Positions 5, 10 and 15 are used below,
        # so at least 16 elements are required; bail out cleanly rather than
        # raise IndexError on sparse pages.
        all_text = page.find_all('text')
        if len(all_text) <= 15:
            print("Not enough text elements found in the PDF")
            return

        # (element, color, kind shown in the log, combined label, solo label)
        cases = [
            (page.find('text'), (255, 0, 0, 128), "integer", "Red (Integer)", "Red"),
            (all_text[5], (0.0, 1.0, 0.0, 0.5), "float", "Green (Float)", "Green"),
            (all_text[10], (0.5, 0.5, 255, 0.7), "mixed", "Mixed", "Mixed"),
            (all_text[15], (0.0, 0.0, 1.0), "RGB-only", "Blue (No Alpha)", "Blue"),
        ]

        # Pass 1: all four highlights together on one image.
        for number, (element, color, kind, label, _) in enumerate(cases, start=1):
            print(f"{number}. Using {kind} color {color} for '{element.text}'")
            element.highlight(color=color, label=label)

        highlight_path = os.path.join(output_dir, "highlight_test_colors.png")
        page.to_image(path=highlight_path, show_labels=True)
        print(f"Saved highlighted image to {highlight_path}")

        # Pass 2: each color format alone, so a failure can be pinned to it.
        for number, (element, color, _, _, solo_label) in enumerate(cases, start=1):
            page.clear_highlights()
            element.highlight(color=color, label=solo_label)
            individual_path = os.path.join(output_dir, f"highlight_color_test_{number}.png")
            page.to_image(path=individual_path, show_labels=True)
            print(f"Saved individual highlight {number} to {individual_path}")

        print("Color highlight test complete")
120
+
121
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # An explicit path was supplied on the command line; it must exist.
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument given: fall back to the sample PDF under ../pdfs.
        pdf_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf'))
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python highlight_test.py [path/to/file.pdf]")
            sys.exit(1)

    # Optional second argument selects which test to run; default is both.
    test_name = sys.argv[2].lower() if len(sys.argv) >= 3 else "all"

    if test_name in ("labels", "all"):
        highlight_label_test(pdf_path)

    if test_name in ("colors", "all"):
        highlight_color_test(pdf_path)