natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,123 @@
1
+ """
2
+ Example demonstrating the highlighting feature of natural-pdf.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
+ # IMPORTANT: This example has been updated to use the new API
13
+ # Changes:
14
+ # - select_until() → until()
15
+ # - full_width=False → width="element"
16
+ # - labels=True → show_labels=True
17
+ # - cycle_colors=True → use_color_cycling=True
18
+
19
def highlighting_example(pdf_path):
    """Demonstrate element, collection and region highlighting on page one.

    Opens ``pdf_path``, works on its first page and runs three scenarios.
    Each scenario renders the page to a PNG in the ``output`` directory one
    level above this script. Highlights are cleared after the first and the
    second scenario, before the next one starts.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    # Open the PDF
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages")

        # Create an output directory for saving images
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # EXAMPLE 1: Highlight a single element
        print("\nEXAMPLE 1: Highlighting a single element")
        print("-" * 60)

        # Find the "Summary:" text
        # NOTE(review): presumably find() returns None when nothing matches,
        # in which case the .bbox access below raises - confirm.
        summary = page.find('text:contains("Summary:")')
        print(f"Found 'Summary' text at: {summary.bbox}")

        # Highlight it and save the image
        summary.highlight(label="Summary Heading")
        output_file = os.path.join(output_dir, "highlight_single.png")
        # The keyword is show_labels; it was previously misspelled as
        # "show_show_labels" in every to_image call of this function.
        summary.page.to_image(path=output_file, show_labels=True)
        print(f"Saved highlighted page to: {output_file}")

        # Clear highlights for next example
        page.clear_highlights()

        # EXAMPLE 2: Highlight multiple elements with automatic color cycling
        print("\nEXAMPLE 2: Highlighting multiple elements with color cycling")
        print("-" * 60)

        # Find different types of elements
        thick_lines = page.find_all('line[width>=2]')
        headings = page.find_all('text:bold')

        # Highlight each group with a label
        print(f"Found {len(thick_lines)} thick lines")
        thick_lines.highlight(label="Thick Lines")

        print(f"Found {len(headings)} bold headings")
        # Print at most the first five bold headings for inspection
        for i, h in enumerate(headings[:5]):
            print(f" Bold heading {i+1}: '{h.text}' at {h.bbox}")
        headings.highlight(label="Bold Headings")

        # Save the image with a legend
        output_file = os.path.join(output_dir, "highlight_multiple.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved page with multiple highlights to: {output_file}")

        # Clear highlights for next example
        page.clear_highlights()

        # EXAMPLE 3: Highlighting regions
        print("\nEXAMPLE 3: Highlighting regions")
        print("-" * 60)

        # Find the "Summary:" text again; the thick line is located by the
        # selector passed to until(), so no separate lookup is needed.
        summary = page.find('text:contains("Summary:")')

        # Create a region from Summary until the thick line
        summary_region = summary.until('line[width>=2]', width="full")
        print(f"Created region from Summary to thick line: {summary_region.bbox}")

        # Highlight the region
        summary_region.highlight(label="Summary Section")

        # Find text within the region
        key_elements = summary_region.find_all('text')
        print(f"Found {len(key_elements)} text elements in the region")

        # Only inspect the first ten elements to avoid cluttering the image,
        # and highlight those whose text mentions "fertilizer"
        for element in key_elements[:10]:
            if "fertilizer" in element.text.lower():
                element.highlight(label="Key Terms")

        # Save the image with a legend
        output_file = os.path.join(output_dir, "highlight_region.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved page with highlighted region to: {output_file}")

        print("\nEnd of highlighting demonstration.")
105
+
106
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # A path was supplied on the command line; it has to exist.
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument: fall back to the practice PDF expected in ../pdfs.
        pdf_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python highlighting_example.py [path/to/file.pdf]")
            sys.exit(1)

    highlighting_example(pdf_path)
@@ -0,0 +1,84 @@
1
+ """
2
+ Example demonstrating image width customization in to_image method.
3
+ """
4
+ import os
5
+ import sys
6
+
7
+ # Add the parent directory to the path to import the package
8
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
9
+
10
+ from natural_pdf import PDF
11
+
12
def image_width_example(pdf_path):
    """Render the first page of a PDF at several output widths.

    Rectangles and lines on the page are highlighted first, then the page is
    saved four times to the ``output`` directory one level above this script:
    with default sizing, with ``width=800``, with ``width=1200`` and with
    ``scale=3.0`` combined with ``width=600``. The pixel size of each
    returned image is printed.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages")

        # Directory that receives the rendered images
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # Start from a clean page and highlight some elements so the
        # renders have something to show
        page.clear_highlights()
        page.highlight_all(include_types=['rect', 'line'])

        # One row per scenario:
        # (heading, output file name, extra to_image keywords, size label)
        scenarios = [
            ("\nEXAMPLE 1: Image with default width",
             "width_default.png", {}, "Original image size"),
            ("\nEXAMPLE 2: Custom width of 800px",
             "width_800px.png", {"width": 800}, "Custom image size"),
            ("\nEXAMPLE 3: Custom width of 1200px",
             "width_1200px.png", {"width": 1200}, "Custom image size"),
            # Both scale and width given (width takes precedence for final output)
            ("\nEXAMPLE 4: Using both scale and width",
             "width_with_scale.png", {"scale": 3.0, "width": 600},
             "Scale 3.0 with width 600px"),
        ]

        for heading, file_name, sizing, size_label in scenarios:
            print(heading)
            print("-" * 60)

            output_file = os.path.join(output_dir, file_name)
            img = page.to_image(path=output_file, **sizing, show_labels=True)
            print(f"{size_label}: {img.width} x {img.height} pixels")
            print(f"Saved to: {output_file}")

        print("\nEnd of image width demonstration.")
66
+
67
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # A path was supplied on the command line; it has to exist.
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument: fall back to the practice PDF expected in ../pdfs.
        pdf_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python image_width_example.py [path/to/file.pdf]")
            sys.exit(1)

    image_width_example(pdf_path)
@@ -0,0 +1,128 @@
1
+ """
2
+ Example demonstrating the improved API consistency in natural-pdf.
3
+ """
4
+ import os
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ # Add the parent directory to the path to import the package
9
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
10
+
11
+ from natural_pdf import PDF
12
+
13
def consistency_example(pdf_path):
    """Tour the consistently named parameters of the API on page one.

    Runs five numbered sections against the first page of ``pdf_path``:
    region creation, spatial navigation from a heading, text extraction,
    highlighting of lines (saving ``improved_api_lines.png``), and chained
    ``add_exclusion`` calls on the PDF object. Results are printed.

    Args:
        pdf_path: Path of the PDF file to open.
    """
    # Open the PDF without OCR to avoid issues
    with PDF(pdf_path) as pdf:
        print(f"PDF has {len(pdf)} pages")
        page = pdf.pages[0]

        # --- 1. Regions built from named edges ---------------------------
        print("\n1. IMPROVED REGION CREATION:")
        header_region = page.region(top=0, bottom=100)
        print(f" Created header region with bounds {header_region.bbox}")

        # All four edges given explicitly, plus the element width mode
        custom_region = page.region(
            left=100,
            right=300,
            top=200,
            bottom=400,
            width="element",
        )
        print(f" Created custom region with bounds {custom_region.bbox}")

        # --- 2. Navigation relative to an element ------------------------
        print("\n2. IMPROVED SPATIAL NAVIGATION:")
        heading = page.find('text[size>=12]')
        if heading:
            print(f" Found heading: '{heading.text}'")

            # Fixed-height band above the heading, full width
            above_region = heading.above(height=50, width="full")
            print(f" Region above: {above_region.bbox}")

            # Fixed-height band below the heading, element width
            below_region = heading.below(height=100, width="element")
            print(f" Region below (element width): {below_region.bbox}")

            # Look for a second match of the same selector
            next_heading = page.find('text[size>=12]', skip=1)
            if next_heading:
                print(f" Found next heading: '{next_heading.text}'")

                # Region from the heading up to, but excluding, the next match
                between_region = heading.until(
                    'text[size>=12]',
                    include_endpoint=False,
                    width="full",
                )
                # Don't use OCR for text extraction
                print(f" Region between headings: {between_region.bbox}")

        # --- 3. Text extraction -------------------------------------------
        print("\n3. CONSISTENT EXTRACTION PARAMETERS:")
        text = page.extract_text(preserve_whitespace=True, use_exclusions=True)
        print(f" Extracted {len(text)} characters")

        # --- 4. Highlighting ----------------------------------------------
        print("\n4. CONSISTENT VISUAL METHODS:")
        lines = page.find_all('line[width>=1]')
        if lines:
            print(f" Found {len(lines)} thick lines")

            # Label first, then an explicit color
            lines.highlight(label="Thick Lines", color=(1, 0, 0, 0.5))

            # highlight() returns an object that can be saved directly
            highlighted = lines.highlight(label="Thick Lines")
            highlighted.save("improved_api_lines.png", show_labels=True)

        # --- 5. Chained exclusions ----------------------------------------
        print("\n5. BUILDER PATTERN:")
        # Regions for the top and bottom 50 units of the page
        header = page.region(top=0, bottom=50)
        footer = page.region(top=page.height-50, bottom=page.height)

        # Register both exclusions through chained calls
        with_headers = pdf.add_exclusion(
            lambda p: p.region(top=0, bottom=50),
            label="headers",
        )
        with_headers.add_exclusion(
            lambda p: p.region(top=p.height-50, bottom=p.height),
            label="footers",
        )

        # Extract again, now with the exclusions registered
        filtered_text = page.extract_text(use_exclusions=True)
        print(f" Extracted {len(filtered_text)} characters with exclusions")

        # add_exclusion hands back an object; report whether it is the PDF
        pdf_same = pdf.add_exclusion(lambda p: None, label="test")
        print(f" Method chaining returns same object: {pdf is pdf_same}")
111
+
112
if __name__ == "__main__":
    if len(sys.argv) >= 2:
        # A path was supplied on the command line; it has to exist.
        pdf_path = sys.argv[1]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No argument: fall back to the practice PDF expected in ../pdfs.
        pdf_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf')
        )
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python improved_api_example.py [path/to/file.pdf]")
            sys.exit(1)

    consistency_example(pdf_path)
@@ -0,0 +1,65 @@
1
+ """
2
+ Test displaying confidence scores in layout highlighting.
3
+
4
+ This example demonstrates how confidence scores are displayed next to
5
+ each layout region in both highlight_layout and highlight_all methods.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+
11
+ # Add the parent directory to the Python path
12
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from natural_pdf import PDF
14
+
15
# Project root = parent of the directory that holds this script
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# PDF used when no path is given on the command line
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Command line: optional positional PDF path and an optional --page index
parser = argparse.ArgumentParser(description="Layout confidence display test")
parser.add_argument(
    "pdf_path",
    nargs="?",
    default=default_pdf,
    help="Path to a PDF file",
)
parser.add_argument(
    "--page",
    type=int,
    default=0,
    help="Page number to analyze (0-based)",
)
args = parser.parse_args()

print(f"Testing confidence display on: {args.pdf_path}")
print(f"Page: {args.page}")

# Open the document and pick the requested page
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Run both layout models with a low threshold to get more regions;
# the second call passes existing="append"
print("Running layout analysis...")
page.analyze_layout(model="yolo", confidence=0.1)
page.analyze_layout(model="tatr", confidence=0.1, existing="append")
print(f"Found {len(page.detected_layout_regions)} total layout regions")

# --- Test 1: highlight_layout with its defaults ---
print("\nTest 1: Using highlight_layout with default format")
page.clear_highlights()
page.highlight_layout()
output_path = os.path.join(root_dir, "output", "conf_display_highlight_layout.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# --- Test 2: highlight_all including layout regions ---
print("\nTest 2: Using highlight_all with include_layout_regions=True")
page.clear_highlights()
page.highlight_all(include_layout_regions=True, layout_confidence=0.1)
output_path = os.path.join(root_dir, "output", "conf_display_highlight_all.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

# --- Test 3: highlight_all with an empty include_types list ---
print("\nTest 3: Using highlight_all with only layout regions")
page.clear_highlights()
page.highlight_all(
    include_layout_regions=True,
    include_types=[],
    layout_confidence=0.1,
)
output_path = os.path.join(root_dir, "output", "conf_display_layout_only.png")
page.to_image(path=output_path, show_labels=True)
print(f"Saved to {output_path}")

print("\nDone!")
@@ -0,0 +1,82 @@
1
+ """
2
+ Test the layout_confidence=True behavior in highlight_all method.
3
+
4
+ This example demonstrates that when layout_confidence=True is passed,
5
+ all layout regions are included regardless of their confidence score.
6
+ """
7
+ import os
8
+ import sys
9
+ import argparse
10
+
11
+ # Add the parent directory to the Python path
12
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
13
+ from natural_pdf import PDF
14
+
15
# Project root = parent of the directory that holds this script
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
# PDF used when no path is given on the command line
default_pdf = os.path.join(root_dir, "pdfs", "01-practice.pdf")

# Command line: optional positional PDF path and an optional --page index
parser = argparse.ArgumentParser(description="Layout confidence test")
parser.add_argument(
    "pdf_path",
    nargs="?",
    default=default_pdf,
    help="Path to a PDF file",
)
parser.add_argument(
    "--page",
    type=int,
    default=0,
    help="Page number to analyze (0-based)",
)
args = parser.parse_args()

print(f"Testing layout_confidence=True on: {args.pdf_path}")
print(f"Page: {args.page}")

# Open the document and pick the requested page
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Run both layout models with a low threshold to get more regions;
# the second call passes existing="append"
print("Running layout analysis...")
page.analyze_layout(model="yolo", confidence=0.1)
page.analyze_layout(model="tatr", confidence=0.1, existing="append")
print(f"Found {len(page.detected_layout_regions)} total layout regions")

# Bucket the detected regions by confidence: >=0.5, [0.2, 0.5), <0.2
high_conf = [
    region for region in page.detected_layout_regions
    if region.confidence >= 0.5
]
med_conf = [
    region for region in page.detected_layout_regions
    if 0.2 <= region.confidence < 0.5
]
low_conf = [
    region for region in page.detected_layout_regions
    if region.confidence < 0.2
]

print(f"High confidence (>=0.5): {len(high_conf)} regions")
print(f"Medium confidence (0.2-0.5): {len(med_conf)} regions")
print(f"Low confidence (<0.2): {len(low_conf)} regions")

# Every test clears the page, calls highlight_all with layout regions
# included, and saves a render. Only the layout_confidence argument
# (omitted in the first case) and the output file name differ.
# Rows: (banner, extra highlight_all keywords, output file name)
cases = [
    ("\nTest 1: Using default layout_confidence=0.2",
     {}, "layout_conf_default.png"),
    ("\nTest 2: Using layout_confidence=0.5 (high threshold)",
     {"layout_confidence": 0.5}, "layout_conf_high.png"),
    ("\nTest 3: Using layout_confidence=True (include all)",
     {"layout_confidence": True}, "layout_conf_all.png"),
    ("\nTest 4: Using layout_confidence=0.0 (include all)",
     {"layout_confidence": 0.0}, "layout_conf_zero.png"),
]

for banner, confidence_kwargs, file_name in cases:
    print(banner)
    page.clear_highlights()
    page.highlight_all(include_layout_regions=True, **confidence_kwargs)
    output_path = os.path.join(root_dir, "output", file_name)
    page.to_image(path=output_path, show_labels=True)
    print(f"Saved to {output_path}")

print("\nDone!")