natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,121 @@
"""
OCR Visualization Test

This example demonstrates the OCR text visualization feature using PaddleOCR.
"""
import os
import sys
from pathlib import Path

# Make the project root importable so the in-tree package is picked up
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
sys.path.insert(0, root_dir)

# Import the library
from natural_pdf import PDF

# Directory that receives the generated images
output_dir = os.path.join(root_dir, "output")
os.makedirs(output_dir, exist_ok=True)

# Candidate PDFs in order of preference. Only the first two are checked for
# existence; the last one is the unconditional fallback.
pdf_candidates = [
    os.path.join(root_dir, "pdfs", "needs-ocr.pdf"),
    os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf"),
    os.path.join(root_dir, "pdfs", "01-practice.pdf"),
]
pdf_path = next(
    (candidate for candidate in pdf_candidates[:-1] if os.path.exists(candidate)),
    pdf_candidates[-1],
)

print("OCR Visualization Test")
print("=====================")
print(f"Using PDF: {pdf_path}")


def _open_pdf(engine):
    """Open ``pdf_path`` with OCR enabled, using the named OCR engine."""
    return PDF(
        pdf_path,
        ocr_engine=engine,
        ocr={
            "enabled": True,
            "languages": ["en"],
            "min_confidence": 0.3,
        },
    )


def _confidence_color(confidence):
    """Map an OCR confidence to an RGBA highlight color (0-255 channels)."""
    if confidence >= 0.8:
        return (0, 255, 0, 180)  # Green for high confidence (more visible)
    if confidence >= 0.5:
        return (255, 255, 0, 180)  # Yellow for medium confidence
    return (255, 0, 0, 180)  # Red for low confidence


# Prefer PaddleOCR; any failure while constructing the PDF falls back to EasyOCR
try:
    pdf = _open_pdf("paddleocr")
    print("Using PaddleOCR engine")
except Exception as e:
    print(f"PaddleOCR initialization failed: {e}")
    print("Falling back to EasyOCR")
    pdf = _open_pdf("easyocr")

# Work on the first page only
page = pdf.pages[0]

# Force OCR text extraction
print("\nExtracting text with OCR...")
text = page.extract_text(ocr=True)
print(f"Extracted {len(text)} characters of text")
if text:
    print(f"First 100 chars: {text[:100]}...")

# Extract OCR elements
print("\nExtracting OCR elements...")
ocr_elements = page.extract_ocr_elements()
print(f"Found {len(ocr_elements)} OCR elements")

# Highlight every element, colored by confidence and labelled with its score
print("\nCreating highlight visualization...")
for elem in ocr_elements:
    elem.highlight(
        color=_confidence_color(elem.confidence),
        label=f"OCR ({elem.confidence:.2f})",
    )

# Save image with highlights only
highlight_path = os.path.join(output_dir, "ocr_visualization_highlights.png")
page.to_image(path=highlight_path, show_labels=True)
print(f"Saved highlighted image to {highlight_path}")

# Now use the OCR text rendering feature
if len(ocr_elements) == 0:
    print("\nNo OCR elements found to render.")
else:
    print("\nCreating rendered OCR text visualization...")

    # Highlights plus rendered OCR text
    ocr_text_path = os.path.join(output_dir, "ocr_visualization_text.png")
    try:
        page.to_image(path=ocr_text_path, show_labels=True, render_ocr=True)
        print(f"Saved OCR text rendering to {ocr_text_path}")
    except ValueError as e:
        print(f"Error rendering OCR text: {e}")

    # Drop the highlights and render the OCR text on its own
    print("\nCreating clean OCR text visualization...")
    page.clear_highlights()

    clean_text_path = os.path.join(output_dir, "ocr_visualization_clean.png")
    try:
        page.to_image(path=clean_text_path, render_ocr=True)
        print(f"Saved clean OCR text rendering to {clean_text_path}")
    except ValueError as e:
        print(f"Error rendering clean OCR text: {e}")

print("\nTest complete!")
@@ -0,0 +1,315 @@
"""
Document layout analysis example using PaddlePaddle's PP-Structure model.

This example demonstrates how to use PaddlePaddle for document layout analysis
to detect and extract content from different regions of a PDF document.

Features:
- Standard layout detection using PaddlePaddle's PP-Structure
- Enhanced text detection by combining PP-Structure with direct OCR
- Visualization of different region types and sources
- Comparison mode to evaluate performance with and without text detection
- Support for polygon-based text regions from OCR
"""
import argparse
import logging
import os
import sys
import time
from pathlib import Path

# Import the library with its logging utilities
sys.path.insert(0, str(Path(__file__).parent.parent))
from natural_pdf import configure_logging, PDF

# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)
# Default PDF path
default_pdf = os.path.join(root_dir, "pdfs", "HARRY ROQUE_redacted.pdf")

# Set up argument parser
parser = argparse.ArgumentParser(description="PaddlePaddle layout analysis example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
parser.add_argument("--lang", type=str, default="en", help="Language code (en, ch, etc.)")
parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on ('cpu' or 'gpu')")
parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
parser.add_argument("--disable-table", action="store_true", help="Disable table detection")
parser.add_argument("--text-detection", action="store_true", help="Enable direct text detection")
parser.add_argument("--compare", action="store_true", help="Compare with and without text detection")
parser.add_argument("--verbose", action="store_true", help="Show detailed debug output")
parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                    default="INFO", help="Set logging level")
args = parser.parse_args()

# --verbose overrides --log-level, so resolve the effective level first and
# configure logging a single time.
log_level = logging.DEBUG if args.verbose else getattr(logging, args.log_level)
configure_logging(level=log_level)

print(f"Analyzing PDF: {args.pdf_path}")
print(f"Page: {args.page}")
print(f"Confidence threshold: {args.conf}")

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

# Resolve the output location up front: every branch below writes images next
# to the main output file, so the directory must exist before the first save.
output_path = args.output or os.path.join(root_dir, "output", "paddle_layout_detection.png")
output_dir = os.path.dirname(output_path)
# dirname() is "" when --output is a bare filename; os.makedirs("") would raise,
# and the current directory already exists, so only create non-empty paths.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


def _sibling_output(filename):
    """Return the path of ``filename`` placed next to the main output image."""
    return os.path.join(output_dir, filename)


def _analyze_layout(detect_text):
    """Run the paddle layout model on ``page`` using the command-line settings.

    ``detect_text`` is forwarded as the ``detect_text`` model parameter.
    """
    return page.analyze_layout(
        model="paddle",
        confidence=args.conf,
        device=args.device,
        model_params={
            "lang": args.lang,
            "show_log": args.verbose,
            "detect_text": detect_text,
            "verbose": args.verbose,
        },
    )


print("Running PaddlePaddle layout analysis...")

# Enable debugging output
print("PDF page dimensions:", page.width, "x", page.height)

# Check if we should run comparison
if args.compare:
    print("\n=== Comparing Layout Detection With and Without Text Detection ===")

    # First run without text detection
    print("\nRunning WITHOUT text detection...")
    start = time.perf_counter()
    regions_without_text = _analyze_layout(detect_text=False)
    time_without = time.perf_counter() - start

    # Highlight without text detection
    page.highlight_layout()

    # Save the highlighted image
    output_without = _sibling_output("paddle_layout_without_text.png")
    page.to_image(path=output_without, show_labels=True)
    print(f"Found {len(regions_without_text)} regions WITHOUT text detection in {time_without:.2f} seconds")
    print(f"Saved image to {output_without}")

    # Clear highlights
    page.clear_highlights()

    # Then run with text detection
    print("\nRunning WITH text detection...")
    start = time.perf_counter()
    regions_with_text = _analyze_layout(detect_text=True)
    time_with = time.perf_counter() - start

    # Highlight with text detection
    page.highlight_layout()

    # Save the highlighted image
    output_with = _sibling_output("paddle_layout_with_text.png")
    page.to_image(path=output_with, show_labels=True)
    print(f"Found {len(regions_with_text)} regions WITH text detection in {time_with:.2f} seconds")
    print(f"Saved image to {output_with}")

    # Comparison
    print("\nComparison results:")
    print(f" - WITHOUT text detection: {len(regions_without_text)} regions in {time_without:.2f} seconds")
    print(f" - WITH text detection: {len(regions_with_text)} regions in {time_with:.2f} seconds")
    print(f" - Additional regions: {len(regions_with_text) - len(regions_without_text)}")
    # Guard the ratio: a zero-length baseline would otherwise raise ZeroDivisionError
    if time_without > 0:
        print(f" - Speed difference: {time_with / time_without:.2f}x longer with text detection")
    else:
        print(" - Speed difference: n/a (baseline run was too fast to measure)")

    # Continue with the regions from the requested mode
    regions = regions_with_text if args.text_detection else regions_without_text

else:
    # Run regular layout analysis
    regions = _analyze_layout(detect_text=args.text_detection)

print(f"Found {len(regions)} regions with confidence >= {args.conf}")

# Group regions by type and count them by source
regions_by_type = {}
sources = {"layout": 0, "ocr": 0, "unknown": 0}

for region in regions:
    regions_by_type.setdefault(region.region_type, []).append(region)

    # Regions without a `source` attribute are counted as "unknown"
    source = getattr(region, "source", "unknown")
    sources[source] = sources.get(source, 0) + 1

# Print a summary of detected regions by type
for region_type, type_regions in regions_by_type.items():
    print(f" - {region_type}: {len(type_regions)} regions")

# Print source information
print("\nRegion sources:")
for source, count in sources.items():
    print(f" - {source}: {count} regions")

# If the user enabled text detection, show source-specific highlighting
if args.text_detection:
    print("\nHighlighting regions by source...")

    # Clear any existing highlights
    page.clear_highlights()

    # Get text regions separately using normalized_type
    text_regions = page.find_all('region[normalized_type=plain-text][model=paddle]')
    figure_regions = page.find_all('region[normalized_type=figure][model=paddle]')

    # Highlight figure regions in blue
    for region in figure_regions:
        region.highlight(color=(0, 0, 1, 0.3), label=f"Figure: {region.region_type}")

    # Highlight text regions in green
    for region in text_regions:
        region.highlight(color=(0, 1, 0, 0.3), label=f"Text: {region.region_type}")

    # Save the source-highlighted image
    sources_output = _sibling_output("paddle_layout_sources.png")
    page.to_image(path=sources_output, show_labels=True)
    print(f"Saved source-highlighted layout to {sources_output}")

    # Show polygon visualizations if any OCR regions have polygons
    regions_with_polygons = [r for r in regions if hasattr(r, "polygon")]
    if regions_with_polygons:
        print(f"\nVisualizing {len(regions_with_polygons)} regions with polygon points...")
        page.clear_highlights()

        # Highlight regions with polygons in red
        for region in regions_with_polygons:
            region.highlight(color=(1, 0, 0, 0.3), label="Polygon Region")

        # Save the polygon-highlighted image
        polygon_output = _sibling_output("paddle_layout_polygons.png")
        page.to_image(path=polygon_output, show_labels=True)
        print(f"Saved polygon visualization to {polygon_output}")

    # Clear highlights for standard view
    page.clear_highlights()

# Highlight all detected regions normally
page.highlight_all(include_layout_regions=True, layout_confidence=args.conf)

# These selector queries do not depend on the region type, so run them once
# instead of repeating them for every type in the loop below.
paddle_regions = page.find_all("region[model=paddle]")
layout_regions = page.find_all("region[source=layout]")
ocr_regions = page.find_all("region[source=ocr]")
detected_regions = page.find_all("region[source=detected]")

# Demonstrate using selectors to find regions by type and model
print("\nSelecting regions by type and model:")
for region_type in regions_by_type:
    # Convert spaces to hyphens for selector syntax
    selector_type = region_type.lower().replace(' ', '-')
    is_text_type = region_type.lower() == 'text'

    # The 'text' type is queried under the normalized name 'plain-text'
    normalized_name = "plain-text" if is_text_type else selector_type
    selector = f"region[normalized_type={normalized_name}][model=paddle]"

    found_regions = page.find_all(selector)
    print(f" - {selector}: {len(found_regions)} regions")

    # Try different selectors to debug the issue
    model_regions = page.find_all(f"region[type={selector_type}]")

    print(f" - With type only: {len(model_regions)} regions")
    print(f" - With model=paddle: {len(paddle_regions)} regions")
    print(f" - With source=layout: {len(layout_regions)} regions")
    print(f" - With source=ocr: {len(ocr_regions)} regions")
    print(f" - With source=detected: {len(detected_regions)} regions")

    # Debug a sample region
    if model_regions:
        region = model_regions[0]
        print(
            f" - Sample region attributes: type={region.region_type}, "
            f"normalized_type={getattr(region, 'normalized_type', 'N/A')}, "
            f"source={getattr(region, 'source', 'N/A')}, "
            f"model={getattr(region, 'model', 'N/A')}"
        )

    # For text regions, dump the first few detected regions to help debugging
    if is_text_type and detected_regions:
        for i, r in enumerate(detected_regions[:10]):
            print(f" - Detected region {i}: type={r.region_type}, normalized_type={getattr(r, 'normalized_type', 'N/A')}")

    # Extract text from the first region if available
    if found_regions:
        text = found_regions[0].extract_text()
        preview = text[:50] + "..." if len(text) > 50 else text
        print(f" First region text: {preview}")

# Save the highlighted image (the output directory was created above)
print(f"\nSaving highlighted layout to {output_path}")
page.to_image(path=output_path, show_labels=True)
print("Done!")

# Show an example of working with a table region
table_regions = regions_by_type.get("table")
if table_regions:
    print("\nExample: Working with a detected table region")
    table_region = table_regions[0]

    # Table extraction is best-effort in this demo: report the failure and
    # carry on so the highlighted table image is still produced.
    try:
        # Try using the extract_table method on the region
        table_data = table_region.extract_table()
        print(f" Extracted {len(table_data)} rows from table")

        # Show some table data
        for i, row in enumerate(table_data[:2]):  # Show first 2 rows
            print(f" Row {i}: {row}")

        # Check for cells
        cells = page.find_all('region[type=table_cell][model=paddle]')
        if cells:
            print(f"\n Found {len(cells)} table cells")
            cell = cells[0]
            print(f" First cell text: {cell.extract_text()}")
            print(f" Row index: {getattr(cell, 'row_idx', 'N/A')}, Column index: {getattr(cell, 'col_idx', 'N/A')}")
    except Exception as e:
        print(f" Error extracting table data: {e}")

    # Save the highlighted table
    table_output = _sibling_output("paddle_detected_table.png")
    table_region.highlight(color=(0, 1, 0, 0.3), label="PaddlePaddle Table")
    page.to_image(path=table_output, show_labels=True)
    print(f" Table highlighted image saved to {table_output}")
@@ -0,0 +1,74 @@
"""
Simple test of PaddlePaddle layout analysis using minimal parameters.

Usage: paddle_layout_simple.py [pdf_path] [page_number]
"""
import os
import sys
from pathlib import Path

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from natural_pdf import PDF

# Get the current directory of this script
script_dir = os.path.dirname(os.path.realpath(__file__))
# Get the parent directory (project root)
root_dir = os.path.dirname(script_dir)

# Get PDF path from command line or use default
if len(sys.argv) > 1:
    pdf_path = sys.argv[1]
else:
    # Default PDF path
    pdf_path = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Get page number (0-based index into pdf.pages) from command line or use default
page_num = int(sys.argv[2]) if len(sys.argv) > 2 else 0

print(f"Analyzing PDF: {pdf_path}")
print(f"Page: {page_num}")

# Load the PDF
pdf = PDF(pdf_path)
page = pdf.pages[page_num]

print("Running PaddlePaddle layout analysis...")

# Run paddle layout analysis using our minimal approach
regions = page.analyze_layout(
    model="paddle",
    confidence=0.2,  # Lower confidence threshold to detect more regions
    model_params={
        "show_log": True,
    },
)

print(f"Found {len(regions)} regions")

# Group regions by type and source
region_groups = {}
for region in regions:
    region_type = region.region_type
    # Regions without a `source` attribute are grouped under "unknown"
    source = getattr(region, 'source', 'unknown')
    group_key = f"{region_type} ({source})"
    region_groups.setdefault(group_key, []).append(region)

# Print regions by type and source
for group_key, group_regions in region_groups.items():
    print(f"{group_key}: {len(group_regions)} regions")

# Highlight every region, labelled with its "type (source)" group.
# No explicit color is passed here; only the label differs between groups.
print("Highlighting regions...")
for group_key, group_regions in region_groups.items():
    for region in group_regions:
        region.highlight(label=f"{group_key}")

# Save highlighted image
output_path = os.path.join(root_dir, "output", "paddle_layout_simple.png")
# Make sure the output directory exists before writing into it, as the other
# example scripts do.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
print(f"Saving highlighted image to {output_path}")
page.to_image(path=output_path, show_labels=True)

print("Done!")