natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,123 @@
1
+ """
2
+ Document layout analysis example using YOLO model.
3
+
4
+ This example demonstrates how to use the document layout analysis
5
+ functionality to detect and extract content from different regions
6
+ of a PDF document.
7
+ """
8
+ import os
9
+ import sys
10
+ import argparse
11
+
12
+ # Add the parent directory to the Python path
13
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
14
+ from natural_pdf import PDF
15
+
16
# Resolve the default sample PDF relative to the project root (the parent of
# this script's directory) so the default does not depend on the working
# directory the example is launched from.
script_dir = os.path.dirname(os.path.realpath(__file__))
root_dir = os.path.dirname(script_dir)
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Command-line interface: every option has a default, so the script also runs
# with no arguments at all.
parser = argparse.ArgumentParser(description="Document layout analysis example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
parser.add_argument("--model-path", type=str, default=None, help="Path to custom YOLO model")
parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on ('cpu' or 'cuda:0')")
parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
args = parser.parse_args()

print(f"Analyzing PDF: {args.pdf_path}")
print(f"Page: {args.page}")
print(f"Confidence threshold: {args.conf}")

# Load the PDF and pick the requested page
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

print("Running document layout analysis...")

# Run document layout analysis
# The analyze_layout method now returns self for method chaining
page.analyze_layout(
    confidence=args.conf,
    model_path=args.model_path,
    device=args.device,
)

print(f"Found {len(page.detected_layout_regions)} regions with confidence >= {args.conf}")

# Group regions by type, preserving the order in which types were first seen
regions_by_type = {}
for region in page.detected_layout_regions:
    regions_by_type.setdefault(region.region_type, []).append(region)

# Print a summary of detected regions by type
for region_type, type_regions in regions_by_type.items():
    print(f" - {region_type}: {len(type_regions)} regions")

# You can highlight layout regions in two ways:
# 1. Using the dedicated highlight_layout method
# page.highlight_layout(regions, confidence=args.conf)

# 2. Using highlight_all with include_layout_regions=True
page.highlight_all(include_layout_regions=True, layout_confidence=args.conf)

# Demonstrate using selectors to find regions by type
print("\nSelecting regions by type:")
for region_type in regions_by_type:
    # Convert spaces to hyphens for selector syntax
    selector_type = region_type.lower().replace(' ', '-')
    selector = f"region[type={selector_type}]"

    found_regions = page.find_all(selector)
    print(f" - {selector}: {len(found_regions)} regions")

    # Extract text from the first region if available
    if found_regions:
        text = found_regions[0].extract_text()
        preview = text[:50] + "..." if len(text) > 50 else text
        print(f" First region text: {preview}")

# Finding high-confidence titles
high_conf_titles = page.find_all('region[type=title][confidence>=0.8]')
if high_conf_titles:
    print(f"\nFound {len(high_conf_titles)} high-confidence titles:")
    for i, title in enumerate(high_conf_titles):
        text = title.extract_text().strip()
        print(f" {i+1}. {text} (conf: {title.confidence:.2f})")

# Save the highlighted image
output_path = args.output or os.path.join(root_dir, "output", "layout_detection.png")
# A bare filename such as "--output out.png" has an empty dirname, and
# os.makedirs("") raises FileNotFoundError, so only create the directory when
# the path actually names one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
print(f"\nSaving highlighted layout to {output_path}")
page.to_image(path=output_path, show_labels=True)
print("Done!")

# Show an example of using a detected region for further analysis
if regions_by_type.get("table"):
    print("\nExample: Working with a detected table region")
    table_region = regions_by_type["table"][0]

    # Highlight the table region with a specific color
    table_region.highlight(label="Selected Table", color=(0, 1, 0, 0.3))

    # Find text elements within the table region
    table_text = table_region.find_all('text')
    print(f" Found {len(table_text)} text elements in the table")

    # Extract the table text
    table_content = table_region.extract_text()
    preview = table_content[:100] + "..." if len(table_content) > 100 else table_content
    print(f" Table content: {preview}")

    # Save the highlighted table next to the main output image
    table_output = os.path.join(output_dir, "detected_table.png")
    page.to_image(path=table_output, show_labels=True)
    print(f" Table highlighted image saved to {table_output}")
@@ -0,0 +1,185 @@
1
+ """
2
+ Example demonstrating the Document QA capabilities of Natural PDF.
3
+
4
+ This example shows how to:
5
+ 1. Ask questions to a PDF document
6
+ 2. Ask questions to specific pages
7
+ 3. Ask questions to specific regions
8
+ 4. Control confidence thresholds
9
+ 5. Highlight answer elements
10
+ 6. Handle QA results
11
+
12
+ Requirements:
13
+ - transformers
14
+ - torch
15
+ """
16
+
17
+ import os
18
+ import sys
19
+ import argparse
20
+ from PIL import Image, ImageDraw, ImageFont
21
+ import logging
22
+ from typing import Dict, Any
23
+
24
+ # Add parent directory to path to run without installing
25
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
26
+
27
+ from natural_pdf import PDF, configure_logging
28
+
29
def format_qa_result(result: Dict[str, Any]) -> str:
    """Format a QA result as a single human-readable line.

    Args:
        result: Mapping describing a QA outcome. The keys read here are
            ``found``, ``message``, ``answer``, ``confidence`` and
            ``page_num``; every one of them is optional.

    Returns:
        ``"No answer found."`` (followed by the message, if any) when
        ``found`` is missing or falsy; otherwise a line with the answer, the
        confidence rendered with two decimals, and the page number.
    """
    if not result.get("found", False):
        message = result.get("message") or ""
        # Strip so an absent/empty message does not leave a dangling space.
        return f"No answer found. {message}".rstrip()

    answer = result.get("answer", "")
    # A key that is present but set to None would break the ':.2f' format
    # below, so treat it the same as a missing confidence.
    confidence = result.get("confidence")
    if confidence is None:
        confidence = 0.0
    page_num = result.get("page_num", 0)

    return f"Answer: {answer} (confidence: {confidence:.2f}, page: {page_num})"
39
+
40
def main():
    """Run the Document QA walkthrough from the command line.

    Parses the CLI options, opens the PDF, and asks every question either to
    the first page as a whole or (with ``--region``) to each detected layout
    region, optionally saving an annotated image of the answer. Finally the
    same questions are asked against the whole document.
    """
    cli = argparse.ArgumentParser(description="Document QA Example")
    cli.add_argument(
        "pdf_path",
        nargs="?",
        default="../pdfs/0500000US42001.pdf",
        help="Path to PDF document",
    )
    cli.add_argument(
        "--questions",
        nargs="+",
        default=[
            "How many votes for Harris and Walz?",
            "How many votes for Trump and Vance?",
            "What precinct is this for?",
            "What state is this for?",
        ],
        help="Questions to ask",
    )
    cli.add_argument("--highlight", action="store_true", help="Highlight answer elements")
    cli.add_argument(
        "--min-confidence",
        type=float,
        default=0.2,
        help="Minimum confidence threshold (0.0-1.0)",
    )
    cli.add_argument("--verbose", action="store_true", help="Enable verbose output")
    cli.add_argument(
        "--model",
        default="impira/layoutlm-document-qa",
        help="Model to use (default: impira/layoutlm-document-qa)",
    )
    cli.add_argument(
        "--region",
        action="store_true",
        help="Ask questions to specific regions instead of whole pages",
    )
    args = cli.parse_args()

    # Verbose mode switches the library logging to DEBUG
    configure_logging(level=logging.DEBUG if args.verbose else logging.INFO)

    # Open the PDF; this example only looks at the first page
    pdf = PDF(args.pdf_path)
    page = pdf.pages[0]

    print(f"Document: {args.pdf_path}")
    print(f"Page count: {len(pdf.pages)}")
    print(f"Model: {args.model}")
    print(f"Minimum confidence: {args.min_confidence}")
    print()

    # All images written below go into ./output
    os.makedirs("output", exist_ok=True)

    # Region mode needs the layout detected up front
    if args.region:
        print("Detecting document layout...")
        page.analyze_layout(confidence=0.3)
        regions = page.find_all('region')
        print(f"Found {len(regions)} regions")

        # Save an image with detected regions
        page.highlight_layout()
        page.save_image("output/document_qa_regions.png")
        print("Saved layout visualization to output/document_qa_regions.png")
        print()

    # Only these region types are asked in region mode
    askable_types = ('title', 'plain-text', 'table', 'list')

    for i, question in enumerate(args.questions):
        print(f"Question {i+1}: {question}")

        if not args.region:
            # Ask the whole page
            result = page.ask(
                question=question,
                min_confidence=args.min_confidence,
                model=args.model,
            )
            print(format_qa_result(result))

            # Highlight the answer if requested
            wants_image = (
                args.highlight
                and result.get("found", False)
                and result.get("source_elements")
            )
            if wants_image:
                highlight_image = page.duplicate()
                for element in result["source_elements"]:
                    element.highlight(color=(1, 0.5, 0, 0.5))  # Orange highlight

                # Add question and answer as text annotation
                highlight_image.annotate_text(
                    x=50, y=20,
                    text=f"Q: {question}\nA: {result['answer']} (confidence: {result['confidence']:.2f})",
                    font_size=14,
                    color=(0, 0, 0),
                )

                # Save the highlighted image
                output_path = f"output/document_qa_answer_{i+1}.png"
                highlight_image.save_image(output_path)
                print(f"Saved answer visualization to {output_path}")
        else:
            # Collect every region-level answer that was actually found
            candidates = []
            for region in regions:
                if region.region_type not in askable_types:
                    continue
                reply = region.ask(
                    question=question,
                    min_confidence=args.min_confidence,
                    model=args.model,
                )
                if reply.get("found", False):
                    candidates.append(reply)

            if not candidates:
                print("No answer found in any region")
            else:
                # Highest confidence wins; on ties the earliest region is kept
                result = max(candidates, key=lambda item: item.get("confidence", 0))
                print(format_qa_result(result))

                # Highlight the answer if requested
                if args.highlight and result.get("source_elements"):
                    highlight_image = page.duplicate()
                    if "region" in result:
                        region_type = result["region"].region_type
                    else:
                        region_type = "unknown"
                    for element in result["source_elements"]:
                        element.highlight(color=(1, 0.5, 0, 0.5))  # Orange highlight

                    # Add question and answer as text annotation
                    highlight_image.annotate_text(
                        x=50, y=20,
                        text=f"Q: {question}\nA: {result['answer']} (confidence: {result['confidence']:.2f}, region: {region_type})",
                        font_size=14,
                        color=(0, 0, 0),
                    )

                    # Save the highlighted image
                    output_path = f"output/document_qa_answer_{i+1}.png"
                    highlight_image.save_image(output_path)
                    print(f"Saved answer visualization to {output_path}")

        print()

    # Try a different PDF approach - ask the whole document
    print("Asking questions to the whole document:")

    for i, question in enumerate(args.questions):
        print(f"Question {i+1}: {question}")
        result = pdf.ask(
            question=question,
            min_confidence=args.min_confidence,
            model=args.model,
        )
        print(format_qa_result(result))
        print()


if __name__ == "__main__":
    main()
@@ -0,0 +1,128 @@
1
+ """
2
+ Debug script to compare element counts with different exclusion methods.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def debug_element_counts():
    """Compare text-element counts across three ways of excluding the area above a line.

    Case 1 adds the region above the first line as a page-level exclusion,
    case 2 adds the same thing as a PDF-level lambda, and case 3 adds a
    PDF-level callable that rebuilds the region from fixed coordinates. The
    counts and region coordinates are printed so the cases can be compared.

    Exits with status 1 when no PDF can be located or when the first page has
    no line element to anchor the exclusion on.
    """
    # Look for any PDF in the examples directory, then in the sibling "pdfs"
    # directory. Results are sorted because glob order is not guaranteed and
    # the debug output should refer to the same file on every run.
    example_dir = Path(__file__).parent
    pdf_files = sorted(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = sorted(pdfs_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF file found.")
        sys.exit(1)
    pdf_path = str(pdf_files[0])

    print(f"Using PDF: {pdf_path}")

    # Case 1: Direct page-level exclusion
    print("\n=== Case 1: Direct page-level exclusion ===")
    pdf1 = PDF(pdf_path)
    page1 = pdf1.pages[0]

    # First count without exclusions
    all_text_no_exclusions = page1.find_all('text')
    print(f"Before exclusion: {len(all_text_no_exclusions)} text elements")

    # Every case below is anchored on the first line of the page; without one
    # there is nothing to compare, so stop with a message instead of failing
    # on a missing element.
    line1 = page1.find('line')
    if not line1:
        print("No line element found on the first page; cannot build the exclusion region.")
        sys.exit(1)

    # Count the elements in the region to be excluded
    region_above = line1.above()
    elements_in_region = page1.find_all('text')
    excluded_count = sum(
        1 for element in elements_in_region
        if region_above._is_element_in_region(element)
    )
    print(f"Region above line contains {excluded_count} elements")

    # Now add the exclusion and count again
    page1.add_exclusion(region_above)
    all_text_with_exclusion = page1.find_all('text')
    print(f"After direct exclusion: {len(all_text_with_exclusion)} text elements")
    print(f"Elements excluded: {len(all_text_no_exclusions) - len(all_text_with_exclusion)}")

    # Debug the exclusion regions
    exclusion_regions = page1._get_exclusion_regions(include_callable=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}, x0={region.x0}, x1={region.x1}")

    # Case 2: PDF-level exclusion with lambda
    print("\n=== Case 2: PDF-level exclusion with lambda ===")
    pdf2 = PDF(pdf_path)

    # Add lambda exclusion at PDF level
    pdf2.add_exclusion(lambda page: page.find('line').above())
    page2 = pdf2.pages[0]

    # Count after exclusion
    all_text_with_lambda_exclusion = page2.find_all('text')
    print(f"After PDF-level exclusion: {len(all_text_with_lambda_exclusion)} text elements")

    # Debug the exclusion regions
    print("\nExclusion regions from PDF-level lambda:")
    exclusion_regions = page2._get_exclusion_regions(include_callable=True, debug=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}, x0={region.x0}, x1={region.x1}")

    # Compare results
    print("\n=== Comparison ===")
    print(f"Direct page exclusion count: {len(all_text_with_exclusion)}")
    print(f"PDF-level lambda exclusion count: {len(all_text_with_lambda_exclusion)}")

    # Examine if the region generated by the lambda is identical to the direct region
    if len(exclusion_regions) > 0:
        direct_region = region_above
        lambda_region = exclusion_regions[0]

        print("\nRegion comparison:")
        print(f"Direct region: top={direct_region.top}, bottom={direct_region.bottom}, x0={direct_region.x0}, x1={direct_region.x1}")
        print(f"Lambda region: top={lambda_region.top}, bottom={lambda_region.bottom}, x0={lambda_region.x0}, x1={lambda_region.x1}")

        # Check if regions are identical
        regions_identical = (
            direct_region.top == lambda_region.top
            and direct_region.bottom == lambda_region.bottom
            and direct_region.x0 == lambda_region.x0
            and direct_region.x1 == lambda_region.x1
        )
        print(f"Regions are identical: {regions_identical}")

    # Case 3: Modified lambda approach - create a lambda that exactly reproduces the region
    print("\n=== Case 3: Explicit region lambda ===")
    pdf3 = PDF(pdf_path)

    # Get the exact coordinates from the first run. The page now carries an
    # exclusion, so fall back to the region computed earlier if the line is
    # no longer returned.
    line3 = pdf1.pages[0].find('line')
    region3 = line3.above() if line3 else region_above

    # Create a callable that returns a fixed region with those coordinates
    def fixed_region_lambda(page):
        return page.create_region(region3.x0, region3.top, region3.x1, region3.bottom)

    pdf3.add_exclusion(fixed_region_lambda)
    page3 = pdf3.pages[0]

    # Count with this explicit region lambda
    all_text_with_explicit_lambda = page3.find_all('text')
    print(f"With explicit region lambda: {len(all_text_with_explicit_lambda)} text elements")


if __name__ == "__main__":
    debug_element_counts()
@@ -0,0 +1,107 @@
1
+ """
2
+ Example to debug exclusion issues with highlighting.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def debug_exclusions():
    """Debug exclusion problem.

    Renders the first page of a PDF with highlights under three setups (a
    page-level exclusion, a PDF-level lambda exclusion, and ``find_all`` with
    exclusions applied) and saves one image per setup into a ``debug_output``
    directory next to this script.

    Exits with status 1 when no PDF can be located or when the first page has
    no line element to anchor the exclusion on.
    """
    # Look for any PDF in the examples directory, then in the sibling "pdfs"
    # directory. Results are sorted because glob order is not guaranteed and
    # the saved images should come from the same file on every run.
    example_dir = Path(__file__).parent
    pdf_files = sorted(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = sorted(pdfs_dir.glob("*.pdf"))

    if not pdf_files:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)
    pdf_path = str(pdf_files[0])

    print(f"Using PDF: {pdf_path}")

    # Case 1: Direct page exclusion - expected to work
    print("\n=== Case 1: Direct page exclusion ===")
    pdf1 = PDF(pdf_path)
    page1 = pdf1.pages[0]

    # Create a debug output directory
    output_dir = Path(__file__).parent / "debug_output"
    output_dir.mkdir(exist_ok=True)

    # First, save without exclusions for comparison
    page1.highlight_all()
    page1.save(str(output_dir / "case1_no_exclusion.png"), labels=True)
    page1.clear_highlights()

    # Every case below is anchored on the first line of the page; stop with a
    # message instead of failing on a missing element.
    line1 = page1.find('line')
    if not line1:
        print("No line element found on the first page; cannot build the exclusion region.")
        sys.exit(1)

    # Log exclusions we're adding
    print(f"Adding exclusion for region above line at {line1.top}")

    # Add exclusion directly to page
    page1.add_exclusion(line1.above())

    # Show all exclusion regions
    exclusion_regions = page1._get_exclusion_regions(include_callable=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}")

    # Apply highlight with exclusions
    page1.highlight_all(apply_exclusions=True)
    page1.save(str(output_dir / "case1_with_exclusion.png"), labels=True)

    # Case 2: PDF-level exclusion - not working correctly
    print("\n=== Case 2: PDF-level exclusion ===")
    pdf2 = PDF(pdf_path)

    # This should work exactly the same as Case 1
    pdf2.add_exclusion(lambda page: page.find('line').above())
    page2 = pdf2.pages[0]

    # Show all exclusion regions for comparison
    exclusion_regions = page2._get_exclusion_regions(include_callable=True, debug=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}")

    # Save highlighting result
    page2.highlight_all(apply_exclusions=True)
    page2.save(str(output_dir / "case2_with_exclusion.png"), labels=True)

    # Case 3: Using find_all with exclusions - for comparison
    print("\n=== Case 3: Using find_all with exclusions ===")
    pdf3 = PDF(pdf_path)
    pdf3.add_exclusion(lambda page: page.find('line').above())
    page3 = pdf3.pages[0]

    # Check what find_all returns with exclusions
    all_text = page3.find_all('text', apply_exclusions=True)
    print(f"find_all('text') returns {len(all_text)} elements with exclusions")

    # Highlight just those elements
    all_text.highlight(label="Text with exclusions")
    page3.save(str(output_dir / "case3_find_all_with_exclusion.png"), labels=True)

    # Compare to highlight_all
    page3.clear_highlights()
    page3.highlight_all(apply_exclusions=True)
    page3.save(str(output_dir / "case3_highlight_all.png"), labels=True)

    print(f"\nResults saved to {output_dir}")


if __name__ == "__main__":
    debug_exclusions()
@@ -0,0 +1,150 @@
1
+ """
2
+ Example demonstrating how to use exclusion zones in Natural PDF.
3
+ """
4
+
5
+ import os
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ # Add parent directory to path for imports
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from natural_pdf import PDF
13
+
14
+
15
def example_page_level_exclusion(pdf_path):
    """
    Walk through page-level exclusion zones on the first page of a PDF.

    Prints the page text four times: untouched, after excluding everything
    above the "Summary" text, after additionally excluding everything below
    the last line, and finally with exclusions switched off.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        # Baseline: the full text for comparison
        print("\n--- Original Text ---")
        print(page.extract_text())

        # First exclusion: anything above a heading
        print("\n--- After Excluding Header ---")
        header = page.find('text:contains("Summary")')
        if not header:
            print("Header not found. Try with a different selector.")
        else:
            # Register the exclusion, then extract with it applied
            page.add_exclusion(header.above())
            print(page.extract_text())

        # Second exclusion: content below the last line
        print("\n--- After Excluding Header and Footer ---")
        lines = page.find_all('line')
        has_lines = bool(lines) and len(lines) > 0
        if not has_lines:
            print("Line not found. Try with a different selector.")
        else:
            # Use the collection's own `last` attribute when it exposes one
            if hasattr(lines, 'last'):
                last_line = lines.last
            else:
                last_line = lines[-1]
            page.add_exclusion(last_line.below())
            print(page.extract_text())

        # Exclusions can be bypassed per call
        print("\n--- With Exclusions Disabled ---")
        print(page.extract_text(apply_exclusions=False))
50
+
51
+
52
def example_pdf_level_exclusion(pdf_path):
    """
    Walk through PDF-level exclusion zones driven by per-page callables.

    Registers a header and a footer exclusion on the whole document, then
    prints text extracted with exclusions applied (with and without debug
    output) and with exclusions disabled, reporting the character counts.
    """
    with PDF(pdf_path) as pdf:
        first_page = pdf.pages[0]

        # Baseline text from the first page for comparison
        print("\n=== Original Text from First Page ===")
        raw_first_page = first_page.extract_text(apply_exclusions=False)
        print(raw_first_page[:200] + "...")

        # Exclusion callables: each returns a region, or None when the page
        # has no anchor or the lookup fails, reporting the reason either way.
        def header_exclusion(page):
            try:
                header = page.find('text:contains("Page")')
                if not header:
                    print(f"Page {page.index}: No 'Page' text found for header exclusion")
                    return None
                return header.above()
            except Exception as e:
                print(f"ERROR in header exclusion for page {page.index}: {e}")
                return None

        def footer_exclusion(page):
            try:
                lines = page.find_all('line')
                if not (lines and len(lines) > 0):
                    print(f"Page {page.index}: No lines found for footer exclusion")
                    return None
                return lines[-1].below()
            except Exception as e:
                print(f"ERROR in footer exclusion for page {page.index}: {e}")
                return None

        # Document-wide exclusions:
        # 1. headers - everything above the text containing "Page"
        pdf.add_exclusion(header_exclusion, label="headers")
        # 2. footers - everything below the last line
        pdf.add_exclusion(footer_exclusion, label="footers")

        # Cleaned text from the first page
        print("\n=== Cleaned Text from First Page ===")
        cleaned_first_page = pdf.pages[0].extract_text()
        print(cleaned_first_page[:200] + "...")

        # Whole-document extraction with exclusions and detailed debugging
        print("\n=== Extracting from Entire Document with Exclusions ===")
        print("\n--- DETAILED DEBUG INFO ---")
        full_text = pdf.extract_text(debug_exclusions=True)
        print("--- END OF DEBUG INFO ---\n")

        print(f"Extracted {len(full_text)} characters with exclusions applied")
        print(full_text[:200] + "...")

        # Regular extraction (for comparison)
        print("\n=== Regular Extraction Without Debug Info ===")
        full_text_no_debug = pdf.extract_text()
        print(f"Extracted {len(full_text_no_debug)} characters without debug output")

        # Extraction with exclusions disabled (for comparison)
        print("\n=== Extracting with Exclusions Disabled (for comparison) ===")
        full_text_no_exclusions = pdf.extract_text(apply_exclusions=False)
        print(f"Extracted {len(full_text_no_exclusions)} characters with exclusions disabled")
        if len(full_text) != len(full_text_no_exclusions):
            print(f"Difference: {len(full_text_no_exclusions) - len(full_text)} characters were excluded")
115
+
116
+
117
def main():
    """Main entry point.

    Takes the PDF path from the first command-line argument when one is
    given; otherwise uses the first PDF found next to this script or in the
    sibling ``pdfs`` directory, exiting with status 1 if there is none. Then
    runs the page-level and the PDF-level exclusion examples on it.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # Look for any PDF in the examples directory or pdfs directory
        example_dir = Path(__file__).parent
        pdf_files = list(example_dir.glob("*.pdf"))

        if not pdf_files:
            pdfs_dir = example_dir.parent / "pdfs"
            if pdfs_dir.exists():
                pdf_files = list(pdfs_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(pdf_files[0])

    print(f"Using PDF: {pdf_path}")

    # Run the page-level example
    print("\n=== Page-Level Exclusion Example ===")
    example_page_level_exclusion(pdf_path)

    # Run the PDF-level example
    print("\n=== PDF-Level Exclusion Example ===")
    example_pdf_level_exclusion(pdf_path)


if __name__ == "__main__":
    main()