natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,123 @@
|
|
1
|
+
"""
|
2
|
+
Document layout analysis example using YOLO model.
|
3
|
+
|
4
|
+
This example demonstrates how to use the document layout analysis
|
5
|
+
functionality to detect and extract content from different regions
|
6
|
+
of a PDF document.
|
7
|
+
"""
|
8
|
+
import os
|
9
|
+
import sys
|
10
|
+
import argparse
|
11
|
+
|
12
|
+
# Add the parent directory to the Python path
|
13
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
14
|
+
from natural_pdf import PDF
|
15
|
+
|
16
|
+
# Resolve paths relative to this script so the example works from any
# working directory.
script_dir = os.path.dirname(os.path.realpath(__file__))
# The project root is the parent of the examples directory.
root_dir = os.path.dirname(script_dir)
# PDF analysed when no path is given on the command line.
default_pdf = os.path.join(root_dir, "pdfs", "2019 Statistics.pdf")

# Set up argument parser
parser = argparse.ArgumentParser(description="Document layout analysis example")
parser.add_argument("pdf_path", nargs="?", default=default_pdf, help="Path to a PDF file")
parser.add_argument("--page", type=int, default=0, help="Page number to analyze (0-based)")
parser.add_argument("--conf", type=float, default=0.2, help="Confidence threshold for detections")
parser.add_argument("--model-path", type=str, default=None, help="Path to custom YOLO model")
parser.add_argument("--device", type=str, default="cpu", help="Device to run inference on ('cpu' or 'cuda:0')")
parser.add_argument("--output", type=str, default=None, help="Output file path for highlighted image")
args = parser.parse_args()

print(f"Analyzing PDF: {args.pdf_path}")
print(f"Page: {args.page}")
print(f"Confidence threshold: {args.conf}")

# Load the PDF
pdf = PDF(args.pdf_path)
page = pdf.pages[args.page]

print("Running document layout analysis...")

# Run document layout analysis
# The analyze_layout method now returns self for method chaining
page.analyze_layout(
    confidence=args.conf,
    model_path=args.model_path,
    device=args.device
)

print(f"Found {len(page.detected_layout_regions)} regions with confidence >= {args.conf}")

# Group regions by type
regions_by_type = {}
for region in page.detected_layout_regions:
    regions_by_type.setdefault(region.region_type, []).append(region)

# Print a summary of detected regions by type
for region_type, type_regions in regions_by_type.items():
    print(f" - {region_type}: {len(type_regions)} regions")

# You can highlight layout regions in two ways:
# 1. Using the dedicated highlight_layout method
# page.highlight_layout(regions, confidence=args.conf)

# 2. Using highlight_all with include_layout_regions=True
page.highlight_all(include_layout_regions=True, layout_confidence=args.conf)

# Demonstrate using selectors to find regions by type
print("\nSelecting regions by type:")
for region_type in regions_by_type:
    # Convert spaces to hyphens for selector syntax
    selector_type = region_type.lower().replace(' ', '-')
    selector = f"region[type={selector_type}]"

    found_regions = page.find_all(selector)
    print(f" - {selector}: {len(found_regions)} regions")

    # Extract text from the first region if available
    if found_regions:
        text = found_regions[0].extract_text()
        preview = text[:50] + "..." if len(text) > 50 else text
        print(f" First region text: {preview}")

# Finding high-confidence titles
high_conf_titles = page.find_all('region[type=title][confidence>=0.8]')
if high_conf_titles:
    print(f"\nFound {len(high_conf_titles)} high-confidence titles:")
    for i, title in enumerate(high_conf_titles):
        text = title.extract_text().strip()
        print(f" {i+1}. {text} (conf: {title.confidence:.2f})")

# Save the highlighted image
output_path = args.output or os.path.join(root_dir, "output", "layout_detection.png")
# A bare filename such as "--output out.png" has an empty dirname, and
# os.makedirs("") raises FileNotFoundError; only create a directory when the
# path actually names one.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
print(f"\nSaving highlighted layout to {output_path}")
page.to_image(path=output_path, show_labels=True)
print("Done!")

# Show an example of using a detected region for further analysis
if regions_by_type.get("table"):
    print("\nExample: Working with a detected table region")
    table_region = regions_by_type["table"][0]

    # Highlight the table region with a specific color
    table_region.highlight(label="Selected Table", color=(0, 1, 0, 0.3))

    # Find text elements within the table region
    table_text = table_region.find_all('text')
    print(f" Found {len(table_text)} text elements in the table")

    # Extract the table text
    table_content = table_region.extract_text()
    preview = table_content[:100] + "..." if len(table_content) > 100 else table_content
    print(f" Table content: {preview}")

    # Save the highlighted table next to the main output image
    table_output = os.path.join(output_dir, "detected_table.png")
    page.to_image(path=table_output, show_labels=True)
    print(f" Table highlighted image saved to {table_output}")
|
@@ -0,0 +1,185 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the Document QA capabilities of Natural PDF.
|
3
|
+
|
4
|
+
This example shows how to:
|
5
|
+
1. Ask questions to a PDF document
|
6
|
+
2. Ask questions to specific pages
|
7
|
+
3. Ask questions to specific regions
|
8
|
+
4. Control confidence thresholds
|
9
|
+
5. Highlight answer elements
|
10
|
+
6. Handle QA results
|
11
|
+
|
12
|
+
Requirements:
|
13
|
+
- transformers
|
14
|
+
- torch
|
15
|
+
"""
|
16
|
+
|
17
|
+
import os
|
18
|
+
import sys
|
19
|
+
import argparse
|
20
|
+
from PIL import Image, ImageDraw, ImageFont
|
21
|
+
import logging
|
22
|
+
from typing import Dict, Any
|
23
|
+
|
24
|
+
# Add parent directory to path to run without installing
|
25
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
26
|
+
|
27
|
+
from natural_pdf import PDF, configure_logging
|
28
|
+
|
29
|
+
def format_qa_result(result: Dict[str, Any]) -> str:
    """Format a QA result as a one-line, human-readable string.

    Args:
        result: QA result mapping. The keys read here are ``found``,
            ``message``, ``answer``, ``confidence`` and ``page_num``; all of
            them are optional.

    Returns:
        ``"No answer found. <message>"`` when ``found`` is missing or falsy,
        otherwise ``"Answer: <answer> (confidence: <c>, page: <n>)"`` with the
        confidence rendered to two decimals.
    """
    if not result.get("found", False):
        return f"No answer found. {result.get('message', '')}"

    answer = result.get("answer", "")
    # dict.get only applies its default when the key is absent; a key that is
    # present with a None value would otherwise break the ``:.2f`` format.
    confidence = result.get("confidence")
    if confidence is None:
        confidence = 0.0
    page_num = result.get("page_num", 0)

    return f"Answer: {answer} (confidence: {confidence:.2f}, page: {page_num})"
|
39
|
+
|
40
|
+
def _save_answer_visualization(page, question, result, index, annotation_suffix=""):
    """Highlight an answer's source elements and save an annotated image.

    Args:
        page: Page the question was asked against.
        question: The question text, echoed in the annotation.
        result: QA result; ``source_elements``, ``answer`` and ``confidence``
            are read from it.
        index: 1-based question number used in the output file name.
        annotation_suffix: Extra text appended after the confidence inside
            the parentheses of the annotation (e.g. ``", region: table"``).
    """
    highlight_image = page.duplicate()
    # NOTE(review): the highlights are applied to the result's source
    # elements, not to anything obtained from ``highlight_image`` -- confirm
    # that the duplicate actually reflects them.
    for element in result["source_elements"]:
        element.highlight(color=(1, 0.5, 0, 0.5))  # Orange highlight

    # Add question and answer as text annotation
    highlight_image.annotate_text(
        x=50, y=20,
        text=f"Q: {question}\nA: {result['answer']} (confidence: {result['confidence']:.2f}{annotation_suffix})",
        font_size=14,
        color=(0, 0, 0)
    )

    # Save the highlighted image
    output_path = f"output/document_qa_answer_{index}.png"
    highlight_image.save_image(output_path)
    print(f"Saved answer visualization to {output_path}")


def main():
    """Run the Document QA example from the command line.

    Parses the CLI options, opens the PDF and asks every question against the
    first page (or, with ``--region``, against each detected layout region of
    that page, reporting the highest-confidence answer). Afterwards the same
    questions are asked against the whole document. Images are written to the
    ``output`` directory relative to the current working directory.
    """
    parser = argparse.ArgumentParser(description="Document QA Example")
    parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
                        help="Path to PDF document")
    parser.add_argument("--questions", nargs="+",
                        default=["How many votes for Harris and Walz?",
                                 "How many votes for Trump and Vance?",
                                 "What precinct is this for?",
                                 "What state is this for?"],
                        help="Questions to ask")
    parser.add_argument("--highlight", action="store_true",
                        help="Highlight answer elements")
    parser.add_argument("--min-confidence", type=float, default=0.2,
                        help="Minimum confidence threshold (0.0-1.0)")
    parser.add_argument("--verbose", action="store_true",
                        help="Enable verbose output")
    parser.add_argument("--model", default="impira/layoutlm-document-qa",
                        help="Model to use (default: impira/layoutlm-document-qa)")
    parser.add_argument("--region", action="store_true",
                        help="Ask questions to specific regions instead of whole pages")

    args = parser.parse_args()

    # Configure logging
    log_level = logging.DEBUG if args.verbose else logging.INFO
    configure_logging(level=log_level)

    # Open the PDF
    pdf = PDF(args.pdf_path)
    page = pdf.pages[0]  # Use the first page for this example

    print(f"Document: {args.pdf_path}")
    print(f"Page count: {len(pdf.pages)}")
    print(f"Model: {args.model}")
    print(f"Minimum confidence: {args.min_confidence}")
    print()

    # Create output directory if not exists
    os.makedirs("output", exist_ok=True)

    # If using regions, detect document layout
    if args.region:
        print("Detecting document layout...")
        page.analyze_layout(confidence=0.3)
        regions = page.find_all('region')
        print(f"Found {len(regions)} regions")

        # Save an image with detected regions
        page.highlight_layout()
        page.save_image("output/document_qa_regions.png")
        print("Saved layout visualization to output/document_qa_regions.png")
        print()

    # Process each question
    for i, question in enumerate(args.questions):
        print(f"Question {i+1}: {question}")

        if args.region:
            # Ask each text-bearing region and keep only the answers found
            all_results = []
            for region in regions:
                if region.region_type in ['title', 'plain-text', 'table', 'list']:
                    result = region.ask(
                        question=question,
                        min_confidence=args.min_confidence,
                        model=args.model
                    )
                    if result.get("found", False):
                        all_results.append(result)

            # Sort by confidence
            all_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)

            if all_results:
                result = all_results[0]  # Use the highest confidence result
                print(format_qa_result(result))

                # Highlight the answer if requested
                if args.highlight and result.get("source_elements"):
                    region_type = result["region"].region_type if "region" in result else "unknown"
                    _save_answer_visualization(
                        page, question, result, i + 1,
                        annotation_suffix=f", region: {region_type}",
                    )
            else:
                print("No answer found in any region")
        else:
            # Ask the whole page
            result = page.ask(
                question=question,
                min_confidence=args.min_confidence,
                model=args.model
            )

            print(format_qa_result(result))

            # Highlight the answer if requested
            if args.highlight and result.get("found", False) and result.get("source_elements"):
                _save_answer_visualization(page, question, result, i + 1)

        print()

    # Try a different PDF approach - ask the whole document
    print("Asking questions to the whole document:")

    for i, question in enumerate(args.questions):
        print(f"Question {i+1}: {question}")

        result = pdf.ask(
            question=question,
            min_confidence=args.min_confidence,
            model=args.model
        )

        print(format_qa_result(result))
        print()


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,128 @@
|
|
1
|
+
"""
|
2
|
+
Debug script to compare element counts with different exclusion methods.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def debug_element_counts():
    """Compare text-element counts produced by different exclusion methods.

    Picks the first PDF found next to this script (or in the sibling ``pdfs``
    directory) and, on its first page, prints the number of text elements:

    1. with a region added directly as a page-level exclusion,
    2. with a PDF-level lambda exclusion that builds the same region,
    3. with a PDF-level callable returning a region with fixed coordinates.

    Exits with status 1 when no PDF, or no line element to anchor the
    exclusion region, can be found.
    """
    # Get PDF path - use a default if one isn't specified
    # Look for any PDF in the examples directory or pdfs directory
    example_dir = Path(__file__).parent
    pdf_files = list(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = list(pdfs_dir.glob("*.pdf"))

    if pdf_files:
        pdf_path = str(pdf_files[0])
    else:
        print("No PDF file found.")
        sys.exit(1)

    print(f"Using PDF: {pdf_path}")

    # Case 1: Direct page-level exclusion
    print("\n=== Case 1: Direct page-level exclusion ===")
    pdf1 = PDF(pdf_path)
    page1 = pdf1.pages[0]

    # First count without exclusions
    all_text_no_exclusions = page1.find_all('text')
    print(f"Before exclusion: {len(all_text_no_exclusions)} text elements")

    # Count the elements in the region to be excluded
    line1 = page1.find('line')
    if not line1:
        # Every case below anchors its exclusion on this line; without it the
        # script would otherwise die with an AttributeError on ``.above()``.
        print("No line element found on the first page; cannot build an exclusion region.")
        sys.exit(1)
    region_above = line1.above()
    elements_in_region = page1.find_all('text')
    excluded_count = sum(
        1 for element in elements_in_region
        if region_above._is_element_in_region(element)
    )
    print(f"Region above line contains {excluded_count} elements")

    # Now add the exclusion and count again
    page1.add_exclusion(region_above)
    all_text_with_exclusion = page1.find_all('text')
    print(f"After direct exclusion: {len(all_text_with_exclusion)} text elements")
    print(f"Elements excluded: {len(all_text_no_exclusions) - len(all_text_with_exclusion)}")

    # Debug the exclusion regions
    exclusion_regions = page1._get_exclusion_regions(include_callable=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}, x0={region.x0}, x1={region.x1}")

    # Case 2: PDF-level exclusion with lambda
    print("\n=== Case 2: PDF-level exclusion with lambda ===")
    pdf2 = PDF(pdf_path)

    # Add lambda exclusion at PDF level
    pdf2.add_exclusion(lambda page: page.find('line').above())
    page2 = pdf2.pages[0]

    # Count after exclusion
    all_text_with_lambda_exclusion = page2.find_all('text')
    print(f"After PDF-level exclusion: {len(all_text_with_lambda_exclusion)} text elements")

    # Debug the exclusion regions
    print("\nExclusion regions from PDF-level lambda:")
    exclusion_regions = page2._get_exclusion_regions(include_callable=True, debug=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}, x0={region.x0}, x1={region.x1}")

    # Compare results
    print("\n=== Comparison ===")
    print(f"Direct page exclusion count: {len(all_text_with_exclusion)}")
    print(f"PDF-level lambda exclusion count: {len(all_text_with_lambda_exclusion)}")

    # Examine if the region generated by the lambda is identical to the direct region
    if len(exclusion_regions) > 0:
        direct_region = region_above
        lambda_region = exclusion_regions[0]

        print("\nRegion comparison:")
        print(f"Direct region: top={direct_region.top}, bottom={direct_region.bottom}, x0={direct_region.x0}, x1={direct_region.x1}")
        print(f"Lambda region: top={lambda_region.top}, bottom={lambda_region.bottom}, x0={lambda_region.x0}, x1={lambda_region.x1}")

        # Check if regions are identical
        regions_identical = (
            direct_region.top == lambda_region.top and
            direct_region.bottom == lambda_region.bottom and
            direct_region.x0 == lambda_region.x0 and
            direct_region.x1 == lambda_region.x1
        )
        print(f"Regions are identical: {regions_identical}")

    # Case 3: Modified lambda approach - create a lambda that exactly reproduces the region
    print("\n=== Case 3: Explicit region lambda ===")
    pdf3 = PDF(pdf_path)

    # Get the exact coordinates from the first run
    line3 = pdf1.pages[0].find('line')
    region3 = line3.above()

    # Create a lambda that returns a fixed region with those coordinates
    def fixed_region_lambda(page):
        return page.create_region(region3.x0, region3.top, region3.x1, region3.bottom)

    pdf3.add_exclusion(fixed_region_lambda)
    page3 = pdf3.pages[0]

    # Count with this explicit region lambda
    all_text_with_explicit_lambda = page3.find_all('text')
    print(f"With explicit region lambda: {len(all_text_with_explicit_lambda)} text elements")


if __name__ == "__main__":
    debug_element_counts()
|
@@ -0,0 +1,107 @@
|
|
1
|
+
"""
|
2
|
+
Example to debug exclusion issues with highlighting.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def debug_exclusions():
    """Debug exclusion problems by rendering highlights under each approach.

    Picks the first PDF found next to this script (or in the sibling ``pdfs``
    directory) and writes comparison images into a ``debug_output`` directory
    beside the script: the first page with no exclusion, with a direct
    page-level exclusion, with a PDF-level lambda exclusion, and with
    ``find_all(..., apply_exclusions=True)`` versus ``highlight_all``.

    Exits with status 1 when no PDF, or no line element to anchor the
    exclusion region, can be found.
    """
    # Get PDF path - use a default if one isn't specified
    # Look for any PDF in the examples directory or pdfs directory
    example_dir = Path(__file__).parent
    pdf_files = list(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = list(pdfs_dir.glob("*.pdf"))

    if pdf_files:
        pdf_path = str(pdf_files[0])
    else:
        print("No PDF file found. Please provide a path to a PDF file.")
        sys.exit(1)

    print(f"Using PDF: {pdf_path}")

    # Case 1: Direct page exclusion - expected to work
    print("\n=== Case 1: Direct page exclusion ===")
    pdf1 = PDF(pdf_path)
    page1 = pdf1.pages[0]

    # Create a debug output directory
    output_dir = Path(__file__).parent / "debug_output"
    output_dir.mkdir(exist_ok=True)

    # First, save without exclusions for comparison
    page1.highlight_all()
    page1.save(str(output_dir / "case1_no_exclusion.png"), labels=True)
    page1.clear_highlights()

    # Log exclusions we're adding
    line1 = page1.find('line')
    if not line1:
        # Every case below anchors its exclusion on this line; without it the
        # script would otherwise die with an AttributeError on ``.top``.
        print("No line element found on the first page; cannot build an exclusion region.")
        sys.exit(1)
    print(f"Adding exclusion for region above line at {line1.top}")

    # Add exclusion directly to page
    page1.add_exclusion(line1.above())

    # Show all exclusion regions
    exclusion_regions = page1._get_exclusion_regions(include_callable=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}")

    # Apply highlight with exclusions
    page1.highlight_all(apply_exclusions=True)
    page1.save(str(output_dir / "case1_with_exclusion.png"), labels=True)

    # Case 2: PDF-level exclusion - not working correctly
    print("\n=== Case 2: PDF-level exclusion ===")
    pdf2 = PDF(pdf_path)

    # This should work exactly the same as Case 1
    pdf2.add_exclusion(lambda page: page.find('line').above())
    page2 = pdf2.pages[0]

    # Show all exclusion regions for comparison
    exclusion_regions = page2._get_exclusion_regions(include_callable=True, debug=True)
    print(f"Found {len(exclusion_regions)} exclusion regions")
    for i, region in enumerate(exclusion_regions):
        print(f" Region {i+1}: top={region.top}, bottom={region.bottom}")

    # Save highlighting result
    page2.highlight_all(apply_exclusions=True)
    page2.save(str(output_dir / "case2_with_exclusion.png"), labels=True)

    # Case 3: Using find_all with exclusions - for comparison
    print("\n=== Case 3: Using find_all with exclusions ===")
    pdf3 = PDF(pdf_path)
    pdf3.add_exclusion(lambda page: page.find('line').above())
    page3 = pdf3.pages[0]

    # Check what find_all returns with exclusions
    all_text = page3.find_all('text', apply_exclusions=True)
    print(f"find_all('text') returns {len(all_text)} elements with exclusions")

    # Highlight just those elements
    all_text.highlight(label="Text with exclusions")
    page3.save(str(output_dir / "case3_find_all_with_exclusion.png"), labels=True)

    # Compare to highlight_all
    page3.clear_highlights()
    page3.highlight_all(apply_exclusions=True)
    page3.save(str(output_dir / "case3_highlight_all.png"), labels=True)

    print(f"\nResults saved to {output_dir}")


if __name__ == "__main__":
    debug_exclusions()
|
@@ -0,0 +1,150 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating how to use exclusion zones in Natural PDF.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def example_page_level_exclusion(pdf_path):
    """
    Walk through page-level exclusion zones on the first page of a PDF.

    Prints the page text four times: untouched, after excluding everything
    above the "Summary" text, after additionally excluding everything below
    the last line element, and finally with exclusions switched off.
    """
    with PDF(pdf_path) as pdf:
        first_page = pdf.pages[0]

        # Baseline: the full text, for comparison
        print("\n--- Original Text ---")
        print(first_page.extract_text())

        # First exclusion: everything above the heading
        print("\n--- After Excluding Header ---")
        heading = first_page.find('text:contains("Summary")')
        if not heading:
            print("Header not found. Try with a different selector.")
        else:
            # Register the exclusion, then extract with it applied
            first_page.add_exclusion(heading.above())
            print(first_page.extract_text())

        # Second exclusion: everything below the last line
        print("\n--- After Excluding Header and Footer ---")
        rules = first_page.find_all('line')
        if rules and len(rules) > 0:
            # Prefer the collection's own ``last`` accessor when it has one
            if hasattr(rules, 'last'):
                final_rule = rules.last
            else:
                final_rule = rules[-1]
            first_page.add_exclusion(final_rule.below())
            print(first_page.extract_text())
        else:
            print("Line not found. Try with a different selector.")

        # Exclusions can be bypassed per call when needed
        print("\n--- With Exclusions Disabled ---")
        print(first_page.extract_text(apply_exclusions=False))
|
50
|
+
|
51
|
+
|
52
|
+
def example_pdf_level_exclusion(pdf_path):
    """
    Walk through PDF-level exclusion zones defined by per-page callables.

    Registers a header and a footer exclusion callable on the document, then
    prints text extracted from the first page and from the whole document
    with exclusions applied, with debug output, and with exclusions disabled.
    """
    with PDF(pdf_path) as pdf:
        # Baseline: raw text of the first page
        print("\n=== Original Text from First Page ===")
        raw_first_page = pdf.pages[0].extract_text(apply_exclusions=False)
        print(raw_first_page[:200] + "...")

        # Exclusion callables report problems and return None instead of raising
        def header_exclusion(page):
            try:
                anchor = page.find('text:contains("Page")')
                if not anchor:
                    print(f"Page {page.index}: No 'Page' text found for header exclusion")
                    return None
                return anchor.above()
            except Exception as e:
                print(f"ERROR in header exclusion for page {page.index}: {e}")
                return None

        def footer_exclusion(page):
            try:
                rules = page.find_all('line')
                if not (rules and len(rules) > 0):
                    print(f"Page {page.index}: No lines found for footer exclusion")
                    return None
                return rules[-1].below()
            except Exception as e:
                print(f"ERROR in footer exclusion for page {page.index}: {e}")
                return None

        # Document-wide exclusions:
        # 1. headers - everything above the text containing "Page"
        pdf.add_exclusion(header_exclusion, label="headers")

        # 2. footers - everything below the last line
        pdf.add_exclusion(footer_exclusion, label="footers")

        # First page again, now with the exclusions in effect
        print("\n=== Cleaned Text from First Page ===")
        cleaned_first_page = pdf.pages[0].extract_text()
        print(cleaned_first_page[:200] + "...")

        # Whole document with exclusions applied and detailed debugging enabled
        print("\n=== Extracting from Entire Document with Exclusions ===")
        print("\n--- DETAILED DEBUG INFO ---")
        full_text = pdf.extract_text(debug_exclusions=True)
        print("--- END OF DEBUG INFO ---\n")

        print(f"Extracted {len(full_text)} characters with exclusions applied")
        print(full_text[:200] + "...")

        # Same extraction without the debug output, for comparison
        print("\n=== Regular Extraction Without Debug Info ===")
        full_text_no_debug = pdf.extract_text()
        print(f"Extracted {len(full_text_no_debug)} characters without debug output")

        # Whole document with exclusions disabled, for comparison
        print("\n=== Extracting with Exclusions Disabled (for comparison) ===")
        full_text_no_exclusions = pdf.extract_text(apply_exclusions=False)
        print(f"Extracted {len(full_text_no_exclusions)} characters with exclusions disabled")
        excluded_chars = len(full_text_no_exclusions) - len(full_text)
        if len(full_text) != len(full_text_no_exclusions):
            print(f"Difference: {excluded_chars} characters were excluded")
|
115
|
+
|
116
|
+
|
117
|
+
def main():
    """Entry point: pick a PDF and run both exclusion examples on it.

    The PDF is the first command-line argument when one is given; otherwise
    the first ``*.pdf`` beside this script, falling back to the sibling
    ``pdfs`` directory. Exits with status 1 when no PDF can be found.
    """
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
    else:
        # No argument given: search the examples directory, then ../pdfs
        here = Path(__file__).parent
        candidates = list(here.glob("*.pdf"))

        if not candidates:
            fallback_dir = here.parent / "pdfs"
            if fallback_dir.exists():
                candidates = list(fallback_dir.glob("*.pdf"))

        if not candidates:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)
        pdf_path = str(candidates[0])

    print(f"Using PDF: {pdf_path}")

    # Page-level example first
    print("\n=== Page-Level Exclusion Example ===")
    example_page_level_exclusion(pdf_path)

    # Then the PDF-level example
    print("\n=== PDF-Level Exclusion Example ===")
    example_pdf_level_exclusion(pdf_path)


if __name__ == "__main__":
    main()
|