natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,155 @@
|
|
1
|
+
"""
|
2
|
+
Table structure detection example using Table Transformer.
|
3
|
+
|
4
|
+
This example demonstrates how to use the Table Transformer (TATR)
|
5
|
+
to detect tables and their structure in PDF documents.
|
6
|
+
|
7
|
+
Note: This example requires additional dependencies:
|
8
|
+
- torch
|
9
|
+
- torchvision
|
10
|
+
- transformers
|
11
|
+
|
12
|
+
These will be automatically installed when you install natural-pdf.
|
13
|
+
"""
|
14
|
+
import os
|
15
|
+
from natural_pdf import PDF
|
16
|
+
|
17
|
+
# Resolve the project root (the parent of this script's directory) and derive
# the input PDF and output directory from it.
examples_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.dirname(examples_dir)
pdf_path = os.path.join(project_root, "pdfs", "01-practice.pdf")
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

print(f"Analyzing table structure in: {pdf_path}")

# Only the first page of the practice PDF is analysed.
pdf = PDF(pdf_path)
page = pdf.pages[0]

# Pass 1: YOLO layout detection with the table-related classes excluded.
print("Running YOLO layout analysis first (excluding tables)...")
yolo_excluded_classes = ["table", "table_caption", "table_footnote"]
page.analyze_layout(model="yolo", confidence=0.3, exclude_classes=yolo_excluded_classes)

print(f"Found {len(page.detected_layout_regions)} general layout regions")

# Pass 2: Table Transformer detection, requested with existing="append".
print("Now running Table Transformer detection...")
page.analyze_layout(model="tatr", confidence=0.4, existing="append")

print(f"Found {len(page.detected_layout_regions)} total regions (including table structure)")

# Detect, highlight and render in a single chained expression.
print("\nDemonstrating method chaining for layout analysis and highlighting:")
chained_image_path = os.path.join(output_dir, "chained_analysis.png")
(
    page.clear_highlights()
    .analyze_layout(model="tatr", confidence=0.3)
    .highlight_layout()
    .to_image(path=chained_image_path, show_labels=True)
)
print("Created highlighted image with method chaining")

# Bucket the detected regions by their region_type.
regions_by_type = {}
for detected in page.detected_layout_regions:
    regions_by_type.setdefault(detected.region_type, []).append(detected)

# One summary line per type; the model name is read from the first region of
# each bucket and falls back to "unknown" when the attribute is missing.
print("\nAll detected regions:")
for kind, members in regions_by_type.items():
    source_model = getattr(members[0], 'model', "unknown")
    print(f" - {kind} ({source_model}): {len(members)} regions")

# Render every detected region.
output_path = os.path.join(output_dir, "all_detected_regions.png")
page.clear_highlights().highlight_layout().to_image(path=output_path, show_labels=True)
print(f"\nSaved combined layout visualization to {output_path}")

# Render only the regions selected by model=yolo.
output_path = os.path.join(output_dir, "yolo_regions.png")
yolo_regions = page.clear_highlights().find_all('region[model=yolo]')
yolo_regions.highlight(label="YOLO Regions")
page.to_image(path=output_path, show_labels=True)
print(f"Saved YOLO layout visualization to {output_path}")

# Render only the regions selected by model=tatr.
output_path = os.path.join(output_dir, "table_structure.png")
tatr_regions = page.clear_highlights().find_all('region[model=tatr]')
tatr_regions.highlight(label="Table Structure")
page.to_image(path=output_path, show_labels=True)
print(f"Saved Table Transformer visualization to {output_path}")
|
97
|
+
|
98
|
+
# Inspect the first detected table: report its details, visualise its
# structure, and extract its text.
tables = page.find_all('region[type=table]')
if tables:
    print(f"\nFound {len(tables)} tables")

    # Only the first table is examined in detail.
    table = tables[0]
    print("Table details:")
    print(f" Confidence: {table.confidence:.2f}")
    print(f" Bounding box: {table.bbox}")

    # Structure regions are looked up on the whole page (not restricted to
    # `table`). The selectors use the hyphenated form of the original class
    # names, which contain spaces.
    rows = page.find_all('region[type=table-row]')
    columns = page.find_all('region[type=table-column]')
    headers = page.find_all('region[type=table-column-header]')

    print(f" Structure: {len(rows)} rows, {len(columns)} columns, {len(headers)} headers")

    # Build the (possibly truncated) preview first. Putting the conditional
    # expression directly inside print() made it apply to the whole argument,
    # so short text was printed without its " Content preview:" prefix.
    table_text = table.extract_text()
    preview = f"{table_text[:150]}..." if len(table_text) > 150 else table_text
    print(f" Content preview: {preview}")

    # Re-highlight from scratch with one colour per structure kind.
    page.clear_highlights()

    table.highlight(label="Table", color=(1, 0, 0, 0.3))

    for row in rows:
        row.highlight(label="Row", color=(0, 1, 0, 0.3))
    for column in columns:
        column.highlight(label="Column", color=(0, 0, 1, 0.3))
    for header in headers:
        header.highlight(label="Header", color=(0, 1, 1, 0.3))

    output_path = os.path.join(output_dir, "table_structure_detail.png")
    page.to_image(path=output_path, show_labels=True)
    print(f" Saved detailed table structure visualization to {output_path}")

    # Text elements found via the table region itself.
    print("\nExtracting text from table cells:")
    table_text_elements = table.find_all('text')
    print(f" Found {len(table_text_elements)} text elements in the table")

    # Show at most the first five elements.
    for i, elem in enumerate(table_text_elements[:5]):
        print(f" Text {i+1}: '{elem.text}'")

    # Header regions support text extraction as well; show the first one.
    if headers:
        header = headers[0]
        header_text = header.extract_text()
        print(f"\nHeader text: {header_text}")
else:
    print("\nNo tables detected on this page")
|
@@ -0,0 +1,56 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
# Add parent directory to path for imports
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
7
|
+
|
8
|
+
from natural_pdf import PDF
|
9
|
+
|
10
|
+
# Locate the sample PDF under <project root>/pdfs, where the project root is
# the parent of this script's directory.
examples_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.dirname(examples_dir)
pdf_path = os.path.join(project_root, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)

# Rendered images go to <project root>/output, created on demand.
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

# The test runs against the page at index 6.
page = pdf.pages[6]

# Test 1: run TATR layout analysis with create_cells=True.
print("\n-- Testing layout detection with cell creation --")
regions = page.analyze_layout(model='tatr', create_cells=True)

# Query the page for the table and table-cell regions attributed to TATR.
tables = page.find_all('region[type=table][model=tatr]')
cells = page.find_all('region[type=table-cell][model=tatr]')

print(f"Found {len(tables)} tables")
print(f"Found {len(cells)} table cells")

# Test 2: ask the first table to create its cells explicitly.
if tables:
    print("\n-- Testing explicit cell creation from a table --")
    table = tables[0]
    explicit_cells = table.create_cells()
    print(f"Created {len(explicit_cells)} cells explicitly")

    # Label and highlight at most the first five cells, then the table itself.
    for number, cell in enumerate(explicit_cells[:5], start=1):
        cell.highlight(label=f"Cell {number}", color=(255, 0, 0, 50))

    table.highlight(label="Table", color=(0, 0, 255, 50))

    # NOTE(review): indentation was lost in the diff this was recovered from;
    # saving is assumed to happen only when a table was found - confirm.
    output_path = os.path.join(output_dir, "tatr_cells_test.png")
    print(f"\nSaving highlighted image to: {output_path}")
    page.to_image(path=output_path, show_labels=True)

print("\nTest completed successfully!")
|
@@ -0,0 +1,94 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
import pandas as pd
|
5
|
+
|
6
|
+
# Add parent directory to path for imports
|
7
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
8
|
+
|
9
|
+
from natural_pdf import PDF
|
10
|
+
|
11
|
+
# Locate the sample PDF under <project root>/pdfs, where the project root is
# the parent of this script's directory.
examples_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.dirname(examples_dir)
pdf_path = os.path.join(project_root, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)

# Rendered images go to <project root>/output, created on demand.
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

# Index 3 into pdf.pages; with zero-based indexing that is the fourth page.
page = pdf.pages[3]

# Run TATR layout analysis so table regions become queryable.
print("\n-- Running layout analysis to find tables --")
regions = page.analyze_layout(model='tatr')

# Work with the first TATR table only; exit with status 1 when there is none.
table = page.find('region[type=table][model=tatr]')
if not table:
    print("No tables found.")
    sys.exit(1)

print(f"Found table at coordinates: {table.bbox}")

# Fetch the page-wide row, column and column-header regions from TATR.
rows, columns, headers = (
    page.find_all(f'region[type={structure_kind}][model=tatr]')
    for structure_kind in ('table-row', 'table-column', 'table-column-header')
)
|
42
|
+
|
43
|
+
# Filter to elements that are part of this table
|
44
|
+
def is_in_table(region, table):
    """Return True when the centre point of ``region`` lies inside ``table``.

    Both arguments only need ``x0``, ``x1``, ``top`` and ``bottom``
    attributes. A centre lying exactly on one of the table's edges counts as
    inside.
    """
    center_x = (region.x0 + region.x1) / 2
    center_y = (region.top + region.bottom) / 2
    # Reject on the horizontal axis first, then decide on the vertical one.
    if not (table.x0 <= center_x <= table.x1):
        return False
    return table.top <= center_y <= table.bottom
|
49
|
+
|
50
|
+
# Keep only the structure regions whose centre point falls inside the table.
table_rows = [row for row in rows if is_in_table(row, table)]
table_columns = [column for column in columns if is_in_table(column, table)]
table_headers = [header for header in headers if is_in_table(header, table)]

print(f"Table has {len(table_rows)} rows, {len(table_columns)} columns, and {len(table_headers)} headers")

# Ask the table region to build its cells.
cells = table.create_cells()
print(f"Created {len(cells)} cells")

# Debug pass: OCR each of the first 50 cells (at most) with very permissive
# settings, then compare against the regular text extraction.
print("\n-- Testing OCR on individual cells --")
if cells:
    for cell in cells[:50]:
        ocr_elements = cell.apply_ocr(
            enabled=True,
            min_confidence=0.01,
            detection_params={
                "text_threshold": 0.001,  # far below the usual 0.7, to detect more text
                "mag_ratio": 4.0,  # magnification ratio of 4 during detection
                "link_threshold": 1,
            },
            recognition_params={
                "min_size": 6,
            },
        )
        if ocr_elements:
            print(f" OCR detected {len(ocr_elements)} text elements:")
            for elem in ocr_elements:
                print(f" '{elem.text}' (conf: {elem.confidence:.2f})")

        # Regular extraction, reported only when it is non-empty after stripping.
        text = cell.extract_text().strip()
        if text:
            print(f" Regular extraction: '{text}'")

print("\nTest completed successfully!")
|
@@ -0,0 +1,122 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating enhanced text search capabilities in Natural PDF.
|
3
|
+
|
4
|
+
This showcases:
|
5
|
+
1. Multi-word searching with keep_spaces enabled (default)
|
6
|
+
2. Case-insensitive and case-sensitive searching
|
7
|
+
3. Regular expression searching
|
8
|
+
4. Turning off keep_spaces to see the difference
|
9
|
+
"""
|
10
|
+
|
11
|
+
import os
|
12
|
+
import sys
|
13
|
+
import argparse
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
# Add parent directory to path for running the example
|
17
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
18
|
+
|
19
|
+
from natural_pdf import PDF, configure_logging
|
20
|
+
import logging
|
21
|
+
|
22
|
+
|
23
|
+
def _report_matches(results, description, color):
    """Print each match's text and highlight it with a numbered label."""
    for number, match in enumerate(results, start=1):
        print(f" Result {number}: '{match.text}'")
        match.highlight(label=f"Match {number}: {description}", color=color)


def main(pdf_path=None):
    """Run the text-search demo on the first page of a PDF.

    Performs a case-insensitive phrase search, a case-sensitive phrase search
    and a regex search, highlights every match, saves the highlighted page as
    ``text_search_results.png`` in ``../output`` relative to this file, and
    then repeats the phrase and regex searches on a second ``PDF`` opened with
    ``keep_spaces=False``.

    Args:
        pdf_path: Path of the PDF to analyse. When falsy, the bundled
            ``../pdfs/2019 Statistics.pdf`` is used.

    Returns:
        The ``PDF`` object opened with default settings.
    """
    here = os.path.dirname(__file__)

    # Fall back to the bundled sample when no path was supplied.
    if not pdf_path:
        pdf_path = os.path.join(here, '..', 'pdfs', '2019 Statistics.pdf')

    print(f"Using PDF: {pdf_path}")
    print("-" * 50)

    # Default settings (the original example notes keep_spaces=True is the default).
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    print(f"Page dimensions: {page.width} x {page.height}")

    # 1. Multi-word phrase search, case-insensitive.
    print("\nMulti-word search with keep_spaces=True (default):")
    print("-" * 50)

    results = page.find_all('text:contains("annual report")', case=False)
    print(f"Found {len(results)} results for 'annual report' (case-insensitive)")
    _report_matches(results, "'annual report'", (1, 0.7, 0, 0.3))

    # 2. Case-sensitive search.
    print("\nCase-sensitive search:")
    print("-" * 50)

    results = page.find_all('text:contains("Annual Report")', case=True)
    print(f"Found {len(results)} results for 'Annual Report' (case-sensitive)")
    _report_matches(results, "'Annual Report'", (0, 0.7, 1, 0.3))

    # 3. Regular expression search.
    print("\nRegular expression search:")
    print("-" * 50)

    pattern = r"report\s+\d{4}"  # "report" followed by whitespace and 4 digits
    results = page.find_all(f'text:contains("{pattern}")', regex=True, case=False)
    print(f"Found {len(results)} results for regex pattern '{pattern}'")
    _report_matches(results, f"regex '{pattern}'", (0, 1, 0, 0.3))

    # Save the highlighted page. The output directory is created first so the
    # save does not depend on it already existing; the path is normalised
    # because os.makedirs is documented to mishandle '..' components.
    output_path = os.path.join(here, '..', 'output', 'text_search_results.png')
    os.makedirs(os.path.abspath(os.path.dirname(output_path)), exist_ok=True)
    # NOTE(review): sibling examples call page.to_image(path=..., show_labels=True);
    # confirm save_image(..., labels=True) is the intended API here.
    page.save_image(output_path, labels=True)
    print(f"\nSaved highlighted results to: {output_path}")

    # 4. Same searches against a PDF opened with keep_spaces=False.
    print("\nComparing with keep_spaces=False (legacy behavior):")
    print("-" * 50)

    pdf_legacy = PDF(pdf_path, keep_spaces=False)
    page_legacy = pdf_legacy.pages[0]

    results_legacy = page_legacy.find_all('text:contains("annual report")', case=False)
    print(f"Found {len(results_legacy)} results for 'annual report' (case-insensitive)")

    pattern = r"annual\s+report"  # "annual" followed by whitespace and "report"
    regex_results = page_legacy.find_all(f'text:contains("{pattern}")', regex=True, case=False)
    print(f"With regex '{pattern}': Found {len(regex_results)} results")

    # Closing summary.
    print("\nConclusion:")
    print("-" * 50)
    print("1. With keep_spaces=True (default):")
    print(" - Multi-word phrases can be found directly with :contains()")
    print(" - Text maintains its natural spacing within word elements")
    print("\n2. With keep_spaces=False (legacy):")
    print(" - Words are split at spaces, making multi-word search less effective")
    print(" - Regular expressions with \\s patterns can help bridge words")

    return pdf
|
108
|
+
|
109
|
+
|
110
|
+
if __name__ == "__main__":
    # Command line: an optional --pdf path and a --verbose switch.
    cli = argparse.ArgumentParser(description="Demonstrate Natural PDF's enhanced text search capabilities")
    cli.add_argument("--pdf", help="Path to a PDF file to analyze")
    cli.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    args = cli.parse_args()

    # DEBUG logging when --verbose was given, INFO otherwise.
    configure_logging(level=logging.DEBUG if args.verbose else logging.INFO)

    # Run the example and keep the returned PDF in a module-level name.
    pdf = main(args.pdf)
|
@@ -0,0 +1,110 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the text style analysis feature of natural-pdf.
|
3
|
+
"""
|
4
|
+
import os
|
5
|
+
import sys
|
6
|
+
|
7
|
+
# Add the parent directory to the path to import the package
|
8
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
9
|
+
|
10
|
+
from natural_pdf import PDF
|
11
|
+
|
12
|
+
def text_style_example(pdf_path):
    """Run the three text-style demos on the first page of ``pdf_path``.

    Prints the groups returned by ``page.analyze_text_styles()`` with one
    sample element each, then writes two highlighted renderings
    (``text_styles.png`` and ``highlight_all_styles.png``) into ``../output``
    relative to this file.
    """
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages")

        # Images are written to ../output, created on demand.
        output_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'output'))
        os.makedirs(output_dir, exist_ok=True)

        # EXAMPLE 1: group the page's text by style and describe each group.
        print("\nEXAMPLE 1: Analyzing text styles")
        print("-" * 60)

        styles = page.analyze_text_styles()

        print("Text style analysis results:")
        for label, elements in styles.items():
            print(f"- {label}: {len(elements)} elements")

            # Nothing to sample from an empty group.
            if len(elements) == 0:
                continue

            sample = elements[0]
            size = getattr(sample, 'size', 'N/A')
            font = getattr(sample, 'fontname', 'N/A')

            # Bold and italic are guessed purely from the font name.
            is_bold = is_italic = False
            if hasattr(sample, 'fontname') and sample.fontname:
                raw_name = sample.fontname
                lowered = raw_name.lower()
                is_bold = ('bold' in lowered or 'black' in lowered or
                           raw_name.endswith('-B'))
                is_italic = ('italic' in lowered or 'oblique' in lowered or
                             raw_name.endswith('-I'))

            traits = [
                trait
                for trait, present in (("bold", is_bold), ("italic", is_italic))
                if present
            ]
            style_text = ", ".join(traits) or "regular"

            print(f" Sample: '{sample.text}' (size={size}, {style_text}, font={font})")

        # EXAMPLE 2: highlight the styles and render the page with labels.
        print("\nEXAMPLE 2: Visualizing text styles")
        print("-" * 60)

        page.highlight_text_styles()

        output_file = os.path.join(output_dir, "text_styles.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved text style visualization to: {output_file}")

        # Start the next example from a clean page.
        page.clear_highlights()

        # EXAMPLE 3: highlight_all with text styles included.
        print("\nEXAMPLE 3: Using highlight_all with text styles")
        print("-" * 60)

        page.highlight_all(include_text_styles=True)

        output_file = os.path.join(output_dir, "highlight_all_styles.png")
        page.to_image(path=output_file, show_labels=True)
        print(f"Saved highlight_all with text styles to: {output_file}")

        print("\nEnd of text style demonstration.")
|
92
|
+
|
93
|
+
if __name__ == "__main__":
    # An explicit path on the command line wins; otherwise use the practice
    # PDF from ../pdfs. Either way the file must exist before the demo runs.
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = cli_args[0]
        missing_file_messages = [f"File not found: {pdf_path}"]
    else:
        pdf_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', 'pdfs', '01-practice.pdf'))
        missing_file_messages = [
            "Example PDF not found. Please provide a path to a PDF file.",
            "Usage: python text_style_example.py [path/to/file.pdf]",
        ]

    if not os.path.exists(pdf_path):
        for message in missing_file_messages:
            print(message)
        sys.exit(1)

    text_style_example(pdf_path)
|
examples/tiny-text.py
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
from pathlib import Path
|
4
|
+
|
5
|
+
# Add parent directory to path for imports
|
6
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
7
|
+
|
8
|
+
from natural_pdf import PDF
|
9
|
+
|
10
|
+
# Locate the sample PDF under <project root>/pdfs, where the project root is
# the parent of this script's directory.
examples_dir = os.path.dirname(os.path.realpath(__file__))
project_root = os.path.dirname(examples_dir)
pdf_path = os.path.join(project_root, "pdfs", "HARRY ROQUE_redacted.pdf")

print(f"Loading PDF: {pdf_path}")

# OCR settings aimed at very small text: near-zero text threshold, a
# magnification ratio of 3, a 5000 canvas size and a small minimum size.
ocr_settings = {
    "enabled": True,
    "engine": "easyocr",
    "languages": ["en"],
    "detection_params": {
        "text_threshold": 0.001,
        "mag_ratio": 3.0,  # magnification ratio of 3 during detection
        "canvas_size": 5000,
    },
    "recognition_params": {
        "min_size": 4,
        "contrast_ths": 0.5,
    },
}
pdf = PDF(pdf_path, ocr=ocr_settings)

# Rendered images go to <project root>/output, created on demand.
output_dir = os.path.join(project_root, "output")
os.makedirs(output_dir, exist_ok=True)

# Work on the page at index 6 and run TATR layout analysis on it.
page = pdf.pages[6]
regions = page.analyze_layout(model='tatr')

print(f"Found {len(regions)} regions")

# Apply OCR explicitly to the page.
ocr_elements = page.apply_ocr()
print(f"Found {len(ocr_elements)} OCR elements")

# List at most the first 30 OCR elements, numbered from 1.
print("\nSample OCR elements:")
for number, elem in enumerate(ocr_elements[:30], start=1):
    print(f"{number}. {elem}")

# Highlight every OCR element, labelled with its confidence.
print("\nHighlighting OCR elements...")
for elem in ocr_elements:
    elem.highlight(label=f"OCR ({elem.confidence:.2f})")

output_path = os.path.join(output_dir, "ocr_highlight_all_test.png")
print(f"Saving highlight_all image to: {output_path}")
page.to_image(path=output_path, show_labels=True)

print("\nTest completed successfully!")
|