natural-pdf 25.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/__init__.py +3 -0
- examples/another_exclusion_example.py +20 -0
- examples/basic_usage.py +190 -0
- examples/boundary_exclusion_test.py +137 -0
- examples/boundary_inclusion_fix_test.py +157 -0
- examples/chainable_layout_example.py +70 -0
- examples/color_basic_test.py +49 -0
- examples/color_name_example.py +71 -0
- examples/color_test.py +62 -0
- examples/debug_ocr.py +91 -0
- examples/direct_ocr_test.py +148 -0
- examples/direct_paddle_test.py +99 -0
- examples/direct_qa_example.py +165 -0
- examples/document_layout_analysis.py +123 -0
- examples/document_qa_example.py +185 -0
- examples/exclusion_count_debug.py +128 -0
- examples/exclusion_debug.py +107 -0
- examples/exclusion_example.py +150 -0
- examples/exclusion_optimization_example.py +190 -0
- examples/extract_text_test.py +128 -0
- examples/font_aware_example.py +101 -0
- examples/font_variant_example.py +124 -0
- examples/footer_overlap_test.py +124 -0
- examples/highlight_all_example.py +82 -0
- examples/highlight_attributes_test.py +114 -0
- examples/highlight_confidence_display.py +122 -0
- examples/highlight_demo.py +110 -0
- examples/highlight_float_test.py +71 -0
- examples/highlight_test.py +147 -0
- examples/highlighting_example.py +123 -0
- examples/image_width_example.py +84 -0
- examples/improved_api_example.py +128 -0
- examples/layout_confidence_display_test.py +65 -0
- examples/layout_confidence_test.py +82 -0
- examples/layout_coordinate_debug.py +258 -0
- examples/layout_highlight_test.py +77 -0
- examples/logging_example.py +70 -0
- examples/ocr_comprehensive.py +193 -0
- examples/ocr_debug_example.py +87 -0
- examples/ocr_default_test.py +97 -0
- examples/ocr_engine_comparison.py +235 -0
- examples/ocr_example.py +89 -0
- examples/ocr_simplified_params.py +79 -0
- examples/ocr_visualization.py +102 -0
- examples/ocr_visualization_test.py +121 -0
- examples/paddle_layout_example.py +315 -0
- examples/paddle_layout_simple.py +74 -0
- examples/paddleocr_example.py +224 -0
- examples/page_collection_example.py +103 -0
- examples/polygon_highlight_example.py +83 -0
- examples/position_methods_example.py +134 -0
- examples/region_boundary_test.py +73 -0
- examples/region_exclusion_test.py +149 -0
- examples/region_expand_example.py +109 -0
- examples/region_image_example.py +116 -0
- examples/region_ocr_test.py +119 -0
- examples/region_sections_example.py +115 -0
- examples/school_books.py +49 -0
- examples/school_books_all.py +52 -0
- examples/scouring.py +36 -0
- examples/section_extraction_example.py +232 -0
- examples/simple_document_qa.py +97 -0
- examples/spatial_navigation_example.py +108 -0
- examples/table_extraction_example.py +135 -0
- examples/table_structure_detection.py +155 -0
- examples/tatr_cells_test.py +56 -0
- examples/tatr_ocr_table_test.py +94 -0
- examples/text_search_example.py +122 -0
- examples/text_style_example.py +110 -0
- examples/tiny-text.py +61 -0
- examples/until_boundaries_example.py +156 -0
- examples/until_example.py +112 -0
- examples/very_basics.py +15 -0
- natural_pdf/__init__.py +55 -0
- natural_pdf/analyzers/__init__.py +9 -0
- natural_pdf/analyzers/document_layout.py +736 -0
- natural_pdf/analyzers/text_structure.py +153 -0
- natural_pdf/core/__init__.py +3 -0
- natural_pdf/core/page.py +2376 -0
- natural_pdf/core/pdf.py +572 -0
- natural_pdf/elements/__init__.py +3 -0
- natural_pdf/elements/base.py +553 -0
- natural_pdf/elements/collections.py +770 -0
- natural_pdf/elements/line.py +124 -0
- natural_pdf/elements/rect.py +122 -0
- natural_pdf/elements/region.py +1366 -0
- natural_pdf/elements/text.py +304 -0
- natural_pdf/ocr/__init__.py +62 -0
- natural_pdf/ocr/easyocr_engine.py +254 -0
- natural_pdf/ocr/engine.py +158 -0
- natural_pdf/ocr/paddleocr_engine.py +263 -0
- natural_pdf/qa/__init__.py +3 -0
- natural_pdf/qa/document_qa.py +405 -0
- natural_pdf/selectors/__init__.py +4 -0
- natural_pdf/selectors/parser.py +360 -0
- natural_pdf/templates/__init__.py +1 -0
- natural_pdf/templates/ocr_debug.html +517 -0
- natural_pdf/utils/__init__.py +4 -0
- natural_pdf/utils/highlighting.py +605 -0
- natural_pdf/utils/ocr.py +515 -0
- natural_pdf/utils/reading_order.py +227 -0
- natural_pdf/utils/visualization.py +151 -0
- natural_pdf-25.3.16.dist-info/LICENSE +21 -0
- natural_pdf-25.3.16.dist-info/METADATA +268 -0
- natural_pdf-25.3.16.dist-info/RECORD +109 -0
- natural_pdf-25.3.16.dist-info/WHEEL +5 -0
- natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
- tests/__init__.py +3 -0
- tests/test_pdf.py +39 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating how to use the until parameter with above() and below() methods.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
|
9
|
+
# Add parent directory to path for imports
|
10
|
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
11
|
+
|
12
|
+
from natural_pdf import PDF
|
13
|
+
|
14
|
+
|
15
|
+
def _preview(text, limit=100):
    """Return ``text`` cut to ``limit`` characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _describe(element):
    """Return the element's text, or 'non-text' if it has no ``text`` attribute."""
    return element.text if hasattr(element, 'text') else 'non-text'


def _find_default_pdf():
    """Return the first PDF next to this script or in ../pdfs, or None if none exists."""
    example_dir = Path(__file__).parent
    pdf_files = list(example_dir.glob("*.pdf"))

    if not pdf_files:
        pdfs_dir = example_dir.parent / "pdfs"
        if pdfs_dir.exists():
            pdf_files = list(pdfs_dir.glob("*.pdf"))

    return str(pdf_files[0]) if pdf_files else None


def _demo_generic_elements(page):
    """Demonstrate ``until`` boundaries using three arbitrary elements of ``page``.

    Used when the page has no (or too few) large-text headings to work with.
    """
    elements = page.get_elements()
    elements.sort(key=lambda e: (e.top, e.x0))  # Sort in reading order

    if len(elements) < 3:
        print("Not enough elements found for demonstration")
        return

    element1 = elements[0]
    element2 = elements[len(elements) // 3]  # About 1/3 down
    element3 = elements[len(elements) // 2]  # About halfway down

    # Highlight the reference elements
    element1.highlight(label="First Element")
    element2.highlight(label="Second Element")
    element3.highlight(label="Third Element")

    print(f"First element: '{_describe(element1)}' at y={element1.top}")
    print(f"Second element: '{_describe(element2)}' at y={element2.top}")
    print(f"Third element: '{_describe(element3)}' at y={element3.top}")

    # Demonstrate below() with until parameter
    print("\nDemonstrating below() with until parameter")

    # Get the region from element1 to element2
    region1 = element1.below(until=f'text:contains("{element2.text}")')
    region1.highlight(label="Below until Second Element")

    # Get the region from element2 to element3, excluding element3
    region2 = element2.below(until=f'text:contains("{element3.text}")', include_until=False)
    region2.highlight(label="Below until Third Element (excluded)")

    # Demonstrate above() with until parameter
    print("\nDemonstrating above() with until parameter")

    # Get the region from element3 up to element2
    region3 = element3.above(until=f'text:contains("{element2.text}")')
    region3.highlight(label="Above until Second Element")

    # Get the region from element2 up to element1, excluding element1
    region4 = element2.above(until=f'text:contains("{element1.text}")', include_until=False)
    region4.highlight(label="Above until First Element (excluded)")

    # Create an output directory
    output_dir = Path(__file__).parent / "until_output"
    output_dir.mkdir(exist_ok=True)

    # Save the result
    # NOTE(review): this path saves with page.save(..., labels=True) while the
    # headings demo uses page.to_image(path=..., show_labels=True) - confirm
    # both calls are supported by Page.
    page.save(str(output_dir / "until_boundaries.png"), labels=True)

    # Print the contents of the regions
    print("\nContent in region 'below until second element':")
    print(_preview(region1.extract_text()))

    print("\nContent in region 'above until second element':")
    print(_preview(region3.extract_text()))

    print("\nExample completed. Check 'until_output/until_boundaries.png' for the result.")


def _demo_headings(page, headings):
    """Demonstrate ``below(until=...)`` between the first two entries of ``headings``."""
    heading1 = headings[0]
    heading2 = headings[1]

    # Highlight the headings
    heading1.highlight(label="First Heading")
    heading2.highlight(label="Second Heading")

    print(f"First heading: '{heading1.text}' at y={heading1.top}")
    print(f"Second heading: '{heading2.text}' at y={heading2.top}")

    # Demonstrate below() with until parameter
    print("\nDemonstrating below() with until parameter")

    # Get the region from heading1 to heading2
    region1 = heading1.below(until=f'text:contains("{heading2.text}")')
    region1.highlight(label="Below until Second Heading")

    # Get the region from heading1 to heading2, excluding heading2
    region2 = heading1.below(until=f'text:contains("{heading2.text}")', include_until=False)
    region2.highlight(label="Below until Second Heading (excluded)")

    # Create an output directory
    output_dir = Path(__file__).parent / "until_output"
    output_dir.mkdir(exist_ok=True)

    # Save the result
    page.to_image(path=str(output_dir / "until_boundaries_headings.png"), show_labels=True)

    # Print the contents of the regions
    print("\nContent in region 'below until second heading':")
    print(_preview(region1.extract_text()))

    print("\nContent in region 'below until second heading (excluded)':")
    print(_preview(region2.extract_text()))

    print("\nExample completed. Check 'until_output/until_boundaries_headings.png' for the result.")


def main():
    """Run the ``until`` boundary demo on the first page of a PDF.

    The PDF path is taken from the first command-line argument. Without one,
    the first ``*.pdf`` next to this script, or failing that in ``../pdfs``,
    is used; the process exits with status 1 when none is found.

    The headings demo runs when at least two ``text[size>=12]`` elements are
    found; otherwise the generic-element demo runs.
    """
    # Get the PDF path from command line or use a default
    if len(sys.argv) > 1:
        pdf_path = sys.argv[1]
    else:
        pdf_path = _find_default_pdf()
        if pdf_path is None:
            print("No PDF file found. Please provide a path to a PDF file.")
            sys.exit(1)

    print(f"Using PDF: {pdf_path}")

    # Open the PDF
    pdf = PDF(pdf_path)
    page = pdf.pages[0]

    # Clear any existing highlights
    page.clear_highlights()

    # First, find some key elements on the page
    heading1 = page.find('text[size>=12]')
    if not heading1:
        # No large headings: just use a few elements as examples
        _demo_generic_elements(page)
        return

    # Find more headings
    headings = page.find_all('text[size>=12]')
    if len(headings) < 2:
        # Fall back to the generic demo directly. Calling main() again here
        # would find the same single heading, land in this branch again and
        # recurse until RecursionError.
        print("Not enough headings found. Using generic elements instead.")
        _demo_generic_elements(page)
        return

    _demo_headings(page, headings)


if __name__ == "__main__":
    main()
|
@@ -0,0 +1,112 @@
|
|
1
|
+
"""
|
2
|
+
Example demonstrating the 'until' feature of natural-pdf.
|
3
|
+
(This was previously named 'select_until')
|
4
|
+
"""
|
5
|
+
import os
|
6
|
+
import sys
|
7
|
+
|
8
|
+
# Add the parent directory to the path to import the package
|
9
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
10
|
+
|
11
|
+
from natural_pdf import PDF
|
12
|
+
|
13
|
+
def _show_summary_regions(page):
    """Run Examples 1 and 2: regions from "Summary:" down to the thick line.

    Prints a message and returns early when either boundary element is not
    found on ``page``, mirroring the guards used in Example 3.
    """
    # EXAMPLE 1: Select from "Summary:" until the thick line
    print("EXAMPLE 1: Select from Summary until thick line")
    print("----------------------------------------------")

    # Find the "Summary:" text
    summary = page.find('text:contains("Summary:")')
    if not summary:
        print("Could not find 'Summary:' text")
        return
    print(f"Found 'Summary' text at: {summary.bbox}")

    # Find the thick line
    thick_line = page.find('line[width>=2]')
    if not thick_line:
        print("Could not find a thick line")
        return
    print(f"Found thick line at: {thick_line.bbox}")

    # Create a region from Summary until the thick line
    print("\nCreating region from 'Summary' until the thick line...")
    summary_region = summary.until('line[width>=2]', width="full")
    print(f"Region boundaries: {summary_region.bbox}")

    # Extract and display text from this region
    inclusive_text = summary_region.extract_text()
    print("\nText from the region:")
    print("-" * 60)
    print(inclusive_text)
    print("-" * 60)

    # Find all text elements in this region
    text_elements = summary_region.find_all('text')
    print(f"\nFound {len(text_elements)} text elements in the region")

    # Display the first 5 elements
    if text_elements:
        print("First 5 elements:")
        for i, el in enumerate(text_elements[:5]):
            print(f" {i+1}. '{el.text}'")

    # EXAMPLE 2: Demonstrate include_endpoint=False option
    print("\nEXAMPLE 2: Without including endpoint element")
    print("----------------------------------------------")

    # Create a region from Summary until the thick line, excluding the line
    exclusive_region = summary.until('line[width>=2]', include_endpoint=False, width="full")
    print(f"Region boundaries: {exclusive_region.bbox}")

    # Compare text length (the inclusive text was already extracted above)
    exclusive_text = exclusive_region.extract_text()

    print(f"\nWith include_endpoint=True: {len(inclusive_text)} characters")
    print(f"With include_endpoint=False: {len(exclusive_text)} characters")


def _show_text_to_text_region(page):
    """Run Example 3: the region from the "Violations" text to the "Critical" text."""
    print("\nEXAMPLE 3: Select from one text to another")
    print("----------------------------------------------")

    # Find text elements to use as boundaries
    heading = page.find('text:contains("Violations")')
    if not heading:
        print("Could not find 'Violations' heading")
        return

    # Only build the region when the end boundary exists on the page
    if not page.find('text:contains("Critical")'):
        print("Could not find 'Critical' text")
        return

    # Select from "Violations" to "Critical"
    region = heading.until('text:contains("Critical")', width="full")
    print(f"\nRegion from 'Violations' to 'Critical': {region.bbox}")

    text = region.extract_text()
    print(f"Extracted {len(text)} characters of text")
    if len(text) > 100:
        print(f"First 100 characters: {text[:100]}...")


def until_example(pdf_path):
    """Demonstrate the 'until' method for defining content regions.

    Opens ``pdf_path``, takes its first page and prints the results of three
    examples: a region from "Summary:" to a thick line, the same region with
    ``include_endpoint=False``, and a region between two text matches.
    An example whose boundary elements are missing prints a message instead
    of running.

    Args:
        pdf_path: Path to the PDF file to open.
    """
    # Open the PDF
    with PDF(pdf_path) as pdf:
        page = pdf.pages[0]

        print(f"PDF loaded: {pdf_path}")
        print(f"PDF has {len(pdf)} pages\n")

        _show_summary_regions(page)
        _show_text_to_text_region(page)

        print("\nEnd of 'until' method demonstration.")
|
94
|
+
|
95
|
+
if __name__ == "__main__":
    cli_args = sys.argv[1:]

    if cli_args:
        # An explicit path was given: it must point at an existing file
        pdf_path = cli_args[0]
        if not os.path.exists(pdf_path):
            print(f"File not found: {pdf_path}")
            sys.exit(1)
    else:
        # No path given: fall back to the example PDF in the pdfs directory
        script_dir = os.path.dirname(__file__)
        pdf_path = os.path.abspath(
            os.path.join(script_dir, '..', 'pdfs', '01-practice.pdf'))
        if not os.path.exists(pdf_path):
            print("Example PDF not found. Please provide a path to a PDF file.")
            print("Usage: python until_example.py [path/to/file.pdf]")
            sys.exit(1)

    until_example(pdf_path)
|
examples/very_basics.py
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
from natural_pdf import PDF
|
2
|
+
|
3
|
+
# Open the PDF
|
4
|
+
pdf = PDF("./pdfs/01-practice.pdf")

# Approximate match for red
serial = pdf.find('text[color~=red]')

# Region starting at the 'Summary' text (inclusive) and running down to the thick line
summary_heading = pdf.find('text:contains("Summary")')
summary = summary_heading.below(
    include_element=True,
    until='line[width>=2]',
)

# Debug: label both selections, then render the first page with labels shown
serial.highlight(label='Serial')
summary.highlight(label='Summary')
first_page = pdf.pages[0]
first_page.to_image(path="output.png", show_labels=True)
|
natural_pdf/__init__.py
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
"""
|
2
|
+
Natural PDF - A more intuitive interface for working with PDFs.
|
3
|
+
"""
|
4
|
+
import logging
|
5
|
+
|
6
|
+
# Create library logger
|
7
|
+
logger = logging.getLogger("natural_pdf")

# Add a NullHandler to prevent "No handler found" warnings
# (Best practice for libraries)
logger.addHandler(logging.NullHandler())

# Attribute set on the StreamHandler that configure_logging() creates itself,
# so that later calls can recognise and reuse it.
_DEFAULT_HANDLER_MARK = "_natural_pdf_default_handler"


# Utility function for users to easily configure logging
def configure_logging(level=logging.INFO, handler=None):
    """Configure Natural PDF's logging.

    Safe to call repeatedly: the default stream handler is created once and
    reused, so calling this again (for example to change the level) does not
    duplicate log output.

    Args:
        level: The logging level (e.g., logging.INFO, logging.DEBUG)
        handler: A custom handler, or None to use a StreamHandler with a
            "name - level - message" format. A custom handler is attached
            as given; its formatter is not modified.
    """
    # Remove every placeholder NullHandler, wherever it sits in the list
    for existing in list(logger.handlers):
        if isinstance(existing, logging.NullHandler):
            logger.removeHandler(existing)

    if handler is None:
        # Reuse the default handler from an earlier call instead of stacking
        # a second StreamHandler, which would emit every record twice.
        handler = next(
            (h for h in logger.handlers if getattr(h, _DEFAULT_HANDLER_MARK, False)),
            None,
        )
        if handler is None:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            setattr(handler, _DEFAULT_HANDLER_MARK, True)

    # Logger.addHandler ignores a handler that is already attached
    logger.addHandler(handler)
    logger.setLevel(level)

    # Propagate level to all child loggers. Iterate over a snapshot of the
    # names because getLogger() may touch the logger registry.
    for name in list(logging.root.manager.loggerDict):
        if name.startswith("natural_pdf."):
            logging.getLogger(name).setLevel(level)
|
37
|
+
|
38
|
+
from natural_pdf.core.pdf import PDF
|
39
|
+
from natural_pdf.core.page import Page
|
40
|
+
from natural_pdf.elements.region import Region
|
41
|
+
from natural_pdf.elements.collections import ElementCollection
|
42
|
+
|
43
|
+
# Import QA module if available
|
44
|
+
try:
    from natural_pdf.qa import DocumentQA, get_qa_engine
except ImportError:
    # The QA module could not be imported; expose only the core API
    HAS_QA = False
else:
    HAS_QA = True

# NOTE(review): the wheel this file ships in is named 25.3.16, but the
# version string here says 0.1.0 - confirm which one is intended.
__version__ = "0.1.0"

# Core public names, plus the QA entry points when they were importable
__all__ = ["PDF", "Page", "Region", "ElementCollection", "configure_logging"]
if HAS_QA:
    __all__ += ["DocumentQA", "get_qa_engine"]
|