natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,230 @@
1
+ """
2
+ Example script demonstrating hierarchical document navigation with Docling.
3
+
4
+ This script shows how to use Docling's hierarchical document structure to:
5
+ 1. Navigate parent-child relationships
6
+ 2. Extract structured content from nested document elements
7
+ 3. Visualize the document hierarchy
8
+
9
+ Usage:
10
+ python examples/docling_hierarchy_example.py [pdf_path]
11
+
12
+ Dependencies:
13
+ - torch
14
+ - transformers
15
+ - docling_core (the script imports `docling`; install with `pip install docling`)
16
+ """
17
+
18
+ import os
19
+ import sys
20
+ import logging
21
+ from pathlib import Path
22
+
23
+ # Import the library
24
+ from natural_pdf import PDF, configure_logging
25
+
26
# Logging: send library messages at INFO and keep a logger for this script.
configure_logging(level=logging.INFO)
logger = logging.getLogger("docling_hierarchy")
logger.setLevel(logging.INFO)

# Input selection: an explicit command-line path wins; with no argument the
# practice document under <repo>/pdfs is used.
if len(sys.argv) <= 1:
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
else:
    pdf_path = sys.argv[1]

# The import is only a probe: stop early with install instructions when
# Docling cannot be imported.
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    for message in ("Missing required packages. Please install:",
                    "pip install docling"):
        print(message)
    sys.exit(1)

# Generated artifacts go to the "output" directory next to the examples folder.
_examples_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(_examples_dir, "..", "output")
os.makedirs(output_dir, exist_ok=True)

# Open the document and work on its first page.
print(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)
page = pdf.pages[0]

# Layout analysis with the Docling model. Per the original author's note,
# `confidence` is not used by Docling and is kept for API consistency, and
# other `model_params` entries would be passed on to DocumentConverter.
print("Running Docling analysis...")
_docling_params = {"verbose": True}
page.analyze_layout(
    model="docling",
    confidence=0.2,
    model_params=_docling_params,
)

# The rest of the script relies on the page exposing a Docling document.
if not hasattr(page, 'docling_document'):
    print("Error: Docling document not created")
    sys.exit(1)

# Every region produced by the Docling model.
docling_regions = page.find_all('region[model=docling]')
print(f"Found {len(docling_regions)} Docling regions")

# Regions without a parent form the top of the hierarchy.
top_level = list(filter(lambda region: not region.parent_region, docling_regions))
print(f"Document has {len(top_level)} top-level elements")

# Print up to five top-level elements, up to three children of each, and a
# grandchild count for any child that has children of its own.
print("\n--- Top-Level Hierarchy ---")
for position, elem in enumerate(top_level[:5], start=1):
    print(f"Element {position}: {elem.region_type}")

    kids = getattr(elem, 'child_regions', None)
    if not kids:
        continue

    print(f" - Children: {len(kids)}")
    for child_position, child in enumerate(kids[:3], start=1):
        print(f" Child {child_position}: {child.region_type}")

        grandkids = getattr(child, 'child_regions', None)
        if grandkids:
            print(f" - Grandchildren: {len(grandkids)}")

    # Mention how many children were not listed.
    unlisted = len(kids) - 3
    if unlisted > 0:
        print(f" ... and {unlisted} more children")

# Section headers are the entry point for the navigation demo that follows.
section_headers = page.find_all('section-header')
print(f"\nFound {len(section_headers)} section headers")
105
+
106
# Helpers for the section demo. They are defined up front so the demo below
# reads as a straight sequence of steps.

# One RGBA colour per hierarchy depth; deeper levels reuse the last entry.
colors = [
    (1, 0, 0, 0.3),      # Red - Top level
    (0, 0.7, 0, 0.3),    # Green - Level 1
    (0, 0, 1, 0.3),      # Blue - Level 2
    (1, 0.7, 0, 0.3),    # Orange - Level 3
    (0.7, 0, 1, 0.3),    # Purple - Level 4
]


def print_outline(element, level=0):
    """Print one outline line for *element*, then recurse into its children.

    The element text is shortened to 50 characters (47 plus "...") and each
    line is prefixed with one space per nesting level.
    """
    summary = element.extract_text()
    if len(summary) > 50:
        summary = summary[:47] + "..."
    print(f"{' ' * level}- {element.region_type}: {summary}")

    nested = element.get_children() if hasattr(element, 'get_children') else ()
    for child in nested:
        print_outline(child, level + 1)


def highlight_hierarchy(element, level=0):
    """Highlight *element* and its children, coloured by depth."""
    last_index = len(colors) - 1
    palette_index = level if level < last_index else last_index
    element.highlight(
        color=colors[palette_index],
        label=f"Level {level}: {element.region_type}",
        include_attrs=['region_type'],
    )

    if hasattr(element, 'get_children'):
        for child in element.get_children():
            highlight_hierarchy(child, level + 1)


def extract_structured_content(element):
    """Return a nested dict with "type", "text" and "children" for *element*."""
    node_type = element.region_type
    node_text = element.extract_text()
    nested = []
    if hasattr(element, 'get_children'):
        nested = [extract_structured_content(child) for child in element.get_children()]
    return {"type": node_type, "text": node_text, "children": nested}


def print_structure(structure, level=0):
    """Print a structured-content dict, showing at most two children per node."""
    pad = " " * level
    label = structure["text"]
    if len(label) > 50:
        label = label[:47] + "..."
    print(f"{pad}{structure['type']}: {label}")

    kids = structure["children"]
    if not kids:
        return

    print(f"{pad}Children: {len(kids)}")
    for kid in kids[:2]:
        print_structure(kid, level + 1)
    if len(kids) > 2:
        print(f"{pad}... and {len(kids) - 2} more children")


# Hierarchical navigation demo, run only when the page has section headers.
if not section_headers:
    print("No section headers found for hierarchy demonstration")
else:
    # The first section header is the subject of the demonstration.
    header = section_headers[0]
    print(f"\n--- Analyzing Section: {header.extract_text()[:50]}... ---")

    # Direct children, unfiltered and then restricted to 'text'.
    children = header.get_children()
    print(f"Direct children: {len(children)}")
    text_children = header.get_children('text')
    print(f"Direct text children: {len(text_children)}")

    # Descendants, unfiltered and then restricted to 'text'.
    descendants = header.get_descendants()
    print(f"All descendants: {len(descendants)}")
    text_descendants = header.get_descendants('text')
    print(f"All text descendants: {len(text_descendants)}")

    # The same kind of lookup through a recursive find_all.
    found_text = header.find_all('text', recursive=True)
    print(f"Text elements found recursively: {len(found_text)}")

    # Text of the section, previewed to 100 characters.
    section_text = header.extract_text()
    print(f"Full section text ({len(section_text)} chars): {section_text[:100]}...")

    # Outline of the section.
    print("\n--- Section Outline ---")
    print_outline(header)

    # Colour-coded visualization of the same hierarchy.
    print("\nVisualizing section hierarchy...")
    page.clear_highlights()
    highlight_hierarchy(header)

    hierarchy_path = os.path.join(output_dir, "docling_hierarchy.png")
    page.save_image(hierarchy_path, labels=True)
    print(f"Saved hierarchy visualization to {hierarchy_path}")

    # BONUS: turn the hierarchy into plain dicts and show a short summary.
    print("\n--- Structured Content Extraction ---")
    structured_content = extract_structured_content(header)
    print_structure(structured_content)

    # Advanced: write the structured content to a JSON file.
    import json
    structured_path = os.path.join(output_dir, "docling_structured_content.json")
    with open(structured_path, 'w') as f:
        json.dump(structured_content, f, indent=2)
    print(f"Saved structured content to {structured_path}")

print("\nHierarchy analysis complete!")
@@ -0,0 +1,241 @@
1
+ """
2
+ Example script demonstrating how Docling handles text from different sources.
3
+
4
+ This script shows how Docling integrates with natural-pdf's text extraction system,
5
+ handling both native PDF text and OCR text intelligently.
6
+
7
+ Usage:
8
+ python examples/docling_text_sources.py [pdf_path]
9
+
10
+ Dependencies:
11
+ - torch
12
+ - transformers
13
+ - docling_core (the script imports `docling`; install with `pip install docling`)
14
+ """
15
+
16
+ import os
17
+ import sys
18
+ import logging
19
+ from pathlib import Path
20
+
21
+ # Import the library
22
+ from natural_pdf import PDF, configure_logging
23
+
24
+ # Configure detailed logging to see text source decision messages
25
+ configure_logging(level=logging.INFO)
26
+ logger = logging.getLogger("natural_pdf")
27
+ logger.setLevel(logging.INFO)
28
+
29
# Resolve the input PDFs.
#
# `native_pdf_path` and `ocr_pdf_path` are both read unconditionally further
# down, so they must be bound whether or not a command-line argument is given.
# Previously they were only assigned when no argument was passed, so running
# the script with a path failed with NameError, and the path itself was
# never used.
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)

# Bundled sample documents:
# 1. one with native text
native_pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
# 2. one that needs OCR
ocr_pdf_path = os.path.join(repo_root, "pdfs", "needs-ocr.pdf")

# A path given on the command line replaces the native-text sample; the OCR
# part keeps using the bundled OCR sample.
if len(sys.argv) > 1:
    native_pdf_path = sys.argv[1]

# Kept so the module-level name stays available; it always mirrors the
# document analysed in part 1.
pdf_path = native_pdf_path
45
+
46
# The import is only a probe: stop early with install instructions when
# Docling cannot be imported.
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    for message in ("Missing required packages. Please install:",
                    "pip install docling"):
        print(message)
    sys.exit(1)

# Generated images go to the "output" directory next to the examples folder.
_examples_dir = os.path.dirname(os.path.abspath(__file__))
output_dir = os.path.join(_examples_dir, "..", "output")
os.makedirs(output_dir, exist_ok=True)

# Attach an extra stream handler to the logger so its INFO messages are
# printed in "name - LEVEL - message" form.
formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)
64
+
65
# ---------------------------------------------------------------------------
# Part 1: a PDF that already carries native text
# ---------------------------------------------------------------------------
print("\n=== PART 1: PDF WITH NATIVE TEXT ===")

print(f"Loading PDF with native text: {native_pdf_path}")
native_pdf = PDF(native_pdf_path)
native_page = native_pdf.pages[0]

# Baseline: how many word elements the page has before any analysis.
original_elements = native_page.words
print(f"PDF has {len(original_elements)} native text elements")

# Layout analysis with the Docling model.
print("\nRunning Docling analysis...")
native_page.analyze_layout(model="docling", confidence=0.2)

docling_regions = native_page.find_all('region[model=docling]')
print(f"Found {len(docling_regions)} Docling regions")

# Tally text elements by where they came from.
native_text = native_page.find_all('text[source=native]')
ocr_text = native_page.find_all('text[source=ocr]')
docling_text_regions = native_page.find_all('region[model=docling][type=text]')

print("\nText elements by source:")
print(f" Native PDF text: {len(native_text)} elements")
print(f" OCR text: {len(ocr_text)} elements")
print(f" Docling text regions: {len(docling_text_regions)} elements")

# Report, for the first five regions, which text sources each one has.
print("\nChecking text sources for regions:")
for number, region in enumerate(docling_regions[:5], start=1):
    # getattr with a False default gives the attribute value when present
    # and False when the attribute is missing.
    has_text_content = getattr(region, 'text_content', False)
    has_associated_text = getattr(region, 'associated_text_elements', False)

    text = region.extract_text()

    print(f"\nRegion {number} ({region.region_type}):")
    print(f" Has direct text content: {has_text_content}")
    print(f" Has associated text elements: {has_associated_text}")
    print(f" Text length: {len(text)} characters")
    print(f" Text preview: '{text[:50]}...'")

# Draw the sources on the page.
print("\nVisualizing text sources...")
native_page.clear_highlights()

# Native text elements.
native_text.highlight(
    color=(0, 0, 0.7, 0.3),
    label="Native PDF Text Elements",
    include_attrs=['source'],
)

# Regions that have associated text elements.
native_text_regions = [
    region for region in docling_regions
    if getattr(region, 'associated_text_elements', False)
]
if native_text_regions:
    from natural_pdf.elements.collections import ElementCollection
    ElementCollection(native_text_regions).highlight(
        color=(0, 0.7, 0, 0.3),
        label="Regions using Native Text",
        include_attrs=['region_type'],
    )

# Regions that have text content but no associated text elements.
docling_text_regions = [
    region for region in docling_regions
    if getattr(region, 'text_content', False)
    and not getattr(region, 'associated_text_elements', False)
]
if docling_text_regions:
    from natural_pdf.elements.collections import ElementCollection
    ElementCollection(docling_text_regions).highlight(
        color=(0.7, 0, 0, 0.3),
        label="Regions using Docling Text Only",
        include_attrs=['region_type'],
    )

# Save the annotated page.
native_output_path = os.path.join(output_dir, "docling_native_text_sources.png")
native_page.save_image(native_output_path, labels=True)
print(f"Saved visualization to {native_output_path}")
161
+
162
# ---------------------------------------------------------------------------
# Part 2: a PDF that needs OCR (skipped when the sample file is absent)
# ---------------------------------------------------------------------------
print("\n=== PART 2: PDF REQUIRING OCR ===")

# A missing sample is not an error: say so and exit with status 0.
if not os.path.exists(ocr_pdf_path):
    for message in (f"OCR test PDF not found at {ocr_pdf_path}",
                    "Skipping OCR text source test"):
        print(message)
    sys.exit(0)

print(f"Loading PDF requiring OCR: {ocr_pdf_path}")
ocr_pdf = PDF(ocr_pdf_path, ocr="auto")  # Enable auto OCR
ocr_page = ocr_pdf.pages[0]

# Run the standard OCR pass before Docling.
print("\nExtracting text with standard OCR first...")
ocr_elements = ocr_page.apply_ocr()
print(f"Standard OCR found {len(ocr_elements)} text elements")

# Then run layout analysis with the Docling model on the same page.
print("\nRunning Docling analysis with integrated OCR...")
ocr_page.analyze_layout(model="docling", confidence=0.2)

ocr_docling_regions = ocr_page.find_all('region[model=docling]')
print(f"Found {len(ocr_docling_regions)} Docling regions")

# Report, for the first five regions, which text sources each one has.
print("\nChecking text sources for regions:")
for number, region in enumerate(ocr_docling_regions[:5], start=1):
    # getattr with a False default gives the attribute value when present
    # and False when the attribute is missing.
    has_text_content = getattr(region, 'text_content', False)
    has_associated_text = getattr(region, 'associated_text_elements', False)

    text = region.extract_text()

    print(f"\nRegion {number} ({region.region_type}):")
    print(f" Has Docling text content: {has_text_content}")
    print(f" Has associated OCR elements: {has_associated_text}")
    print(f" Text length: {len(text)} characters")
    print(f" Text preview: '{text[:50]}...'")

# Draw the sources on the page.
print("\nVisualizing OCR text sources...")
ocr_page.clear_highlights()

# Elements whose source is OCR.
standard_ocr_text = ocr_page.find_all('text[source=ocr]')
standard_ocr_text.highlight(
    color=(0, 0, 0.7, 0.3),
    label="Standard OCR Text",
    include_attrs=['confidence'],
)

# Regions that have text content.
docling_ocr_regions = [
    region for region in ocr_docling_regions
    if getattr(region, 'text_content', False)
]
if docling_ocr_regions:
    from natural_pdf.elements.collections import ElementCollection
    ElementCollection(docling_ocr_regions).highlight(
        color=(0.7, 0, 0, 0.3),
        label="Docling OCR Text",
        include_attrs=['region_type'],
    )

# Save the annotated page.
ocr_output_path = os.path.join(output_dir, "docling_ocr_text_sources.png")
ocr_page.save_image(ocr_output_path, labels=True)
print(f"Saved visualization to {ocr_output_path}")

print("\nText source analysis complete!")
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example demonstrating the simplified document QA interface.
4
+ """
5
+ import sys
6
+ import os
7
+ import argparse
8
+
9
+ # Add the parent directory to the path so we can import the natural_pdf package
10
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11
+
12
+ from natural_pdf import PDF
13
+
14
def main():
    """Ask a question about a PDF from the command line and print the answer.

    Command-line arguments:
        pdf: path to the document (defaults to "pdfs/2019 Statistics.pdf").
        --question / -q: the question to ask.
        --full / -f: print every key of the result dictionary instead of the
            short answer/confidence/page summary.

    After the main question a fixed follow-up question is asked against the
    same document.

    Exit status:
        Exits with status 1 when the PDF file does not exist or when loading
        or querying the document raises. Previously a failure while querying
        was printed but the process still exited with status 0, so callers
        could not detect it.
    """
    parser = argparse.ArgumentParser(description="Example of using the improved document QA interface")
    parser.add_argument('pdf', nargs='?',
                        default="pdfs/2019 Statistics.pdf",
                        help="Path to a PDF document")
    parser.add_argument('--question', '-q',
                        default="What information does this document contain?",
                        help="Question to ask about the document")
    parser.add_argument('--full', '-f', action='store_true',
                        help="Show the full result dictionary with confidence scores")
    args = parser.parse_args()

    if not os.path.exists(args.pdf):
        print(f"Error: PDF file '{args.pdf}' not found")
        sys.exit(1)

    print(f"Loading PDF: {args.pdf}")
    print(f"Question: {args.question}")

    try:
        # Open the PDF
        with PDF(args.pdf) as pdf:
            # Get result dictionary
            result = pdf.ask(args.question)

            # Display result
            if args.full:
                print("\nFull result:")
                for key, value in result.items():
                    # Only float confidences get the two-decimal format.
                    if key == 'confidence' and isinstance(value, float):
                        print(f" {key}: {value:.2f}")
                    else:
                        print(f" {key}: {value}")
            else:
                print("\nResult:")
                print(f" Answer: {result['answer']}")
                if 'confidence' in result:
                    print(f" Confidence: {result['confidence']:.2f}")
                if 'page_num' in result:
                    print(f" Page: {result['page_num']}")

            # Ask another related question
            print("\nAsking follow-up question:")
            follow_up = "What year does this data cover?"
            print(f"Question: {follow_up}")
            follow_result = pdf.ask(follow_up)
            print(f"Answer: {follow_result['answer']}")

    except Exception as e:
        # Top-level boundary of the script: report the problem and exit with
        # the same failure status as the missing-file case above.
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example demonstrating loading a PDF from a URL.
4
+ """
5
+ import sys
6
+ import os
7
+ import argparse
8
+
9
+ # Add the parent directory to the path so we can import the natural_pdf package
10
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
11
+
12
+ from natural_pdf import PDF
13
+
14
def main():
    """Load a PDF from a URL and print a short summary of its first page.

    Takes one optional positional argument, the URL, which defaults to an
    arXiv paper. Prints the page count, up to three title candidates (text
    with size of at least 12) and a preview of the first page's text limited
    to 200 characters.
    """
    cli = argparse.ArgumentParser(description="Example of loading a PDF from a URL")
    cli.add_argument(
        'url',
        nargs='?',
        default="https://arxiv.org/pdf/2103.14749.pdf",
        help="URL to a PDF document (default: an arXiv paper)",
    )
    options = cli.parse_args()

    print(f"Loading PDF from URL: {options.url}")

    with PDF(options.url) as pdf:
        print(f"Document loaded successfully: {len(pdf)} pages")

        # Nothing more to show for a document without pages.
        if not len(pdf) > 0:
            return

        first_page = pdf.pages[0]

        # Larger text on the first page is treated as a title candidate.
        candidates = first_page.find_all('text[size>=12]')
        if candidates:
            print("\nTitle candidates:")
            for rank, candidate in enumerate(candidates[:3], 1):
                print(f"{rank}. {candidate.text}")

        # Preview the page text, marking truncation with an ellipsis.
        body = first_page.extract_text()
        if len(body) > 200:
            preview = body[:200] + "..."
        else:
            preview = body
        print(f"\nText preview:\n{preview}")


if __name__ == "__main__":
    main()