natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/direct_qa_example.py +17 -111
- examples/docling_comprehensive_test.py +325 -0
- examples/docling_example.py +192 -0
- examples/docling_hierarchy_example.py +230 -0
- examples/docling_text_sources.py +241 -0
- examples/improved_qa_example.py +66 -0
- examples/url_pdf_example.py +45 -0
- natural_pdf/analyzers/document_layout.py +276 -0
- natural_pdf/core/page.py +72 -21
- natural_pdf/core/pdf.py +102 -71
- natural_pdf/elements/region.py +174 -19
- natural_pdf/qa/document_qa.py +29 -38
- natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +25 -3
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +18 -12
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0
examples/direct_qa_example.py
CHANGED
@@ -23,74 +23,6 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
23
23
|
from natural_pdf import PDF, configure_logging
|
24
24
|
import logging
|
25
25
|
|
26
|
-
def pdfplumber_qa(pdf_path, question, debug=False):
|
27
|
-
"""Run QA using direct pdfplumber code similar to the original example."""
|
28
|
-
# Open PDF
|
29
|
-
pdf = pdfplumber.open(pdf_path)
|
30
|
-
page = pdf.pages[0]
|
31
|
-
|
32
|
-
# Get image
|
33
|
-
image = page.to_image(resolution=300).original
|
34
|
-
|
35
|
-
# Extract words
|
36
|
-
words = page.extract_words()
|
37
|
-
|
38
|
-
# Build word boxes in the expected format
|
39
|
-
def get_box(word):
|
40
|
-
return [
|
41
|
-
word['text'],
|
42
|
-
[int(word["x0"]), int(word["top"]), int(word["x1"]), int(word["bottom"])]
|
43
|
-
]
|
44
|
-
|
45
|
-
word_boxes = [get_box(word) for word in words]
|
46
|
-
|
47
|
-
# Debug visualization
|
48
|
-
if debug:
|
49
|
-
os.makedirs("output", exist_ok=True)
|
50
|
-
|
51
|
-
# Save image
|
52
|
-
image.save("output/direct_qa_image.png")
|
53
|
-
|
54
|
-
# Save visualization
|
55
|
-
vis_image = image.copy()
|
56
|
-
draw = ImageDraw.Draw(vis_image)
|
57
|
-
|
58
|
-
for i, (text, box) in enumerate(word_boxes):
|
59
|
-
x0, y0, x1, y1 = box
|
60
|
-
draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
|
61
|
-
draw.text((x0, y0), str(i), fill=(255, 0, 0))
|
62
|
-
|
63
|
-
vis_image.save("output/direct_qa_boxes.png")
|
64
|
-
|
65
|
-
# Use transformers pipeline
|
66
|
-
try:
|
67
|
-
from transformers import pipeline
|
68
|
-
|
69
|
-
pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
|
70
|
-
|
71
|
-
# Run query
|
72
|
-
query = { "image": image, "question": question, "word_boxes": word_boxes }
|
73
|
-
|
74
|
-
result = pipe(query)[0]
|
75
|
-
|
76
|
-
# Create result dictionary similar to Natural PDF's format
|
77
|
-
return {
|
78
|
-
"answer": result.get("answer", ""),
|
79
|
-
"confidence": result.get("score", 0.0),
|
80
|
-
"start": result.get("start", 0),
|
81
|
-
"end": result.get("end", 0),
|
82
|
-
"found": True if result.get("answer") else False
|
83
|
-
}
|
84
|
-
|
85
|
-
except Exception as e:
|
86
|
-
print(f"Error in direct QA: {e}")
|
87
|
-
return {
|
88
|
-
"answer": "",
|
89
|
-
"confidence": 0.0,
|
90
|
-
"error": str(e),
|
91
|
-
"found": False
|
92
|
-
}
|
93
|
-
|
94
26
|
def main():
|
95
27
|
parser = argparse.ArgumentParser(description="Direct Document QA Example")
|
96
28
|
parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
|
@@ -99,8 +31,6 @@ def main():
|
|
99
31
|
help="Question to ask about the document")
|
100
32
|
parser.add_argument("--debug", action="store_true",
|
101
33
|
help="Save debug information for troubleshooting")
|
102
|
-
parser.add_argument("--compare", action="store_true",
|
103
|
-
help="Compare with Natural PDF implementation")
|
104
34
|
|
105
35
|
args = parser.parse_args()
|
106
36
|
|
@@ -113,53 +43,29 @@ def main():
|
|
113
43
|
print(f"Document: {args.pdf_path}")
|
114
44
|
print(f"Question: {args.question}")
|
115
45
|
|
116
|
-
|
117
|
-
|
118
|
-
|
46
|
+
print("\n=== Natural PDF implementation ===")
|
47
|
+
|
48
|
+
# Use Natural PDF
|
49
|
+
pdf = PDF(args.pdf_path)
|
50
|
+
page = pdf.pages[0]
|
51
|
+
|
52
|
+
# Ask the question
|
53
|
+
result = page.ask(args.question, debug=args.debug)
|
119
54
|
|
120
55
|
if result.get("found", False):
|
121
56
|
print(f"Answer: {result['answer']}")
|
122
57
|
print(f"Confidence: {result['confidence']:.2f}")
|
58
|
+
|
59
|
+
# Highlight the answer
|
60
|
+
if result.get("source_elements"):
|
61
|
+
for element in result["source_elements"]:
|
62
|
+
element.highlight(color=(1, 0.5, 0, 0.5))
|
63
|
+
|
64
|
+
# Save the image
|
65
|
+
page.save_image("output/natural_pdf_answer.png")
|
66
|
+
print("Saved highlighted answer to output/natural_pdf_answer.png")
|
123
67
|
else:
|
124
68
|
print(f"No answer found: {result.get('error', '')}")
|
125
69
|
|
126
|
-
# Compare with Natural PDF if requested
|
127
|
-
if args.compare:
|
128
|
-
print("\n=== Natural PDF implementation ===")
|
129
|
-
|
130
|
-
# Use Natural PDF
|
131
|
-
pdf = PDF(args.pdf_path)
|
132
|
-
page = pdf.pages[0]
|
133
|
-
|
134
|
-
# Ask the question
|
135
|
-
natural_result = page.ask(args.question, debug=args.debug)
|
136
|
-
|
137
|
-
if natural_result.get("found", False):
|
138
|
-
print(f"Answer: {natural_result['answer']}")
|
139
|
-
print(f"Confidence: {natural_result['confidence']:.2f}")
|
140
|
-
|
141
|
-
# Highlight the answer
|
142
|
-
if natural_result.get("source_elements"):
|
143
|
-
for element in natural_result["source_elements"]:
|
144
|
-
element.highlight(color=(1, 0.5, 0, 0.5))
|
145
|
-
|
146
|
-
# Save the image
|
147
|
-
page.save_image("output/natural_pdf_answer.png")
|
148
|
-
print("Saved highlighted answer to output/natural_pdf_answer.png")
|
149
|
-
else:
|
150
|
-
print(f"No answer found: {natural_result.get('error', '')}")
|
151
|
-
|
152
|
-
# Compare results
|
153
|
-
if result.get("found", False) and natural_result.get("found", False):
|
154
|
-
print("\n=== Comparison ===")
|
155
|
-
print(f"Direct answer: '{result['answer']}' (confidence: {result['confidence']:.2f})")
|
156
|
-
print(f"Natural PDF answer: '{natural_result['answer']}' (confidence: {natural_result['confidence']:.2f})")
|
157
|
-
|
158
|
-
# Calculate similarity
|
159
|
-
if result['answer'] == natural_result['answer']:
|
160
|
-
print("Results match exactly!")
|
161
|
-
else:
|
162
|
-
print("Results differ.")
|
163
|
-
|
164
70
|
if __name__ == "__main__":
|
165
71
|
main()
|
@@ -0,0 +1,325 @@
|
|
1
|
+
"""
|
2
|
+
Comprehensive test of the Docling integration with Natural PDF.
|
3
|
+
|
4
|
+
This script tests all aspects of the Docling integration:
|
5
|
+
1. Basic document layout detection
|
6
|
+
2. Hierarchical document navigation
|
7
|
+
3. Text extraction from complex structures
|
8
|
+
4. Integration with other layout models
|
9
|
+
5. Performance and edge cases
|
10
|
+
|
11
|
+
Usage:
|
12
|
+
python examples/docling_comprehensive_test.py [pdf_path]
|
13
|
+
|
14
|
+
Dependencies:
|
15
|
+
- torch
|
16
|
+
- transformers
|
17
|
+
- docling_core
|
18
|
+
"""
|
19
|
+
|
20
|
+
import os
|
21
|
+
import sys
|
22
|
+
import time
|
23
|
+
import logging
|
24
|
+
from pathlib import Path
|
25
|
+
|
26
|
+
# Import the library
|
27
|
+
from natural_pdf import PDF, configure_logging
|
28
|
+
|
29
|
+
# Configure detailed logging for debugging
|
30
|
+
configure_logging(level=logging.INFO)
|
31
|
+
logger = logging.getLogger("docling_test")
|
32
|
+
logger.setLevel(logging.INFO)
|
33
|
+
|
34
|
+
# Get PDF path from command line or use demo file
|
35
|
+
if len(sys.argv) > 1:
|
36
|
+
pdf_path = sys.argv[1]
|
37
|
+
else:
|
38
|
+
# Default to a sample PDF in the pdfs directory
|
39
|
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
40
|
+
repo_root = os.path.dirname(script_dir)
|
41
|
+
pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
|
42
|
+
|
43
|
+
# Check if required packages are installed
|
44
|
+
try:
|
45
|
+
from docling.document_converter import DocumentConverter
|
46
|
+
except ImportError:
|
47
|
+
logger.error("Missing required packages. Please install with:")
|
48
|
+
logger.error("pip install docling")
|
49
|
+
sys.exit(1)
|
50
|
+
|
51
|
+
# Create output directory for test results
|
52
|
+
output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output", "docling_tests")
|
53
|
+
os.makedirs(output_dir, exist_ok=True)
|
54
|
+
|
55
|
+
# Load the PDF
|
56
|
+
logger.info(f"Loading PDF: {pdf_path}")
|
57
|
+
pdf = PDF(pdf_path)
|
58
|
+
logger.info(f"PDF has {len(pdf.pages)} pages")
|
59
|
+
|
60
|
+
# Process only the first page for tests
|
61
|
+
page = pdf.pages[0]
|
62
|
+
|
63
|
+
# SECTION 1: Basic Docling Detection
|
64
|
+
logger.info("\n*** SECTION 1: Basic Docling Detection ***")
|
65
|
+
|
66
|
+
# Time the Docling analysis
|
67
|
+
start_time = time.time()
|
68
|
+
page.analyze_layout(
|
69
|
+
model="docling",
|
70
|
+
confidence=0.2, # This parameter isn't used by Docling but kept for API consistency
|
71
|
+
model_params={
|
72
|
+
"verbose": True
|
73
|
+
# Any other parameters would be passed directly to DocumentConverter
|
74
|
+
}
|
75
|
+
)
|
76
|
+
docling_time = time.time() - start_time
|
77
|
+
logger.info(f"Docling analysis completed in {docling_time:.2f} seconds")
|
78
|
+
|
79
|
+
# Verify that docling_document was created
|
80
|
+
if hasattr(page, 'docling_document'):
|
81
|
+
logger.info("✅ Docling document created successfully")
|
82
|
+
else:
|
83
|
+
logger.error("❌ Docling document not created")
|
84
|
+
|
85
|
+
# Count detected regions
|
86
|
+
docling_regions = page.find_all('region[model=docling]')
|
87
|
+
logger.info(f"Found {len(docling_regions)} total Docling regions")
|
88
|
+
|
89
|
+
# Get regions by type
|
90
|
+
section_headers = page.find_all('section-header')
|
91
|
+
text_regions = page.find_all('region[model=docling][type=text]')
|
92
|
+
figures = page.find_all('region[model=docling][type=figure]')
|
93
|
+
|
94
|
+
logger.info(f"- Section headers: {len(section_headers)}")
|
95
|
+
logger.info(f"- Text regions: {len(text_regions)}")
|
96
|
+
logger.info(f"- Figures: {len(figures)}")
|
97
|
+
|
98
|
+
# SECTION 2: Hierarchical Navigation
|
99
|
+
logger.info("\n*** SECTION 2: Hierarchical Navigation ***")
|
100
|
+
|
101
|
+
# Test if regions have child_regions attribute
|
102
|
+
has_children_attr = all(hasattr(region, 'child_regions') for region in docling_regions)
|
103
|
+
logger.info(f"All regions have child_regions attribute: {has_children_attr}")
|
104
|
+
|
105
|
+
# Count top-level regions (no parent)
|
106
|
+
top_level_regions = [r for r in docling_regions if not r.parent_region]
|
107
|
+
logger.info(f"Top-level regions: {len(top_level_regions)}")
|
108
|
+
|
109
|
+
# Test child traversal for section headers
|
110
|
+
if section_headers:
|
111
|
+
header = section_headers[0]
|
112
|
+
logger.info(f"Testing section header: '{header.extract_text()[:30]}...'")
|
113
|
+
|
114
|
+
# Test get_children method
|
115
|
+
if hasattr(header, 'get_children'):
|
116
|
+
children = header.get_children()
|
117
|
+
logger.info(f"- Direct children: {len(children)}")
|
118
|
+
|
119
|
+
# Test filtered get_children
|
120
|
+
text_children = header.get_children('text')
|
121
|
+
logger.info(f"- Direct text children: {len(text_children)}")
|
122
|
+
else:
|
123
|
+
logger.error("❌ get_children method not found")
|
124
|
+
|
125
|
+
# Test get_descendants method
|
126
|
+
if hasattr(header, 'get_descendants'):
|
127
|
+
descendants = header.get_descendants()
|
128
|
+
logger.info(f"- All descendants: {len(descendants)}")
|
129
|
+
|
130
|
+
# Test filtered get_descendants
|
131
|
+
text_descendants = header.get_descendants('text')
|
132
|
+
logger.info(f"- Text descendants: {len(text_descendants)}")
|
133
|
+
else:
|
134
|
+
logger.error("❌ get_descendants method not found")
|
135
|
+
|
136
|
+
# Test find_all with recursive option
|
137
|
+
children_find = header.find_all('text', recursive=False)
|
138
|
+
logger.info(f"- Children via find_all(recursive=False): {len(children_find)}")
|
139
|
+
|
140
|
+
all_find = header.find_all('text', recursive=True)
|
141
|
+
logger.info(f"- All text via find_all(recursive=True): {len(all_find)}")
|
142
|
+
|
143
|
+
# SECTION 3: Text Extraction
|
144
|
+
logger.info("\n*** SECTION 3: Text Extraction ***")
|
145
|
+
|
146
|
+
# Test basic text extraction
|
147
|
+
if section_headers:
|
148
|
+
header = section_headers[0]
|
149
|
+
header_text = header.extract_text()
|
150
|
+
logger.info(f"Section header text: '{header_text[:50]}...'")
|
151
|
+
|
152
|
+
# Test extraction from hierarchy
|
153
|
+
if hasattr(header, 'get_children') and header.get_children():
|
154
|
+
child = header.get_children()[0]
|
155
|
+
child_text = child.extract_text()
|
156
|
+
logger.info(f"First child text: '{child_text[:50]}...'")
|
157
|
+
|
158
|
+
# Compare with standard extraction
|
159
|
+
# In a real document, the header's extract_text might include the child text too
|
160
|
+
combined_len = len(header_text) + len(child_text)
|
161
|
+
logger.info(f"Combined text length: {combined_len} characters")
|
162
|
+
|
163
|
+
# Test text extraction with and without OCR
|
164
|
+
# This is a simplified test - in a real scenario, we'd compare with known text
|
165
|
+
extracted_text = page.extract_text()
|
166
|
+
logger.info(f"Extracted page text: {len(extracted_text)} characters")
|
167
|
+
|
168
|
+
# SECTION 4: Integration with Other Models
|
169
|
+
logger.info("\n*** SECTION 4: Integration with Other Models ***")
|
170
|
+
|
171
|
+
# Store current regions for comparison
|
172
|
+
original_region_count = len(page._regions['detected'])
|
173
|
+
|
174
|
+
# Add YOLO analysis
|
175
|
+
page.analyze_layout(
|
176
|
+
model="yolo",
|
177
|
+
confidence=0.3,
|
178
|
+
existing="append" # Important: don't replace Docling regions
|
179
|
+
)
|
180
|
+
|
181
|
+
# Count new regions
|
182
|
+
all_regions = page._regions['detected']
|
183
|
+
logger.info(f"Total regions after adding YOLO: {len(all_regions)}")
|
184
|
+
logger.info(f"New regions added: {len(all_regions) - original_region_count}")
|
185
|
+
|
186
|
+
# Test filtering by model
|
187
|
+
yolo_regions = page.find_all('region[model=yolo]')
|
188
|
+
docling_regions_after = page.find_all('region[model=docling]')
|
189
|
+
|
190
|
+
logger.info(f"YOLO regions: {len(yolo_regions)}")
|
191
|
+
logger.info(f"Docling regions after YOLO: {len(docling_regions_after)}")
|
192
|
+
logger.info(f"Docling regions preserved: {len(docling_regions_after) == len(docling_regions)}")
|
193
|
+
|
194
|
+
# SECTION 5: Visualization
|
195
|
+
logger.info("\n*** SECTION 5: Visualization ***")
|
196
|
+
|
197
|
+
# Clear previous highlights
|
198
|
+
page.clear_highlights()
|
199
|
+
|
200
|
+
# Highlight different models and region types
|
201
|
+
if section_headers:
|
202
|
+
section_headers.highlight(
|
203
|
+
color=(1, 0, 0, 0.3),
|
204
|
+
label="Docling Headers",
|
205
|
+
include_attrs=['region_type']
|
206
|
+
)
|
207
|
+
|
208
|
+
if text_regions:
|
209
|
+
text_regions.highlight(
|
210
|
+
color=(0, 0, 1, 0.3),
|
211
|
+
label="Docling Text",
|
212
|
+
include_attrs=['region_type']
|
213
|
+
)
|
214
|
+
|
215
|
+
if yolo_regions:
|
216
|
+
yolo_regions.highlight(
|
217
|
+
color=(0, 1, 0, 0.3),
|
218
|
+
label="YOLO Regions",
|
219
|
+
include_attrs=['region_type']
|
220
|
+
)
|
221
|
+
|
222
|
+
# Save highlighted image
|
223
|
+
highlight_path = os.path.join(output_dir, "model_comparison.png")
|
224
|
+
page.save_image(highlight_path, labels=True)
|
225
|
+
logger.info(f"Saved visualization to {highlight_path}")
|
226
|
+
|
227
|
+
# Test hierarchical highlighting
|
228
|
+
if section_headers and len(section_headers) > 0:
|
229
|
+
# Clear previous highlights
|
230
|
+
page.clear_highlights()
|
231
|
+
|
232
|
+
# Select a section to visualize
|
233
|
+
header = section_headers[0]
|
234
|
+
|
235
|
+
# Highlight header
|
236
|
+
header.highlight(
|
237
|
+
color=(1, 0, 0, 0.3),
|
238
|
+
label="Section Header"
|
239
|
+
)
|
240
|
+
|
241
|
+
# Highlight direct children
|
242
|
+
if hasattr(header, 'get_children') and header.get_children():
|
243
|
+
children = header.get_children()
|
244
|
+
for child in children:
|
245
|
+
child.highlight(
|
246
|
+
color=(0, 1, 0, 0.3),
|
247
|
+
label="Direct Children",
|
248
|
+
include_attrs=['region_type']
|
249
|
+
)
|
250
|
+
|
251
|
+
# Save hierarchy visualization
|
252
|
+
hierarchy_path = os.path.join(output_dir, "hierarchy_visualization.png")
|
253
|
+
page.save_image(hierarchy_path, labels=True)
|
254
|
+
logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
|
255
|
+
|
256
|
+
# SECTION 6: Text Source Testing (OCR vs Native)
|
257
|
+
logger.info("\n*** SECTION 6: Text Source Testing ***")
|
258
|
+
|
259
|
+
# Find text elements by source
|
260
|
+
native_text = page.find_all('text[source=native]')
|
261
|
+
ocr_text = page.find_all('text[source=ocr]')
|
262
|
+
docling_text = page.find_all('region[model=docling][type=text]')
|
263
|
+
|
264
|
+
logger.info(f"Text elements by source:")
|
265
|
+
logger.info(f"- Native PDF text: {len(native_text)} elements")
|
266
|
+
logger.info(f"- OCR text: {len(ocr_text)} elements")
|
267
|
+
logger.info(f"- Docling text: {len(docling_text)} elements")
|
268
|
+
|
269
|
+
# Test specific text element queries
|
270
|
+
if native_text:
|
271
|
+
sample_native = native_text[0]
|
272
|
+
logger.info(f"Sample native text: '{sample_native.text[:30]}...'")
|
273
|
+
logger.info(f"Has source='native' attribute: {getattr(sample_native, 'source', None) == 'native'}")
|
274
|
+
|
275
|
+
# Test if text_content attribute is set
|
276
|
+
has_text_content = False
|
277
|
+
for region in docling_regions:
|
278
|
+
if hasattr(region, 'text_content') and region.text_content:
|
279
|
+
has_text_content = True
|
280
|
+
logger.info(f"Found region with text_content: '{region.text_content[:30]}...'")
|
281
|
+
break
|
282
|
+
|
283
|
+
logger.info(f"Regions have text_content attribute: {has_text_content}")
|
284
|
+
|
285
|
+
# Test if associated_text_elements is used
|
286
|
+
has_associated_text = False
|
287
|
+
for region in docling_regions:
|
288
|
+
if hasattr(region, 'associated_text_elements') and region.associated_text_elements:
|
289
|
+
has_associated_text = True
|
290
|
+
logger.info(f"Found region with associated_text_elements: {len(region.associated_text_elements)} elements")
|
291
|
+
break
|
292
|
+
|
293
|
+
logger.info(f"Regions have associated_text_elements: {has_associated_text}")
|
294
|
+
|
295
|
+
# Highlight different text sources
|
296
|
+
page.clear_highlights()
|
297
|
+
if native_text:
|
298
|
+
native_text.highlight(
|
299
|
+
color=(0, 0, 0.7, 0.3),
|
300
|
+
label="Native Text Elements",
|
301
|
+
include_attrs=['source']
|
302
|
+
)
|
303
|
+
|
304
|
+
if docling_text:
|
305
|
+
docling_text.highlight(
|
306
|
+
color=(0.7, 0, 0, 0.3),
|
307
|
+
label="Docling Text Elements",
|
308
|
+
include_attrs=['model']
|
309
|
+
)
|
310
|
+
|
311
|
+
# Save source visualization
|
312
|
+
source_path = os.path.join(output_dir, "text_sources.png")
|
313
|
+
page.save_image(source_path, labels=True)
|
314
|
+
logger.info(f"Saved text source visualization to {source_path}")
|
315
|
+
|
316
|
+
# Log final summary
|
317
|
+
print("\n*** TEST SUMMARY ***")
|
318
|
+
print(f"Total Docling regions: {len(docling_regions)}")
|
319
|
+
print(f"Hierarchical navigation: {'✅ Working' if has_children_attr else '❌ Not working'}")
|
320
|
+
print(f"Text extraction: {'✅ Working' if len(extracted_text) > 0 else '❌ Not working'}")
|
321
|
+
print(f"Multi-model integration: {'✅ Working' if len(yolo_regions) > 0 else '❌ Not working'}")
|
322
|
+
print(f"Test artifacts saved to: {output_dir}")
|
323
|
+
|
324
|
+
print("\nAll tests completed with no errors!")
|
325
|
+
logger.info("\nAll tests completed.")
|
@@ -0,0 +1,192 @@
|
|
1
|
+
"""
|
2
|
+
Example script demonstrating the Docling integration with Natural PDF.
|
3
|
+
|
4
|
+
This script uses Docling to analyze document layout and text structure,
|
5
|
+
with hierarchical relationships between document elements.
|
6
|
+
|
7
|
+
Usage:
|
8
|
+
python examples/docling_example.py [pdf_path]
|
9
|
+
|
10
|
+
Dependencies:
|
11
|
+
- torch
|
12
|
+
- transformers
|
13
|
+
- docling_core
|
14
|
+
"""
|
15
|
+
|
16
|
+
import os
|
17
|
+
import sys
|
18
|
+
import logging
|
19
|
+
from PIL import Image
|
20
|
+
|
21
|
+
# Import the library
|
22
|
+
from natural_pdf import PDF, configure_logging
|
23
|
+
|
24
|
+
# Get PDF path from command line or use demo file
|
25
|
+
if len(sys.argv) > 1:
|
26
|
+
pdf_path = sys.argv[1]
|
27
|
+
else:
|
28
|
+
# Default to a sample PDF in the pdfs directory
|
29
|
+
script_dir = os.path.dirname(os.path.abspath(__file__))
|
30
|
+
repo_root = os.path.dirname(script_dir)
|
31
|
+
pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
|
32
|
+
|
33
|
+
# Configure logging to see what's happening
|
34
|
+
configure_logging(level=logging.INFO)
|
35
|
+
logger = logging.getLogger("docling_example")
|
36
|
+
logger.setLevel(logging.INFO)
|
37
|
+
|
38
|
+
# Check if we can import required packages
|
39
|
+
try:
|
40
|
+
from docling.document_converter import DocumentConverter
|
41
|
+
except ImportError:
|
42
|
+
logger.error("Missing required packages. Please install:")
|
43
|
+
logger.error("pip install docling")
|
44
|
+
sys.exit(1)
|
45
|
+
|
46
|
+
# Docling will use the best available device automatically
|
47
|
+
|
48
|
+
# Load the PDF
|
49
|
+
pdf = PDF(pdf_path)
|
50
|
+
logger.info(f"Loaded PDF with {len(pdf.pages)} pages")
|
51
|
+
|
52
|
+
# Process the first page with Docling
|
53
|
+
page = pdf.pages[0]
|
54
|
+
|
55
|
+
# Run Docling analysis with the new docling model
|
56
|
+
logger.info("Running Docling analysis...")
|
57
|
+
page.analyze_layout(
|
58
|
+
model="docling",
|
59
|
+
confidence=0.2, # This parameter isn't used by Docling but kept for API consistency
|
60
|
+
model_params={
|
61
|
+
"verbose": True, # Enable detailed logging
|
62
|
+
# Any other parameters would be passed directly to DocumentConverter
|
63
|
+
}
|
64
|
+
)
|
65
|
+
|
66
|
+
# If we have a docling_document, we succeeded
|
67
|
+
if hasattr(page, 'docling_document'):
|
68
|
+
logger.info("Docling analysis complete!")
|
69
|
+
|
70
|
+
# Find all detected regions by model
|
71
|
+
docling_regions = page.find_all('region[model=docling]')
|
72
|
+
logger.info(f"Found {len(docling_regions)} Docling regions")
|
73
|
+
|
74
|
+
# Get regions by type
|
75
|
+
section_headers = page.find_all('section-header')
|
76
|
+
plain_text = page.find_all('text[model=docling]')
|
77
|
+
figures = page.find_all('figure[model=docling]')
|
78
|
+
|
79
|
+
logger.info(f"Found {len(section_headers)} section headers")
|
80
|
+
logger.info(f"Found {len(plain_text)} text blocks")
|
81
|
+
logger.info(f"Found {len(figures)} figures")
|
82
|
+
|
83
|
+
# Print hierarchy information
|
84
|
+
root_regions = [r for r in docling_regions if not r.parent_region]
|
85
|
+
logger.info(f"Document has {len(root_regions)} top-level regions")
|
86
|
+
|
87
|
+
# Print text from each section header and its children
|
88
|
+
for i, header in enumerate(section_headers):
|
89
|
+
logger.info(f"\nSection {i+1}: {header.extract_text()}")
|
90
|
+
|
91
|
+
# Get direct children of this header
|
92
|
+
children = header.get_children()
|
93
|
+
if children:
|
94
|
+
logger.info(f" - Has {len(children)} direct children")
|
95
|
+
for j, child in enumerate(children[:2]): # Show first 2 children
|
96
|
+
child_text = child.extract_text()
|
97
|
+
if len(child_text) > 50:
|
98
|
+
child_text = child_text[:50] + "..."
|
99
|
+
logger.info(f" - Child {j+1}: {child.region_type} - {child_text}")
|
100
|
+
|
101
|
+
if len(children) > 2:
|
102
|
+
logger.info(f" - And {len(children) - 2} more children...")
|
103
|
+
|
104
|
+
# Highlight different types of regions
|
105
|
+
page.clear_highlights()
|
106
|
+
|
107
|
+
# Highlight section headers in red
|
108
|
+
if section_headers:
|
109
|
+
section_headers.highlight(
|
110
|
+
color=(1, 0, 0, 0.3),
|
111
|
+
label="Section Headers",
|
112
|
+
include_attrs=['confidence']
|
113
|
+
)
|
114
|
+
|
115
|
+
# Highlight text blocks in blue
|
116
|
+
if plain_text:
|
117
|
+
plain_text.highlight(
|
118
|
+
color=(0, 0, 1, 0.3),
|
119
|
+
label="Text Blocks"
|
120
|
+
)
|
121
|
+
|
122
|
+
# Highlight figures in green
|
123
|
+
if figures:
|
124
|
+
figures.highlight(
|
125
|
+
color=(0, 1, 0, 0.3),
|
126
|
+
label="Figures"
|
127
|
+
)
|
128
|
+
|
129
|
+
# Demonstrate hierarchical extraction
|
130
|
+
if section_headers:
|
131
|
+
# Get the first section header
|
132
|
+
header = section_headers[0]
|
133
|
+
|
134
|
+
# Extract all text recursively from this section and its children
|
135
|
+
all_text = header.extract_text()
|
136
|
+
logger.info(f"\nExtracted text from first section: {all_text[:100]}...")
|
137
|
+
|
138
|
+
# Find all text elements recursively within this section
|
139
|
+
section_text_elems = header.find_all('text', recursive=True)
|
140
|
+
logger.info(f"Found {len(section_text_elems)} text elements in the section hierarchy")
|
141
|
+
|
142
|
+
# Test recursive searching
|
143
|
+
if hasattr(header, 'get_descendants'):
|
144
|
+
descendants = header.get_descendants()
|
145
|
+
logger.info(f"Section has {len(descendants)} total descendants")
|
146
|
+
|
147
|
+
# Save highlighted image
|
148
|
+
output_path = os.path.join("output", "docling_analysis.png")
|
149
|
+
os.makedirs("output", exist_ok=True)
|
150
|
+
|
151
|
+
logger.info(f"Saving visualization to {output_path}")
|
152
|
+
page.save_image(output_path, labels=True)
|
153
|
+
|
154
|
+
# Create a more detailed visualization showing the hierarchy
|
155
|
+
if section_headers and len(section_headers) > 0:
|
156
|
+
# Create a new visualization from scratch
|
157
|
+
page.clear_highlights()
|
158
|
+
|
159
|
+
# Get the first section to visualize its hierarchy
|
160
|
+
section = section_headers[0]
|
161
|
+
|
162
|
+
# Highlight the section header
|
163
|
+
section.highlight(
|
164
|
+
color=(1, 0, 0, 0.3),
|
165
|
+
label="Section Header"
|
166
|
+
)
|
167
|
+
|
168
|
+
# Highlight its immediate children
|
169
|
+
children = section.get_children()
|
170
|
+
for child in children:
|
171
|
+
child.highlight(
|
172
|
+
color=(0, 0.7, 0, 0.3),
|
173
|
+
label="Direct Children",
|
174
|
+
include_attrs=['region_type']
|
175
|
+
)
|
176
|
+
|
177
|
+
# Highlight grandchildren differently
|
178
|
+
grandchildren = child.get_children()
|
179
|
+
for grandchild in grandchildren:
|
180
|
+
grandchild.highlight(
|
181
|
+
color=(0, 0, 0.7, 0.3),
|
182
|
+
label="Grandchildren",
|
183
|
+
include_attrs=['region_type']
|
184
|
+
)
|
185
|
+
|
186
|
+
# Save hierarchy visualization
|
187
|
+
hierarchy_path = os.path.join("output", "docling_hierarchy.png")
|
188
|
+
page.save_image(hierarchy_path, labels=True)
|
189
|
+
logger.info(f"Saved hierarchy visualization to {hierarchy_path}")
|
190
|
+
|
191
|
+
else:
|
192
|
+
logger.error("Docling analysis failed. Check that you have the required packages installed.")
|