natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,74 +23,6 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
23
  from natural_pdf import PDF, configure_logging
24
24
  import logging
25
25
 
26
- def pdfplumber_qa(pdf_path, question, debug=False):
27
- """Run QA using direct pdfplumber code similar to the original example."""
28
- # Open PDF
29
- pdf = pdfplumber.open(pdf_path)
30
- page = pdf.pages[0]
31
-
32
- # Get image
33
- image = page.to_image(resolution=300).original
34
-
35
- # Extract words
36
- words = page.extract_words()
37
-
38
- # Build word boxes in the expected format
39
- def get_box(word):
40
- return [
41
- word['text'],
42
- [int(word["x0"]), int(word["top"]), int(word["x1"]), int(word["bottom"])]
43
- ]
44
-
45
- word_boxes = [get_box(word) for word in words]
46
-
47
- # Debug visualization
48
- if debug:
49
- os.makedirs("output", exist_ok=True)
50
-
51
- # Save image
52
- image.save("output/direct_qa_image.png")
53
-
54
- # Save visualization
55
- vis_image = image.copy()
56
- draw = ImageDraw.Draw(vis_image)
57
-
58
- for i, (text, box) in enumerate(word_boxes):
59
- x0, y0, x1, y1 = box
60
- draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0), width=2)
61
- draw.text((x0, y0), str(i), fill=(255, 0, 0))
62
-
63
- vis_image.save("output/direct_qa_boxes.png")
64
-
65
- # Use transformers pipeline
66
- try:
67
- from transformers import pipeline
68
-
69
- pipe = pipeline("document-question-answering", model="impira/layoutlm-document-qa")
70
-
71
- # Run query
72
- query = { "image": image, "question": question, "word_boxes": word_boxes }
73
-
74
- result = pipe(query)[0]
75
-
76
- # Create result dictionary similar to Natural PDF's format
77
- return {
78
- "answer": result.get("answer", ""),
79
- "confidence": result.get("score", 0.0),
80
- "start": result.get("start", 0),
81
- "end": result.get("end", 0),
82
- "found": True if result.get("answer") else False
83
- }
84
-
85
- except Exception as e:
86
- print(f"Error in direct QA: {e}")
87
- return {
88
- "answer": "",
89
- "confidence": 0.0,
90
- "error": str(e),
91
- "found": False
92
- }
93
-
94
26
  def main():
95
27
  parser = argparse.ArgumentParser(description="Direct Document QA Example")
96
28
  parser.add_argument("pdf_path", nargs="?", default="../pdfs/0500000US42001.pdf",
@@ -99,8 +31,6 @@ def main():
99
31
  help="Question to ask about the document")
100
32
  parser.add_argument("--debug", action="store_true",
101
33
  help="Save debug information for troubleshooting")
102
- parser.add_argument("--compare", action="store_true",
103
- help="Compare with Natural PDF implementation")
104
34
 
105
35
  args = parser.parse_args()
106
36
 
@@ -113,53 +43,29 @@ def main():
113
43
  print(f"Document: {args.pdf_path}")
114
44
  print(f"Question: {args.question}")
115
45
 
116
- # Run direct pdfplumber QA
117
- print("\n=== Direct pdfplumber implementation ===")
118
- result = pdfplumber_qa(args.pdf_path, args.question, debug=args.debug)
46
+ print("\n=== Natural PDF implementation ===")
47
+
48
+ # Use Natural PDF
49
+ pdf = PDF(args.pdf_path)
50
+ page = pdf.pages[0]
51
+
52
+ # Ask the question
53
+ result = page.ask(args.question, debug=args.debug)
119
54
 
120
55
  if result.get("found", False):
121
56
  print(f"Answer: {result['answer']}")
122
57
  print(f"Confidence: {result['confidence']:.2f}")
58
+
59
+ # Highlight the answer
60
+ if result.get("source_elements"):
61
+ for element in result["source_elements"]:
62
+ element.highlight(color=(1, 0.5, 0, 0.5))
63
+
64
+ # Save the image
65
+ page.save_image("output/natural_pdf_answer.png")
66
+ print("Saved highlighted answer to output/natural_pdf_answer.png")
123
67
  else:
124
68
  print(f"No answer found: {result.get('error', '')}")
125
69
 
126
- # Compare with Natural PDF if requested
127
- if args.compare:
128
- print("\n=== Natural PDF implementation ===")
129
-
130
- # Use Natural PDF
131
- pdf = PDF(args.pdf_path)
132
- page = pdf.pages[0]
133
-
134
- # Ask the question
135
- natural_result = page.ask(args.question, debug=args.debug)
136
-
137
- if natural_result.get("found", False):
138
- print(f"Answer: {natural_result['answer']}")
139
- print(f"Confidence: {natural_result['confidence']:.2f}")
140
-
141
- # Highlight the answer
142
- if natural_result.get("source_elements"):
143
- for element in natural_result["source_elements"]:
144
- element.highlight(color=(1, 0.5, 0, 0.5))
145
-
146
- # Save the image
147
- page.save_image("output/natural_pdf_answer.png")
148
- print("Saved highlighted answer to output/natural_pdf_answer.png")
149
- else:
150
- print(f"No answer found: {natural_result.get('error', '')}")
151
-
152
- # Compare results
153
- if result.get("found", False) and natural_result.get("found", False):
154
- print("\n=== Comparison ===")
155
- print(f"Direct answer: '{result['answer']}' (confidence: {result['confidence']:.2f})")
156
- print(f"Natural PDF answer: '{natural_result['answer']}' (confidence: {natural_result['confidence']:.2f})")
157
-
158
- # Calculate similarity
159
- if result['answer'] == natural_result['answer']:
160
- print("Results match exactly!")
161
- else:
162
- print("Results differ.")
163
-
164
70
  if __name__ == "__main__":
165
71
  main()
@@ -0,0 +1,325 @@
"""
Comprehensive test of the Docling integration with Natural PDF.

Runs the first page of a PDF through the "docling" layout model and
exercises, in order:
1. Basic Docling layout detection
2. Hierarchical navigation between regions
3. Text extraction
4. Integration with a second layout model (YOLO)
5. Visualization of the detected regions
6. Text source queries (native / OCR / Docling)

Problems detected along the way are collected and reported in the final
summary instead of being followed by an unconditional success message.

Usage:
    python examples/docling_comprehensive_test.py [pdf_path]

Dependencies:
    - natural_pdf
    - docling (pip install docling)
"""

import os
import sys
import time
import logging
from pathlib import Path

# Import the library
from natural_pdf import PDF, configure_logging

# Configure detailed logging for debugging
configure_logging(level=logging.INFO)
logger = logging.getLogger("docling_test")
logger.setLevel(logging.INFO)

# Problems found while the script runs; consulted by the final summary
problems = []


def report_problem(message):
    """Log *message* as an error and remember it for the final summary."""
    logger.error(message)
    problems.append(message)


# Get PDF path from command line or use demo file
if len(sys.argv) > 1:
    pdf_path = sys.argv[1]
else:
    # Default to a sample PDF in the pdfs directory
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")

# Check if required packages are installed
try:
    # Imported only to verify that Docling is available; the name is not used
    from docling.document_converter import DocumentConverter  # noqa: F401
except ImportError:
    logger.error("Missing required packages. Please install with:")
    logger.error("pip install docling")
    sys.exit(1)

# Create output directory for test results
output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "output", "docling_tests")
os.makedirs(output_dir, exist_ok=True)

# Load the PDF
logger.info(f"Loading PDF: {pdf_path}")
pdf = PDF(pdf_path)
logger.info(f"PDF has {len(pdf.pages)} pages")

# Process only the first page for tests
page = pdf.pages[0]

# SECTION 1: Basic Docling Detection
logger.info("\n*** SECTION 1: Basic Docling Detection ***")

# Time the Docling analysis (perf_counter is monotonic, unlike time.time)
start_time = time.perf_counter()
page.analyze_layout(
    model="docling",
    confidence=0.2,  # This parameter isn't used by Docling but kept for API consistency
    model_params={
        "verbose": True
        # Any other parameters would be passed directly to DocumentConverter
    }
)
docling_time = time.perf_counter() - start_time
logger.info(f"Docling analysis completed in {docling_time:.2f} seconds")

# Verify that docling_document was created
if hasattr(page, 'docling_document'):
    logger.info("✅ Docling document created successfully")
else:
    report_problem("❌ Docling document not created")

# Count detected regions
docling_regions = page.find_all('region[model=docling]')
logger.info(f"Found {len(docling_regions)} total Docling regions")

# Get regions by type
section_headers = page.find_all('section-header')
text_regions = page.find_all('region[model=docling][type=text]')
figures = page.find_all('region[model=docling][type=figure]')

logger.info(f"- Section headers: {len(section_headers)}")
logger.info(f"- Text regions: {len(text_regions)}")
logger.info(f"- Figures: {len(figures)}")

# SECTION 2: Hierarchical Navigation
logger.info("\n*** SECTION 2: Hierarchical Navigation ***")

# Test if regions have child_regions attribute.
# Note: all() is True for an empty collection; the summary accounts for that.
has_children_attr = all(hasattr(region, 'child_regions') for region in docling_regions)
logger.info(f"All regions have child_regions attribute: {has_children_attr}")

# Count top-level regions (no parent); a missing attribute counts as "no parent"
top_level_regions = [r for r in docling_regions if not getattr(r, 'parent_region', None)]
logger.info(f"Top-level regions: {len(top_level_regions)}")

# Test child traversal for section headers
if section_headers:
    header = section_headers[0]
    logger.info(f"Testing section header: '{header.extract_text()[:30]}...'")

    # Test get_children method
    if hasattr(header, 'get_children'):
        children = header.get_children()
        logger.info(f"- Direct children: {len(children)}")

        # Test filtered get_children
        text_children = header.get_children('text')
        logger.info(f"- Direct text children: {len(text_children)}")
    else:
        report_problem("❌ get_children method not found")

    # Test get_descendants method
    if hasattr(header, 'get_descendants'):
        descendants = header.get_descendants()
        logger.info(f"- All descendants: {len(descendants)}")

        # Test filtered get_descendants
        text_descendants = header.get_descendants('text')
        logger.info(f"- Text descendants: {len(text_descendants)}")
    else:
        report_problem("❌ get_descendants method not found")

    # Test find_all with recursive option
    children_find = header.find_all('text', recursive=False)
    logger.info(f"- Children via find_all(recursive=False): {len(children_find)}")

    all_find = header.find_all('text', recursive=True)
    logger.info(f"- All text via find_all(recursive=True): {len(all_find)}")

# SECTION 3: Text Extraction
logger.info("\n*** SECTION 3: Text Extraction ***")

# Test basic text extraction
if section_headers:
    header = section_headers[0]
    header_text = header.extract_text()
    logger.info(f"Section header text: '{header_text[:50]}...'")

    # Test extraction from hierarchy (fetch the children once and reuse them)
    children = header.get_children() if hasattr(header, 'get_children') else []
    if children:
        child = children[0]
        child_text = child.extract_text()
        logger.info(f"First child text: '{child_text[:50]}...'")

        # Compare with standard extraction
        # In a real document, the header's extract_text might include the child text too
        combined_len = len(header_text) + len(child_text)
        logger.info(f"Combined text length: {combined_len} characters")

# Test text extraction with and without OCR
# This is a simplified test - in a real scenario, we'd compare with known text
extracted_text = page.extract_text()
logger.info(f"Extracted page text: {len(extracted_text)} characters")

# SECTION 4: Integration with Other Models
logger.info("\n*** SECTION 4: Integration with Other Models ***")

# Store current regions for comparison
# NOTE(review): reads the private `_regions` mapping; prefer a public accessor if one exists
original_region_count = len(page._regions['detected'])

# Add YOLO analysis
page.analyze_layout(
    model="yolo",
    confidence=0.3,
    existing="append"  # Important: don't replace Docling regions
)

# Count new regions
all_regions = page._regions['detected']
logger.info(f"Total regions after adding YOLO: {len(all_regions)}")
logger.info(f"New regions added: {len(all_regions) - original_region_count}")

# Test filtering by model
yolo_regions = page.find_all('region[model=yolo]')
docling_regions_after = page.find_all('region[model=docling]')

logger.info(f"YOLO regions: {len(yolo_regions)}")
logger.info(f"Docling regions after YOLO: {len(docling_regions_after)}")
logger.info(f"Docling regions preserved: {len(docling_regions_after) == len(docling_regions)}")

# SECTION 5: Visualization
logger.info("\n*** SECTION 5: Visualization ***")

# Clear previous highlights
page.clear_highlights()

# Highlight different models and region types
if section_headers:
    section_headers.highlight(
        color=(1, 0, 0, 0.3),
        label="Docling Headers",
        include_attrs=['region_type']
    )

if text_regions:
    text_regions.highlight(
        color=(0, 0, 1, 0.3),
        label="Docling Text",
        include_attrs=['region_type']
    )

if yolo_regions:
    yolo_regions.highlight(
        color=(0, 1, 0, 0.3),
        label="YOLO Regions",
        include_attrs=['region_type']
    )

# Save highlighted image
highlight_path = os.path.join(output_dir, "model_comparison.png")
page.save_image(highlight_path, labels=True)
logger.info(f"Saved visualization to {highlight_path}")

# Test hierarchical highlighting
if section_headers:
    # Clear previous highlights
    page.clear_highlights()

    # Select a section to visualize
    header = section_headers[0]

    # Highlight header
    header.highlight(
        color=(1, 0, 0, 0.3),
        label="Section Header"
    )

    # Highlight direct children (fetched once and reused)
    children = header.get_children() if hasattr(header, 'get_children') else []
    if children:
        for child in children:
            child.highlight(
                color=(0, 1, 0, 0.3),
                label="Direct Children",
                include_attrs=['region_type']
            )

    # Save hierarchy visualization
    hierarchy_path = os.path.join(output_dir, "hierarchy_visualization.png")
    page.save_image(hierarchy_path, labels=True)
    logger.info(f"Saved hierarchy visualization to {hierarchy_path}")

# SECTION 6: Text Source Testing (OCR vs Native)
logger.info("\n*** SECTION 6: Text Source Testing ***")

# Find text elements by source
native_text = page.find_all('text[source=native]')
ocr_text = page.find_all('text[source=ocr]')
docling_text = page.find_all('region[model=docling][type=text]')

logger.info("Text elements by source:")
logger.info(f"- Native PDF text: {len(native_text)} elements")
logger.info(f"- OCR text: {len(ocr_text)} elements")
logger.info(f"- Docling text: {len(docling_text)} elements")

# Test specific text element queries
if native_text:
    sample_native = native_text[0]
    logger.info(f"Sample native text: '{sample_native.text[:30]}...'")
    logger.info(f"Has source='native' attribute: {getattr(sample_native, 'source', None) == 'native'}")

# Test if text_content attribute is set
has_text_content = False
for region in docling_regions:
    if hasattr(region, 'text_content') and region.text_content:
        has_text_content = True
        logger.info(f"Found region with text_content: '{region.text_content[:30]}...'")
        break

logger.info(f"Regions have text_content attribute: {has_text_content}")

# Test if associated_text_elements is used
has_associated_text = False
for region in docling_regions:
    if hasattr(region, 'associated_text_elements') and region.associated_text_elements:
        has_associated_text = True
        logger.info(f"Found region with associated_text_elements: {len(region.associated_text_elements)} elements")
        break

logger.info(f"Regions have associated_text_elements: {has_associated_text}")

# Highlight different text sources
page.clear_highlights()
if native_text:
    native_text.highlight(
        color=(0, 0, 0.7, 0.3),
        label="Native Text Elements",
        include_attrs=['source']
    )

if docling_text:
    docling_text.highlight(
        color=(0.7, 0, 0, 0.3),
        label="Docling Text Elements",
        include_attrs=['model']
    )

# Save source visualization
source_path = os.path.join(output_dir, "text_sources.png")
page.save_image(source_path, labels=True)
logger.info(f"Saved text source visualization to {source_path}")

# Log final summary
# Hierarchy only counts as working when at least one region was actually checked;
# all() over an empty collection would otherwise report a vacuous success.
hierarchy_ok = len(docling_regions) > 0 and has_children_attr
extraction_ok = len(extracted_text) > 0
multi_model_ok = len(yolo_regions) > 0

print("\n*** TEST SUMMARY ***")
print(f"Total Docling regions: {len(docling_regions)}")
print(f"Hierarchical navigation: {'✅ Working' if hierarchy_ok else '❌ Not working'}")
print(f"Text extraction: {'✅ Working' if extraction_ok else '❌ Not working'}")
print(f"Multi-model integration: {'✅ Working' if multi_model_ok else '❌ Not working'}")
print(f"Test artifacts saved to: {output_dir}")

# Fold the summary checks into the problem list (already printed above, so not logged again)
for check_name, passed in (
    ("Hierarchical navigation", hierarchy_ok),
    ("Text extraction", extraction_ok),
    ("Multi-model integration", multi_model_ok),
):
    if not passed:
        problems.append(f"{check_name} check failed")

# Only claim success when nothing was flagged during the run
if problems:
    print(f"\nTests completed with {len(problems)} problem(s):")
    for problem in problems:
        print(f"- {problem}")
else:
    print("\nAll tests completed with no errors!")
logger.info("\nAll tests completed.")
@@ -0,0 +1,192 @@
"""
Docling integration example for Natural PDF.

Analyzes the first page of a PDF with the "docling" layout model, logs the
detected regions and the parent/child relationships between them, and saves
highlighted images into an "output" directory relative to the current
working directory.

Usage:
    python examples/docling_example.py [pdf_path]

Dependencies:
    - natural_pdf
    - docling (pip install docling)
"""

import os
import sys
import logging
from PIL import Image

# Import the library
from natural_pdf import PDF, configure_logging

# Input PDF: first command-line argument, otherwise a sample PDF from the pdfs directory
if len(sys.argv) <= 1:
    script_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(script_dir)
    pdf_path = os.path.join(repo_root, "pdfs", "01-practice.pdf")
else:
    pdf_path = sys.argv[1]

# Logging setup so the progress of the analysis is visible
configure_logging(level=logging.INFO)
logger = logging.getLogger("docling_example")
logger.setLevel(logging.INFO)

# Bail out early when Docling itself cannot be imported
try:
    from docling.document_converter import DocumentConverter
except ImportError:
    logger.error("Missing required packages. Please install:")
    logger.error("pip install docling")
    sys.exit(1)

# Docling will use the best available device automatically

# Open the document
pdf = PDF(pdf_path)
logger.info(f"Loaded PDF with {len(pdf.pages)} pages")

# Only the first page is analyzed
page = pdf.pages[0]

# Layout analysis with the docling model
logger.info("Running Docling analysis...")
page.analyze_layout(
    model="docling",
    # This parameter isn't used by Docling but kept for API consistency
    confidence=0.2,
    # verbose enables detailed logging; any other parameters would be
    # passed directly to DocumentConverter
    model_params={"verbose": True},
)

# The presence of page.docling_document is the success signal
if not hasattr(page, 'docling_document'):
    logger.error("Docling analysis failed. Check that you have the required packages installed.")
else:
    logger.info("Docling analysis complete!")

    # Everything the docling model detected
    docling_regions = page.find_all('region[model=docling]')
    logger.info(f"Found {len(docling_regions)} Docling regions")

    # Per-type selections
    section_headers = page.find_all('section-header')
    plain_text = page.find_all('text[model=docling]')
    figures = page.find_all('figure[model=docling]')

    logger.info(f"Found {len(section_headers)} section headers")
    logger.info(f"Found {len(plain_text)} text blocks")
    logger.info(f"Found {len(figures)} figures")

    # Regions without a parent form the top level of the hierarchy
    root_regions = [region for region in docling_regions if not region.parent_region]
    logger.info(f"Document has {len(root_regions)} top-level regions")

    # Walk each section header and preview its first two children
    for i, header in enumerate(section_headers):
        logger.info(f"\nSection {i+1}: {header.extract_text()}")

        children = header.get_children()
        if not children:
            continue

        logger.info(f" - Has {len(children)} direct children")
        for j, child in enumerate(children[:2]):
            child_text = child.extract_text()
            # Keep the log line short
            if len(child_text) > 50:
                child_text = child_text[:50] + "..."
            logger.info(f" - Child {j+1}: {child.region_type} - {child_text}")

        if len(children) > 2:
            logger.info(f" - And {len(children) - 2} more children...")

    # Colour-code the region types: headers red, text blue, figures green
    page.clear_highlights()
    highlight_plan = (
        (
            section_headers,
            {"color": (1, 0, 0, 0.3), "label": "Section Headers", "include_attrs": ['confidence']},
        ),
        (plain_text, {"color": (0, 0, 1, 0.3), "label": "Text Blocks"}),
        (figures, {"color": (0, 1, 0, 0.3), "label": "Figures"}),
    )
    for collection, options in highlight_plan:
        if collection:
            collection.highlight(**options)

    # Hierarchical extraction, demonstrated on the first section header
    if section_headers:
        header = section_headers[0]

        all_text = header.extract_text()
        logger.info(f"\nExtracted text from first section: {all_text[:100]}...")

        # Recursive search for text elements below the header
        section_text_elems = header.find_all('text', recursive=True)
        logger.info(f"Found {len(section_text_elems)} text elements in the section hierarchy")

        # get_descendants is optional, so probe for it first
        if hasattr(header, 'get_descendants'):
            descendants = header.get_descendants()
            logger.info(f"Section has {len(descendants)} total descendants")

    # Write the colour-coded overview image
    os.makedirs("output", exist_ok=True)
    output_path = os.path.join("output", "docling_analysis.png")

    logger.info(f"Saving visualization to {output_path}")
    page.save_image(output_path, labels=True)

    # Second image: the first section with its children and grandchildren
    if section_headers and len(section_headers) > 0:
        # Start from a clean slate
        page.clear_highlights()

        section = section_headers[0]
        section.highlight(color=(1, 0, 0, 0.3), label="Section Header")

        for child in section.get_children():
            child.highlight(
                color=(0, 0.7, 0, 0.3),
                label="Direct Children",
                include_attrs=['region_type'],
            )

            # One level further down gets its own colour
            for grandchild in child.get_children():
                grandchild.highlight(
                    color=(0, 0, 0.7, 0.3),
                    label="Grandchildren",
                    include_attrs=['region_type'],
                )

        hierarchy_path = os.path.join("output", "docling_hierarchy.png")
        page.save_image(hierarchy_path, labels=True)
        logger.info(f"Saved hierarchy visualization to {hierarchy_path}")