natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
natural_pdf/core/pdf.py CHANGED
@@ -1,5 +1,9 @@
1
1
  import pdfplumber
2
2
  import logging
3
+ import tempfile
4
+ import os
5
+ import re
6
+ import urllib.request
3
7
  from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
4
8
 
5
9
  from natural_pdf.core.page import Page
@@ -28,7 +32,7 @@ class PDF:
28
32
  with improved selection, navigation, and extraction capabilities.
29
33
  """
30
34
 
31
- def __init__(self, path: str, reading_order: bool = True,
35
+ def __init__(self, path_or_url: str, reading_order: bool = True,
32
36
  ocr: Optional[Union[bool, str, List, Dict]] = None,
33
37
  ocr_engine: Optional[Union[str, Any]] = None,
34
38
  font_attrs: Optional[List[str]] = None,
@@ -37,7 +41,7 @@ class PDF:
37
41
  Initialize the enhanced PDF object.
38
42
 
39
43
  Args:
40
- path: Path to the PDF file
44
+ path_or_url: Path to the PDF file or a URL to a PDF
41
45
  reading_order: Whether to use natural reading order
42
46
  ocr: OCR configuration:
43
47
  - None or False: OCR disabled
@@ -58,6 +62,40 @@ class PDF:
58
62
  True: Spaces are part of words, better for multi-word searching
59
63
  False: Break text at spaces, each word is separate (legacy behavior)
60
64
  """
65
+ # Check if the input is a URL
66
+ is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
67
+
68
+ # Initialize path-related attributes
69
+ self._original_path = path_or_url
70
+ self._temp_file = None
71
+
72
+ if is_url:
73
+ logger.info(f"Downloading PDF from URL: {path_or_url}")
74
+ try:
75
+ # Create a temporary file to store the downloaded PDF
76
+ self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
77
+
78
+ # Download the PDF
79
+ with urllib.request.urlopen(path_or_url) as response:
80
+ self._temp_file.write(response.read())
81
+ self._temp_file.flush()
82
+ self._temp_file.close()
83
+
84
+ # Use the temporary file path
85
+ path = self._temp_file.name
86
+ logger.info(f"PDF downloaded to temporary file: {path}")
87
+ except Exception as e:
88
+ if self._temp_file and hasattr(self._temp_file, 'name'):
89
+ try:
90
+ os.unlink(self._temp_file.name)
91
+ except:
92
+ pass
93
+ logger.error(f"Failed to download PDF from URL: {e}")
94
+ raise ValueError(f"Failed to download PDF from URL: {e}")
95
+ else:
96
+ # Use the provided path directly
97
+ path = path_or_url
98
+
61
99
  logger.info(f"Initializing PDF from {path}")
62
100
  logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
63
101
 
@@ -72,8 +110,13 @@ class PDF:
72
110
  if HAS_OCR_ENGINES:
73
111
  # Handle OCR engine selection
74
112
  if ocr_engine is None:
75
- # Use default engine (EasyOCR)
76
- self._ocr_engine = EasyOCREngine()
113
+ # Use default engine (PaddleOCR)
114
+ try:
115
+ self._ocr_engine = PaddleOCREngine()
116
+ except (ImportError, ValueError) as e:
117
+ logger.warning(f"PaddleOCR engine could not be loaded: {e}")
118
+ logger.warning("Falling back to EasyOCR engine.")
119
+ self._ocr_engine = EasyOCREngine()
77
120
  elif isinstance(ocr_engine, str):
78
121
  # String-based engine selection
79
122
  try:
@@ -481,74 +524,51 @@ class PDF:
481
524
  **kwargs: Additional parameters passed to the QA engine
482
525
 
483
526
  Returns:
484
- Dictionary with answer and confidence
527
+ A dictionary containing the answer, confidence, and other metadata, or None if no page yields an answer.
528
+ Result will have an 'answer' key containing the answer text.
485
529
  """
486
- try:
487
- from natural_pdf.qa import get_qa_engine
488
-
489
- # Initialize or get QA engine
490
- qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
491
-
492
- # Determine which pages to query
493
- if pages is None:
494
- # Query all pages by default, prioritizing first few pages
495
- target_pages = list(range(min(10, len(self.pages))))
496
- elif isinstance(pages, int):
497
- # Single page
498
- target_pages = [pages]
499
- elif isinstance(pages, (list, range)):
500
- # List or range of pages
501
- target_pages = pages
502
- else:
503
- raise ValueError(f"Invalid pages parameter: {pages}")
504
-
505
- # Actually query each page and gather results
506
- results = []
507
- for page_idx in target_pages:
508
- if 0 <= page_idx < len(self.pages):
509
- page = self.pages[page_idx]
510
- page_result = qa_engine.ask_pdf_page(
511
- page=page,
512
- question=question,
513
- min_confidence=min_confidence,
514
- **kwargs
515
- )
516
-
517
- # Add to results if it found an answer
518
- if page_result.get("found", False):
519
- results.append(page_result)
520
-
521
- # Sort results by confidence
522
- results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
523
-
524
- # Return the best result, or an empty result if none found
525
- if results:
526
- return results[0]
527
- else:
528
- return {
529
- "answer": "",
530
- "confidence": 0.0,
531
- "found": False,
532
- "message": "No answer found in document"
533
- }
530
+ from natural_pdf.qa import get_qa_engine
531
+
532
+ # Initialize or get QA engine
533
+ qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
534
+
535
+ # Determine which pages to query
536
+ if pages is None:
537
+ target_pages = list(range(len(self.pages)))
538
+ elif isinstance(pages, int):
539
+ # Single page
540
+ target_pages = [pages]
541
+ elif isinstance(pages, (list, range)):
542
+ # List or range of pages
543
+ target_pages = pages
544
+ else:
545
+ raise ValueError(f"Invalid pages parameter: {pages}")
546
+
547
+ # Actually query each page and gather results
548
+ results = []
549
+ for page_idx in target_pages:
550
+ if 0 <= page_idx < len(self.pages):
551
+ page = self.pages[page_idx]
552
+ page_result = qa_engine.ask_pdf_page(
553
+ page=page,
554
+ question=question,
555
+ min_confidence=min_confidence,
556
+ **kwargs
557
+ )
558
+
559
+ # Add to results if it found an answer
560
+ if page_result.get("found", False):
561
+ results.append(page_result)
562
+
563
+ # Sort results by confidence
564
+ results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
565
+
566
+ # Return the best result, or None if no page produced an answer
567
+ if results:
568
+ return results[0]
569
+ else:
570
+ return None
534
571
 
535
- except ImportError as e:
536
- logger.warning(f"QA functionality not available: {e}")
537
- return {
538
- "answer": "",
539
- "confidence": 0.0,
540
- "error": "QA functionality not available",
541
- "found": False
542
- }
543
- except Exception as e:
544
- logger.error(f"Error in document QA: {e}")
545
- return {
546
- "answer": "",
547
- "confidence": 0.0,
548
- "error": str(e),
549
- "found": False
550
- }
551
-
552
572
  def __len__(self) -> int:
553
573
  """Return the number of pages in the PDF."""
554
574
  return len(self.pages)
@@ -558,10 +578,21 @@ class PDF:
558
578
  return self.pages[key]
559
579
 
560
580
  def close(self):
561
- """Close the underlying PDF file."""
581
+ """Close the underlying PDF file and clean up any temporary files."""
562
582
  if hasattr(self, '_pdf') and self._pdf is not None:
563
583
  self._pdf.close()
564
584
  self._pdf = None
585
+
586
+ # Clean up temporary file if it exists
587
+ if hasattr(self, '_temp_file') and self._temp_file is not None:
588
+ try:
589
+ if os.path.exists(self._temp_file.name):
590
+ os.unlink(self._temp_file.name)
591
+ logger.debug(f"Removed temporary PDF file: {self._temp_file.name}")
592
+ except Exception as e:
593
+ logger.warning(f"Failed to clean up temporary PDF file: {e}")
594
+ finally:
595
+ self._temp_file = None
565
596
 
566
597
  def __enter__(self):
567
598
  """Context manager entry."""
@@ -18,7 +18,7 @@ class Region:
18
18
  Represents a rectangular region on a page.
19
19
  """
20
20
 
21
- def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
21
+ def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
22
22
  """
23
23
  Initialize a region.
24
24
 
@@ -26,6 +26,7 @@ class Region:
26
26
  page: Parent page
27
27
  bbox: Bounding box as (x0, top, x1, bottom)
28
28
  polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
29
+ parent: Optional parent region (for hierarchical document structure)
29
30
  """
30
31
  self._page = page
31
32
  self._bbox = bbox
@@ -48,6 +49,12 @@ class Region:
48
49
  # Region management attributes
49
50
  self.name = None
50
51
  self.source = None # Will be set by creation methods
52
+
53
+ # Hierarchy support for nested document structure
54
+ self.parent_region = parent
55
+ self.child_regions = []
56
+ self.text_content = None # Direct text content (e.g., from Docling)
57
+ self.associated_text_elements = [] # Native text elements that overlap with this region
51
58
 
52
59
  @property
53
60
  def page(self) -> 'Page':
@@ -387,6 +394,11 @@ class Region:
387
394
  """
388
395
  Extract text from this region using pdfplumber's native functionality.
389
396
 
397
+ For regions created by Docling, this will first try to use:
398
+ 1. Associated text elements from the PDF (if available)
399
+ 2. Direct text content from Docling (if available)
400
+ 3. Fall back to standard pdfplumber extraction
401
+
390
402
  Args:
391
403
  keep_blank_chars: Whether to keep blank characters (legacy parameter)
392
404
  apply_exclusions: Whether to apply exclusion regions
@@ -398,6 +410,28 @@ class Region:
398
410
  Returns:
399
411
  Extracted text as string
400
412
  """
413
+ import logging
414
+ logger = logging.getLogger("natural_pdf.elements.region")
415
+
416
+ # Check for Docling model or if we have direct text content
417
+ if self.model == 'docling' or hasattr(self, 'text_content'):
418
+ # First priority: check if we have associated native text elements
419
+ if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
420
+ source_count = len(self.associated_text_elements)
421
+ logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
422
+ # Sort elements in reading order
423
+ sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
424
+ # Extract and join their text
425
+ text_result = " ".join(elem.text for elem in sorted_elements)
426
+ return text_result
427
+
428
+ # Second priority: use direct text content from Docling
429
+ elif self.text_content:
430
+ logger.info(f"Region {self.region_type}: Using Docling OCR text content")
431
+ return self.text_content
432
+
433
+ logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
434
+
401
435
  # Handle preserve_whitespace parameter for consistency with Page.extract_text
402
436
  if preserve_whitespace is not None:
403
437
  keep_blank_chars = preserve_whitespace
@@ -1346,21 +1380,142 @@ class Region:
1346
1380
  "source_elements": list of elements that contain the answer (if found)
1347
1381
  }
1348
1382
  """
1349
- try:
1350
- from natural_pdf.qa.document_qa import get_qa_engine
1351
-
1352
- # Get or initialize QA engine with specified model
1353
- qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1354
-
1355
- # Ask the question using the QA engine
1356
- return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
1357
- except ImportError as e:
1358
- import logging
1359
- logger = logging.getLogger("natural_pdf.elements.region")
1360
- logger.warning(f"QA functionality not available: {e}")
1361
- return {
1362
- "answer": "",
1363
- "confidence": 0.0,
1364
- "error": "QA functionality not available",
1365
- "found": False
1366
- }
1383
+ from natural_pdf.qa.document_qa import get_qa_engine
1384
+
1385
+ # Get or initialize QA engine with specified model
1386
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1387
+
1388
+ # Ask the question using the QA engine
1389
+
1390
+ def add_child(self, child):
1391
+ """
1392
+ Add a child region to this region.
1393
+
1394
+ Used for hierarchical document structure when using models like Docling
1395
+ that understand document hierarchy.
1396
+
1397
+ Args:
1398
+ child: Region object to add as a child
1399
+
1400
+ Returns:
1401
+ Self for method chaining
1402
+ """
1403
+ self.child_regions.append(child)
1404
+ child.parent_region = self
1405
+ return self
1406
+
1407
+ def get_children(self, selector=None):
1408
+ """
1409
+ Get immediate child regions, optionally filtered by selector.
1410
+
1411
+ Args:
1412
+ selector: Optional selector to filter children
1413
+
1414
+ Returns:
1415
+ List of child regions matching the selector
1416
+ """
1417
+ import logging
1418
+ logger = logging.getLogger("natural_pdf.elements.region")
1419
+
1420
+ if selector is None:
1421
+ return self.child_regions
1422
+
1423
+ # Use existing selector parser to filter
1424
+ from natural_pdf.selectors.parser import match_elements_with_selector
1425
+ matched = match_elements_with_selector(self.child_regions, selector)
1426
+ logger.debug(f"get_children: found {len(matched)} of {len(self.child_regions)} children matching '{selector}'")
1427
+ return matched
1428
+
1429
+ def get_descendants(self, selector=None):
1430
+ """
1431
+ Get all descendant regions (children, grandchildren, etc.), optionally filtered by selector.
1432
+
1433
+ Args:
1434
+ selector: Optional selector to filter descendants
1435
+
1436
+ Returns:
1437
+ List of descendant regions matching the selector
1438
+ """
1439
+ import logging
1440
+ logger = logging.getLogger("natural_pdf.elements.region")
1441
+
1442
+ all_descendants = []
1443
+
1444
+ # First add direct children
1445
+ all_descendants.extend(self.child_regions)
1446
+
1447
+ # Then recursively add their descendants
1448
+ for child in self.child_regions:
1449
+ all_descendants.extend(child.get_descendants())
1450
+
1451
+ logger.debug(f"get_descendants: found {len(all_descendants)} total descendants")
1452
+
1453
+ # Filter by selector if provided
1454
+ if selector is not None:
1455
+ from natural_pdf.selectors.parser import match_elements_with_selector
1456
+ matched = match_elements_with_selector(all_descendants, selector)
1457
+ logger.debug(f"get_descendants: filtered to {len(matched)} matching '{selector}'")
1458
+ return matched
1459
+
1460
+ return all_descendants
1461
+
1462
+ def find_all(self, selector, recursive=True, **kwargs):
1463
+ """
1464
+ Find all matching elements within this region, with optional recursion through child regions.
1465
+
1466
+ Args:
1467
+ selector: The selector to find elements with
1468
+ recursive: Whether to search recursively through child regions
1469
+ **kwargs: Additional parameters to pass to the selector parser
1470
+
1471
+ Returns:
1472
+ Collection of matching elements
1473
+ """
1474
+ # Get direct matches
1475
+ direct_matches = self.page.find_all(selector, region=self, **kwargs)
1476
+
1477
+ if not recursive or not self.child_regions:
1478
+ return direct_matches
1479
+
1480
+ # Get recursive matches from children
1481
+ from natural_pdf.elements.collections import ElementCollection
1482
+ all_matches = list(direct_matches)
1483
+
1484
+ for child in self.child_regions:
1485
+ child_matches = child.find_all(selector, recursive=True, **kwargs)
1486
+ for match in child_matches:
1487
+ if match not in all_matches:
1488
+ all_matches.append(match)
1489
+
1490
+ return ElementCollection(all_matches)
1491
+
1492
+ def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
1493
+ """
1494
+ Ask a question about the region content using document QA.
1495
+
1496
+ This method uses a document question answering model to extract answers from the region content.
1497
+ It leverages both textual content and layout information for better understanding.
1498
+
1499
+ Args:
1500
+ question: The question to ask about the region content
1501
+ min_confidence: Minimum confidence threshold for answers (0.0-1.0)
1502
+ model: Optional model name to use for QA (if None, uses default model)
1503
+ **kwargs: Additional parameters to pass to the QA engine
1504
+
1505
+ Returns:
1506
+ Dictionary with answer details: {
1507
+ "answer": extracted text,
1508
+ "confidence": confidence score,
1509
+ "found": whether an answer was found,
1510
+ "page_num": page number,
1511
+ "region": reference to this region,
1512
+ "source_elements": list of elements that contain the answer (if found)
1513
+ }
1514
+ """
1515
+ from natural_pdf.qa.document_qa import get_qa_engine
1516
+
1517
+ # Get or initialize QA engine with specified model
1518
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
1519
+
1520
+ # Ask the question using the QA engine
1521
+ return qa_engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
@@ -1,7 +1,7 @@
1
1
  import logging
2
2
  from typing import List, Dict, Any, Optional, Union, Tuple
3
3
  import numpy as np
4
- from PIL import Image
4
+ from PIL import Image, ImageDraw
5
5
  import os
6
6
  import tempfile
7
7
  import json
@@ -207,47 +207,38 @@ class DocumentQA:
207
207
  logger.info(f"Visualization: {vis_path}")
208
208
 
209
209
  # Run the query through the pipeline
210
- try:
211
- logger.info(f"Running document QA pipeline with question: {question}")
212
- result = self.pipe(query)[0]
213
- logger.info(f"Raw result: {result}")
214
-
215
- # Save the result if debugging
216
- if debug:
217
- result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
218
- with open(result_path, 'w') as f:
219
- # Convert any non-serializable data
220
- serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
221
- for k, v in result.items()}
222
- json.dump(serializable_result, f, indent=2)
223
-
224
- # Check confidence against threshold
225
- if result["score"] < min_confidence:
226
- logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
227
- return {
228
- "answer": "",
229
- "confidence": result["score"],
230
- "start": result.get("start", -1),
231
- "end": result.get("end", -1),
232
- "found": False
233
- }
234
-
235
- return {
236
- "answer": result["answer"],
237
- "confidence": result["score"],
238
- "start": result.get("start", 0),
239
- "end": result.get("end", 0),
240
- "found": True
241
- }
242
-
243
- except Exception as e:
244
- logger.error(f"Error in document QA: {e}")
210
+ logger.info(f"Running document QA pipeline with question: {question}")
211
+ result = self.pipe(query)[0]
212
+ logger.info(f"Raw result: {result}")
213
+
214
+ # Save the result if debugging
215
+ if debug:
216
+ result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
217
+ with open(result_path, 'w') as f:
218
+ # Convert any non-serializable data
219
+ serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
220
+ for k, v in result.items()}
221
+ json.dump(serializable_result, f, indent=2)
222
+
223
+ # Check confidence against threshold
224
+ if result["score"] < min_confidence:
225
+ logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
245
226
  return {
246
227
  "answer": "",
247
- "confidence": 0.0,
248
- "error": str(e),
228
+ "confidence": result["score"],
229
+ "start": result.get("start", -1),
230
+ "end": result.get("end", -1),
249
231
  "found": False
250
232
  }
233
+
234
+ return {
235
+ "answer": result["answer"],
236
+ "confidence": result["score"],
237
+ "start": result.get("start", 0),
238
+ "end": result.get("end", 0),
239
+ "found": True
240
+ }
241
+
251
242
 
252
243
  def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
253
244
  """
@@ -162,8 +162,12 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
162
162
  return False
163
163
 
164
164
  # If 'type' attribute specified, it will be checked in the attributes section
165
- # Otherwise, require exact match
166
- elif element.type != selector['type']:
165
+ # Check for Docling-specific types (section-header, etc.)
166
+ elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
167
+ # This is a direct match with a Docling region type
168
+ pass
169
+ # Otherwise, require exact match with the element's type attribute
170
+ elif not hasattr(element, 'type') or element.type != selector['type']:
167
171
  return False
168
172
 
169
173
  # Check attributes
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: natural-pdf
3
- Version: 25.3.16
3
+ Version: 25.3.17.2
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Home-page: https://github.com/jsoma/natural-pdf
6
6
  Author: Jonathan Soma
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
15
15
  Requires-Dist: Pillow>=8.0.0
16
16
  Requires-Dist: colour>=0.1.5
17
17
  Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: urllib3>=1.26.0
18
19
  Requires-Dist: doclayout_yolo>=0.0.3
19
20
  Requires-Dist: torch>=2.0.0
20
21
  Requires-Dist: torchvision>=0.15.0
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
58
59
 
59
60
  Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
60
61
 
61
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
62
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
63
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
62
64
 
63
65
  ## Features
64
66
 
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
96
98
  ```python
97
99
  from natural_pdf import PDF
98
100
 
99
- # Open a PDF
101
+ # Open a local PDF
100
102
  pdf = PDF('document.pdf')
101
103
 
104
+ # Or open a PDF from a URL
105
+ pdf = PDF('https://example.com/document.pdf')
106
+
102
107
  # Get the first page
103
108
  page = pdf.pages[0]
104
109
 
@@ -263,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
263
268
  - `natural_pdf.analyzers` - Layout analysis operations
264
269
  - `natural_pdf.ocr` - OCR engine operations
265
270
 
271
+ ## Document QA
272
+
273
+ Ask questions directly to your documents:
274
+
275
+ ```python
276
+ # Ask questions about the document content
277
+ result = pdf.ask("What was the company's revenue in 2022?")
278
+ print(f"Answer: {result['answer']}")
279
+ print(f"Confidence: {result['confidence']:.2f}")
280
+
281
+ # Access more details in the result dictionary
282
+ result = pdf.ask("Who is the CEO?")
283
+ print(f"Answer: {result['answer']}")
284
+ print(f"Found on page: {result['page_num']}")
285
+ print(f"Source text: {result.get('source_text', 'N/A')}")
286
+ ```
287
+
266
288
  ## More details
267
289
 
268
290
  [Complete documentation here](https://jsoma.github.io/natural-pdf)