natural-pdf 25.3.16__py3-none-any.whl → 25.3.17.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- examples/direct_qa_example.py +17 -111
- examples/docling_comprehensive_test.py +325 -0
- examples/docling_example.py +192 -0
- examples/docling_hierarchy_example.py +230 -0
- examples/docling_text_sources.py +241 -0
- examples/improved_qa_example.py +66 -0
- examples/url_pdf_example.py +45 -0
- natural_pdf/analyzers/document_layout.py +276 -0
- natural_pdf/core/page.py +72 -21
- natural_pdf/core/pdf.py +102 -71
- natural_pdf/elements/region.py +174 -19
- natural_pdf/qa/document_qa.py +29 -38
- natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +25 -3
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +18 -12
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1
1
|
import pdfplumber
|
2
2
|
import logging
|
3
|
+
import tempfile
|
4
|
+
import os
|
5
|
+
import re
|
6
|
+
import urllib.request
|
3
7
|
from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
|
4
8
|
|
5
9
|
from natural_pdf.core.page import Page
|
@@ -28,7 +32,7 @@ class PDF:
|
|
28
32
|
with improved selection, navigation, and extraction capabilities.
|
29
33
|
"""
|
30
34
|
|
31
|
-
def __init__(self,
|
35
|
+
def __init__(self, path_or_url: str, reading_order: bool = True,
|
32
36
|
ocr: Optional[Union[bool, str, List, Dict]] = None,
|
33
37
|
ocr_engine: Optional[Union[str, Any]] = None,
|
34
38
|
font_attrs: Optional[List[str]] = None,
|
@@ -37,7 +41,7 @@ class PDF:
|
|
37
41
|
Initialize the enhanced PDF object.
|
38
42
|
|
39
43
|
Args:
|
40
|
-
|
44
|
+
path_or_url: Path to the PDF file or a URL to a PDF
|
41
45
|
reading_order: Whether to use natural reading order
|
42
46
|
ocr: OCR configuration:
|
43
47
|
- None or False: OCR disabled
|
@@ -58,6 +62,40 @@ class PDF:
|
|
58
62
|
True: Spaces are part of words, better for multi-word searching
|
59
63
|
False: Break text at spaces, each word is separate (legacy behavior)
|
60
64
|
"""
|
65
|
+
# Check if the input is a URL
|
66
|
+
is_url = path_or_url.startswith('http://') or path_or_url.startswith('https://')
|
67
|
+
|
68
|
+
# Initialize path-related attributes
|
69
|
+
self._original_path = path_or_url
|
70
|
+
self._temp_file = None
|
71
|
+
|
72
|
+
if is_url:
|
73
|
+
logger.info(f"Downloading PDF from URL: {path_or_url}")
|
74
|
+
try:
|
75
|
+
# Create a temporary file to store the downloaded PDF
|
76
|
+
self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
77
|
+
|
78
|
+
# Download the PDF
|
79
|
+
with urllib.request.urlopen(path_or_url) as response:
|
80
|
+
self._temp_file.write(response.read())
|
81
|
+
self._temp_file.flush()
|
82
|
+
self._temp_file.close()
|
83
|
+
|
84
|
+
# Use the temporary file path
|
85
|
+
path = self._temp_file.name
|
86
|
+
logger.info(f"PDF downloaded to temporary file: {path}")
|
87
|
+
except Exception as e:
|
88
|
+
if self._temp_file and hasattr(self._temp_file, 'name'):
|
89
|
+
try:
|
90
|
+
os.unlink(self._temp_file.name)
|
91
|
+
except:
|
92
|
+
pass
|
93
|
+
logger.error(f"Failed to download PDF from URL: {e}")
|
94
|
+
raise ValueError(f"Failed to download PDF from URL: {e}")
|
95
|
+
else:
|
96
|
+
# Use the provided path directly
|
97
|
+
path = path_or_url
|
98
|
+
|
61
99
|
logger.info(f"Initializing PDF from {path}")
|
62
100
|
logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")
|
63
101
|
|
@@ -72,8 +110,13 @@ class PDF:
|
|
72
110
|
if HAS_OCR_ENGINES:
|
73
111
|
# Handle OCR engine selection
|
74
112
|
if ocr_engine is None:
|
75
|
-
# Use default engine (
|
76
|
-
|
113
|
+
# Use default engine (PaddleOCR)
|
114
|
+
try:
|
115
|
+
self._ocr_engine = PaddleOCREngine()
|
116
|
+
except (ImportError, ValueError) as e:
|
117
|
+
logger.warning(f"PaddleOCR engine could not be loaded: {e}")
|
118
|
+
logger.warning("Falling back to EasyOCR engine.")
|
119
|
+
self._ocr_engine = EasyOCREngine()
|
77
120
|
elif isinstance(ocr_engine, str):
|
78
121
|
# String-based engine selection
|
79
122
|
try:
|
@@ -481,74 +524,51 @@ class PDF:
|
|
481
524
|
**kwargs: Additional parameters passed to the QA engine
|
482
525
|
|
483
526
|
Returns:
|
484
|
-
|
527
|
+
A dictionary containing the answer, confidence, and other metadata.
|
528
|
+
Result will have an 'answer' key containing the answer text.
|
485
529
|
"""
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
page
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
else:
|
528
|
-
return {
|
529
|
-
"answer": "",
|
530
|
-
"confidence": 0.0,
|
531
|
-
"found": False,
|
532
|
-
"message": "No answer found in document"
|
533
|
-
}
|
530
|
+
from natural_pdf.qa import get_qa_engine
|
531
|
+
|
532
|
+
# Initialize or get QA engine
|
533
|
+
qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
|
534
|
+
|
535
|
+
# Determine which pages to query
|
536
|
+
if pages is None:
|
537
|
+
target_pages = list(range(len(self.pages)))
|
538
|
+
elif isinstance(pages, int):
|
539
|
+
# Single page
|
540
|
+
target_pages = [pages]
|
541
|
+
elif isinstance(pages, (list, range)):
|
542
|
+
# List or range of pages
|
543
|
+
target_pages = pages
|
544
|
+
else:
|
545
|
+
raise ValueError(f"Invalid pages parameter: {pages}")
|
546
|
+
|
547
|
+
# Actually query each page and gather results
|
548
|
+
results = []
|
549
|
+
for page_idx in target_pages:
|
550
|
+
if 0 <= page_idx < len(self.pages):
|
551
|
+
page = self.pages[page_idx]
|
552
|
+
page_result = qa_engine.ask_pdf_page(
|
553
|
+
page=page,
|
554
|
+
question=question,
|
555
|
+
min_confidence=min_confidence,
|
556
|
+
**kwargs
|
557
|
+
)
|
558
|
+
|
559
|
+
# Add to results if it found an answer
|
560
|
+
if page_result.get("found", False):
|
561
|
+
results.append(page_result)
|
562
|
+
|
563
|
+
# Sort results by confidence
|
564
|
+
results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
|
565
|
+
|
566
|
+
# Return the best result, or a default result if none found
|
567
|
+
if results:
|
568
|
+
return results[0]
|
569
|
+
else:
|
570
|
+
return None
|
534
571
|
|
535
|
-
except ImportError as e:
|
536
|
-
logger.warning(f"QA functionality not available: {e}")
|
537
|
-
return {
|
538
|
-
"answer": "",
|
539
|
-
"confidence": 0.0,
|
540
|
-
"error": "QA functionality not available",
|
541
|
-
"found": False
|
542
|
-
}
|
543
|
-
except Exception as e:
|
544
|
-
logger.error(f"Error in document QA: {e}")
|
545
|
-
return {
|
546
|
-
"answer": "",
|
547
|
-
"confidence": 0.0,
|
548
|
-
"error": str(e),
|
549
|
-
"found": False
|
550
|
-
}
|
551
|
-
|
552
572
|
def __len__(self) -> int:
    """Report how many pages this PDF exposes through ``self.pages``."""
    page_count = len(self.pages)
    return page_count
|
@@ -558,10 +578,21 @@ class PDF:
|
|
558
578
|
return self.pages[key]
|
559
579
|
|
560
580
|
def close(self):
    """Close the underlying PDF file and clean up any temporary files.

    A temporary file is only present when the PDF was downloaded from a URL.
    Removing it is best-effort: a failure is logged as a warning rather than
    raised. The method is safe to call more than once.

    Raises:
        Whatever the underlying PDF object's ``close()`` raises. The
        temporary file is still cleaned up in that case.
    """
    pdf = getattr(self, '_pdf', None)
    temp_file = getattr(self, '_temp_file', None)
    try:
        if pdf is not None:
            pdf.close()
    finally:
        # Runs even when pdf.close() raises, so the downloaded copy is not leaked.
        if pdf is not None:
            self._pdf = None
        if temp_file is not None:
            # Drop the reference first so a repeated close() is a no-op.
            self._temp_file = None
            try:
                os.unlink(temp_file.name)
            except FileNotFoundError:
                # Already removed elsewhere; nothing left to clean up.
                pass
            except OSError as e:
                logger.warning(f"Failed to clean up temporary PDF file: {e}")
            else:
                logger.debug(f"Removed temporary PDF file: {temp_file.name}")
|
565
596
|
|
566
597
|
def __enter__(self):
|
567
598
|
"""Context manager entry."""
|
natural_pdf/elements/region.py
CHANGED
@@ -18,7 +18,7 @@ class Region:
|
|
18
18
|
Represents a rectangular region on a page.
|
19
19
|
"""
|
20
20
|
|
21
|
-
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None):
|
21
|
+
def __init__(self, page: 'Page', bbox: Tuple[float, float, float, float], polygon: List[Tuple[float, float]] = None, parent=None):
|
22
22
|
"""
|
23
23
|
Initialize a region.
|
24
24
|
|
@@ -26,6 +26,7 @@ class Region:
|
|
26
26
|
page: Parent page
|
27
27
|
bbox: Bounding box as (x0, top, x1, bottom)
|
28
28
|
polygon: Optional list of coordinate points [(x1,y1), (x2,y2), ...] for non-rectangular regions
|
29
|
+
parent: Optional parent region (for hierarchical document structure)
|
29
30
|
"""
|
30
31
|
self._page = page
|
31
32
|
self._bbox = bbox
|
@@ -48,6 +49,12 @@ class Region:
|
|
48
49
|
# Region management attributes
|
49
50
|
self.name = None
|
50
51
|
self.source = None # Will be set by creation methods
|
52
|
+
|
53
|
+
# Hierarchy support for nested document structure
|
54
|
+
self.parent_region = parent
|
55
|
+
self.child_regions = []
|
56
|
+
self.text_content = None # Direct text content (e.g., from Docling)
|
57
|
+
self.associated_text_elements = [] # Native text elements that overlap with this region
|
51
58
|
|
52
59
|
@property
|
53
60
|
def page(self) -> 'Page':
|
@@ -387,6 +394,11 @@ class Region:
|
|
387
394
|
"""
|
388
395
|
Extract text from this region using pdfplumber's native functionality.
|
389
396
|
|
397
|
+
For regions created by Docling, this will first try to use:
|
398
|
+
1. Associated text elements from the PDF (if available)
|
399
|
+
2. Direct text content from Docling (if available)
|
400
|
+
3. Fall back to standard pdfplumber extraction
|
401
|
+
|
390
402
|
Args:
|
391
403
|
keep_blank_chars: Whether to keep blank characters (legacy parameter)
|
392
404
|
apply_exclusions: Whether to apply exclusion regions
|
@@ -398,6 +410,28 @@ class Region:
|
|
398
410
|
Returns:
|
399
411
|
Extracted text as string
|
400
412
|
"""
|
413
|
+
import logging
|
414
|
+
logger = logging.getLogger("natural_pdf.elements.region")
|
415
|
+
|
416
|
+
# Check for Docling model or if we have direct text content
|
417
|
+
if self.model == 'docling' or hasattr(self, 'text_content'):
|
418
|
+
# First priority: check if we have associated native text elements
|
419
|
+
if hasattr(self, 'associated_text_elements') and self.associated_text_elements:
|
420
|
+
source_count = len(self.associated_text_elements)
|
421
|
+
logger.info(f"Region {self.region_type}: Using {source_count} native PDF text elements")
|
422
|
+
# Sort elements in reading order
|
423
|
+
sorted_elements = sorted(self.associated_text_elements, key=lambda e: (e.top, e.x0))
|
424
|
+
# Extract and join their text
|
425
|
+
text_result = " ".join(elem.text for elem in sorted_elements)
|
426
|
+
return text_result
|
427
|
+
|
428
|
+
# Second priority: use direct text content from Docling
|
429
|
+
elif self.text_content:
|
430
|
+
logger.info(f"Region {self.region_type}: Using Docling OCR text content")
|
431
|
+
return self.text_content
|
432
|
+
|
433
|
+
logger.debug(f"Region {self.region_type}: No Docling text found, falling back to standard extraction")
|
434
|
+
|
401
435
|
# Handle preserve_whitespace parameter for consistency with Page.extract_text
|
402
436
|
if preserve_whitespace is not None:
|
403
437
|
keep_blank_chars = preserve_whitespace
|
@@ -1346,21 +1380,142 @@ class Region:
|
|
1346
1380
|
"source_elements": list of elements that contain the answer (if found)
|
1347
1381
|
}
|
1348
1382
|
"""
|
1349
|
-
|
1350
|
-
|
1351
|
-
|
1352
|
-
|
1353
|
-
|
1354
|
-
|
1355
|
-
|
1356
|
-
|
1357
|
-
|
1358
|
-
|
1359
|
-
|
1360
|
-
|
1361
|
-
|
1362
|
-
|
1363
|
-
|
1364
|
-
|
1365
|
-
|
1366
|
-
|
1383
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
1384
|
+
|
1385
|
+
# Get or initialize QA engine with specified model
|
1386
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
1387
|
+
|
1388
|
+
# Ask the question using the QA engine
|
1389
|
+
|
1390
|
+
def add_child(self, child):
    """
    Add a child region to this region.

    Used for hierarchical document structure when using models like Docling
    that understand document hierarchy. The hierarchy is kept consistent:
    a child that is already attached here is not added twice, and a child
    that currently belongs to a different parent is detached from that
    parent's ``child_regions`` before being attached here.

    Args:
        child: Region object to add as a child

    Returns:
        Self for method chaining

    Raises:
        ValueError: If ``child`` is this region itself (that would create a
            cycle and make descendant traversal recurse forever).
    """
    if child is self:
        raise ValueError("A region cannot be added as its own child")

    # Detach from a different previous parent so the child is listed only once.
    previous_parent = getattr(child, 'parent_region', None)
    if previous_parent is not None and previous_parent is not self:
        siblings = getattr(previous_parent, 'child_regions', None)
        if siblings:
            siblings[:] = [region for region in siblings if region is not child]

    # Compare by identity: two distinct regions must never be merged here.
    if not any(existing is child for existing in self.child_regions):
        self.child_regions.append(child)
    child.parent_region = self
    return self
|
1406
|
+
|
1407
|
+
def get_children(self, selector=None):
    """
    Return this region's immediate children, optionally narrowed by a selector.

    Args:
        selector: Optional selector used to filter the children. When it is
            None, the region's own ``child_regions`` list is returned as-is.

    Returns:
        List of child regions matching the selector
    """
    import logging
    log = logging.getLogger("natural_pdf.elements.region")

    children = self.child_regions
    if selector is not None:
        # Delegate filtering to the shared selector parser.
        from natural_pdf.selectors.parser import match_elements_with_selector
        hits = match_elements_with_selector(children, selector)
        log.debug(f"get_children: found {len(hits)} of {len(self.child_regions)} children matching '{selector}'")
        return hits

    return children
|
1428
|
+
|
1429
|
+
def get_descendants(self, selector=None):
    """
    Collect every region below this one (children, grandchildren, and so on).

    Direct children are listed first, followed by each child's own
    descendants in child order.

    Args:
        selector: Optional selector to filter descendants

    Returns:
        List of descendant regions matching the selector
    """
    import logging
    log = logging.getLogger("natural_pdf.elements.region")

    # Each child reports its own subtree; the selector is applied only once, at this level.
    nested = [node for child in self.child_regions for node in child.get_descendants()]
    collected = [*self.child_regions, *nested]

    log.debug(f"get_descendants: found {len(collected)} total descendants")

    if selector is None:
        return collected

    from natural_pdf.selectors.parser import match_elements_with_selector
    hits = match_elements_with_selector(collected, selector)
    log.debug(f"get_descendants: filtered to {len(hits)} matching '{selector}'")
    return hits
|
1461
|
+
|
1462
|
+
def find_all(self, selector, recursive=True, **kwargs):
    """
    Find all elements in this region that match a selector, optionally descending into child regions.

    Args:
        selector: The selector to find elements with
        recursive: Whether to search recursively through child regions
        **kwargs: Additional parameters to pass to the selector parser

    Returns:
        Collection of matching elements. When no recursion takes place, this
        is whatever the page-level search returned; otherwise it is an
        ElementCollection of the merged, de-duplicated matches.
    """
    own_matches = self.page.find_all(selector, region=self, **kwargs)

    if recursive and self.child_regions:
        from natural_pdf.elements.collections import ElementCollection

        # Start from this region's matches, then append unseen matches from each subtree.
        merged = list(own_matches)
        for sub_region in self.child_regions:
            for hit in sub_region.find_all(selector, recursive=True, **kwargs):
                if hit not in merged:
                    merged.append(hit)
        return ElementCollection(merged)

    return own_matches
|
1491
|
+
|
1492
|
+
def ask(self, question: str, min_confidence: float = 0.1, model: str = None, debug: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Run document question answering against the content of this region.

    The question is handed to the QA engine's ``ask_pdf_region`` together
    with this region; the engine's result is returned unchanged.

    Args:
        question: The question to ask about the region content
        min_confidence: Minimum confidence threshold for answers (0.0-1.0)
        model: Optional model name to use for QA (if None or empty, uses the default model)
        debug: Forwarded to the QA engine as its ``debug`` flag
        **kwargs: Additional parameters to pass to the QA engine

    Returns:
        Dictionary with answer details, as produced by the QA engine.
        Expected keys: {
            "answer": extracted text,
            "confidence": confidence score,
            "found": whether an answer was found,
            "page_num": page number,
            "region": reference to this region,
            "source_elements": list of elements that contain the answer (if found)
        }
    """
    from natural_pdf.qa.document_qa import get_qa_engine

    # A falsy model name means "use the engine's default model".
    if model:
        engine = get_qa_engine(model_name=model)
    else:
        engine = get_qa_engine()

    return engine.ask_pdf_region(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import List, Dict, Any, Optional, Union, Tuple
|
3
3
|
import numpy as np
|
4
|
-
from PIL import Image
|
4
|
+
from PIL import Image, ImageDraw
|
5
5
|
import os
|
6
6
|
import tempfile
|
7
7
|
import json
|
@@ -207,47 +207,38 @@ class DocumentQA:
|
|
207
207
|
logger.info(f"Visualization: {vis_path}")
|
208
208
|
|
209
209
|
# Run the query through the pipeline
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
227
|
-
return {
|
228
|
-
"answer": "",
|
229
|
-
"confidence": result["score"],
|
230
|
-
"start": result.get("start", -1),
|
231
|
-
"end": result.get("end", -1),
|
232
|
-
"found": False
|
233
|
-
}
|
234
|
-
|
235
|
-
return {
|
236
|
-
"answer": result["answer"],
|
237
|
-
"confidence": result["score"],
|
238
|
-
"start": result.get("start", 0),
|
239
|
-
"end": result.get("end", 0),
|
240
|
-
"found": True
|
241
|
-
}
|
242
|
-
|
243
|
-
except Exception as e:
|
244
|
-
logger.error(f"Error in document QA: {e}")
|
210
|
+
logger.info(f"Running document QA pipeline with question: {question}")
|
211
|
+
result = self.pipe(query)[0]
|
212
|
+
logger.info(f"Raw result: {result}")
|
213
|
+
|
214
|
+
# Save the result if debugging
|
215
|
+
if debug:
|
216
|
+
result_path = os.path.join(debug_output_dir, "debug_qa_result.json")
|
217
|
+
with open(result_path, 'w') as f:
|
218
|
+
# Convert any non-serializable data
|
219
|
+
serializable_result = {k: str(v) if not isinstance(v, (str, int, float, bool, list, dict, type(None))) else v
|
220
|
+
for k, v in result.items()}
|
221
|
+
json.dump(serializable_result, f, indent=2)
|
222
|
+
|
223
|
+
# Check confidence against threshold
|
224
|
+
if result["score"] < min_confidence:
|
225
|
+
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
245
226
|
return {
|
246
227
|
"answer": "",
|
247
|
-
"confidence":
|
248
|
-
"
|
228
|
+
"confidence": result["score"],
|
229
|
+
"start": result.get("start", -1),
|
230
|
+
"end": result.get("end", -1),
|
249
231
|
"found": False
|
250
232
|
}
|
233
|
+
|
234
|
+
return {
|
235
|
+
"answer": result["answer"],
|
236
|
+
"confidence": result["score"],
|
237
|
+
"start": result.get("start", 0),
|
238
|
+
"end": result.get("end", 0),
|
239
|
+
"found": True
|
240
|
+
}
|
241
|
+
|
251
242
|
|
252
243
|
def ask_pdf_page(self, page, question: str, min_confidence: float = 0.1, debug: bool = False) -> Dict[str, Any]:
|
253
244
|
"""
|
natural_pdf/selectors/parser.py
CHANGED
@@ -162,8 +162,12 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> callable:
|
|
162
162
|
return False
|
163
163
|
|
164
164
|
# If 'type' attribute specified, it will be checked in the attributes section
|
165
|
-
#
|
166
|
-
elif element.
|
165
|
+
# Check for Docling-specific types (section-header, etc.)
|
166
|
+
elif hasattr(element, 'normalized_type') and element.normalized_type == selector['type']:
|
167
|
+
# This is a direct match with a Docling region type
|
168
|
+
pass
|
169
|
+
# Otherwise, require exact match with the element's type attribute
|
170
|
+
elif not hasattr(element, 'type') or element.type != selector['type']:
|
167
171
|
return False
|
168
172
|
|
169
173
|
# Check attributes
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 25.3.
|
3
|
+
Version: 25.3.17.2
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Home-page: https://github.com/jsoma/natural-pdf
|
6
6
|
Author: Jonathan Soma
|
@@ -15,6 +15,7 @@ Requires-Dist: pdfplumber>=0.7.0
|
|
15
15
|
Requires-Dist: Pillow>=8.0.0
|
16
16
|
Requires-Dist: colour>=0.1.5
|
17
17
|
Requires-Dist: numpy>=1.20.0
|
18
|
+
Requires-Dist: urllib3>=1.26.0
|
18
19
|
Requires-Dist: doclayout_yolo>=0.0.3
|
19
20
|
Requires-Dist: torch>=2.0.0
|
20
21
|
Requires-Dist: torchvision>=0.15.0
|
@@ -58,7 +59,8 @@ A friendly library for working with PDFs, built on top of [pdfplumber](https://g
|
|
58
59
|
|
59
60
|
Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
|
60
61
|
|
61
|
-
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|
62
|
+
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
|
63
|
+
- [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
|
62
64
|
|
63
65
|
## Features
|
64
66
|
|
@@ -96,9 +98,12 @@ pip install natural-pdf[easyocr,paddle]
|
|
96
98
|
```python
|
97
99
|
from natural_pdf import PDF
|
98
100
|
|
99
|
-
# Open a PDF
|
101
|
+
# Open a local PDF
|
100
102
|
pdf = PDF('document.pdf')
|
101
103
|
|
104
|
+
# Or open a PDF from a URL
|
105
|
+
pdf = PDF('https://example.com/document.pdf')
|
106
|
+
|
102
107
|
# Get the first page
|
103
108
|
page = pdf.pages[0]
|
104
109
|
|
@@ -263,6 +268,23 @@ Logs follow a hierarchical structure matching the library's module organization:
|
|
263
268
|
- `natural_pdf.analyzers` - Layout analysis operations
|
264
269
|
- `natural_pdf.ocr` - OCR engine operations
|
265
270
|
|
271
|
+
## Document QA
|
272
|
+
|
273
|
+
Ask questions about your documents directly:
|
274
|
+
|
275
|
+
```python
|
276
|
+
# Ask questions about the document content
|
277
|
+
result = pdf.ask("What was the company's revenue in 2022?")
|
278
|
+
print(f"Answer: {result['answer']}")
|
279
|
+
print(f"Confidence: {result['confidence']:.2f}")
|
280
|
+
|
281
|
+
# Access more details in the result dictionary
|
282
|
+
result = pdf.ask("Who is the CEO?")
|
283
|
+
print(f"Answer: {result['answer']}")
|
284
|
+
print(f"Found on page: {result['page_num']}")
|
285
|
+
print(f"Source text: {result.get('source_text', 'N/A')}")
|
286
|
+
```
|
287
|
+
|
266
288
|
## More details
|
267
289
|
|
268
290
|
[Complete documentation here](https://jsoma.github.io/natural-pdf)
|