natural-pdf 25.3.16.2__py3-none-any.whl → 25.3.17.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registries. It is provided for informational purposes only.
- examples/direct_qa_example.py +17 -111
- examples/docling_comprehensive_test.py +325 -0
- examples/docling_example.py +192 -0
- examples/docling_hierarchy_example.py +230 -0
- examples/docling_text_sources.py +241 -0
- examples/improved_qa_example.py +66 -0
- natural_pdf/analyzers/document_layout.py +276 -0
- natural_pdf/core/page.py +72 -21
- natural_pdf/core/pdf.py +50 -68
- natural_pdf/elements/region.py +174 -19
- natural_pdf/qa/document_qa.py +29 -38
- natural_pdf/selectors/parser.py +6 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/METADATA +19 -2
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/RECORD +17 -12
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/LICENSE +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/WHEEL +0 -0
- {natural_pdf-25.3.16.2.dist-info → natural_pdf-25.3.17.2.dist-info}/top_level.txt +0 -0
natural_pdf/analyzers/document_layout.py CHANGED

```diff
@@ -399,6 +399,276 @@ class TableTransformerDetector(LayoutDetector):
         return all_detections
 
 
+class DoclingLayoutDetector(LayoutDetector):
+    """
+    Document layout and text recognition using Docling.
+
+    Docling provides a hierarchical document understanding system that can analyze:
+    - Document structure (headers, text, figures, tables)
+    - Text content via integrated OCR
+    - Hierarchical relationships between document elements
+    """
+
+    def __init__(self, verbose=False, **kwargs):
+        """
+        Initialize the Docling document analyzer.
+
+        Args:
+            verbose: Whether to enable verbose logging
+            **kwargs: Additional parameters to pass to DocumentConverter
+        """
+        # Set up logger with optional verbose mode
+        import logging
+        self.logger = logging.getLogger("natural_pdf.analyzers.layout.docling")
+        self.original_level = self.logger.level
+        if verbose:
+            self.logger.setLevel(logging.DEBUG)
+
+        super().__init__()
+        self.verbose = verbose
+        self.converter_kwargs = kwargs
+        self._docling_document = None
+        self._converter = None
+
+    def __del__(self):
+        # Restore the original logging level when done
+        if hasattr(self, 'logger') and hasattr(self, 'original_level'):
+            self.logger.setLevel(self.original_level)
+
+    @property
+    def converter(self):
+        """Lazy-load the DocumentConverter on first use."""
+        if self._converter is None:
+            try:
+                from docling.document_converter import DocumentConverter
+                self.logger.debug("Initializing Docling DocumentConverter")
+                self._converter = DocumentConverter(**self.converter_kwargs)
+            except ImportError:
+                raise ImportError(
+                    "Docling integration requires docling. "
+                    "Install with: pip install docling"
+                )
+        return self._converter
+
+    def detect(self, image_path, confidence=0.5, classes=None, exclude_classes=None):
+        """
+        Detect document structure and text using Docling.
+
+        Args:
+            image_path: Path to the image or PDF to analyze
+            confidence: Minimum confidence threshold for detections (not used by Docling)
+            classes: Specific classes to detect (used for filtering)
+            exclude_classes: Classes to exclude from detection (used for filtering)
+
+        Returns:
+            List of detection dictionaries with hierarchical information
+        """
+        self.logger.info(f"Processing {image_path} with Docling")
+
+        try:
+            # Convert the document using Docling's DocumentConverter
+            result = self.converter.convert(image_path)
+            doc = result.document
+
+            # Store for later use
+            self._docling_document = doc
+            self.logger.info(f"Docling document created with {len(doc.body.children)} top-level elements")
+
+            # Convert Docling document to our detection format
+            detections = self._convert_docling_to_detections(doc, confidence, classes, exclude_classes)
+
+            return detections
+        except Exception as e:
+            self.logger.error(f"Error processing with Docling: {e}")
+            raise
+
+    def _convert_docling_to_detections(self, doc, confidence, classes, exclude_classes):
+        """
+        Convert a Docling document to our standard detection format.
+
+        Args:
+            doc: DoclingDocument object
+            confidence: Confidence threshold to apply (not used by Docling)
+            classes: Classes to include (if specified)
+            exclude_classes: Classes to exclude
+
+        Returns:
+            List of detection dictionaries with hierarchy information
+        """
+        if not doc or not hasattr(doc, 'body') or not hasattr(doc.body, 'children'):
+            self.logger.warning("Invalid or empty Docling document")
+            return []
+
+        detections = []
+        id_to_detection = {}  # Map from Docling ID to detection index
+
+        # Process text elements
+        if hasattr(doc, 'texts') and doc.texts:
+            self.logger.debug(f"Processing {len(doc.texts)} text elements")
+
+            # First pass: create detections for all text elements
+            for text_elem in doc.texts:
+                # Skip if no provenance information
+                if not hasattr(text_elem, 'prov') or not text_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = text_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792  # Default letter size
+                # Already in top-left coordinates
+                t = page_height - bbox.t
+                b = page_height - bbox.b
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                # Get the label and normalize it
+                label = str(text_elem.label) if hasattr(text_elem, 'label') else 'text'
+                normalized_label = self._normalize_class_name(label)
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence for Docling
+                    'text': text_elem.text if hasattr(text_elem, 'text') else None,
+                    'docling_id': text_elem.self_ref if hasattr(text_elem, 'self_ref') else None,
+                    'parent_id': text_elem.parent.self_ref if hasattr(text_elem, 'parent') and hasattr(text_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        # Process pictures if available
+        if hasattr(doc, 'pictures') and doc.pictures:
+            self.logger.debug(f"Processing {len(doc.pictures)} picture elements")
+
+            for pic_elem in doc.pictures:
+                # Skip if no provenance information
+                if not hasattr(pic_elem, 'prov') or not pic_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = pic_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792
+                # In BOTTOMLEFT system, bbox.t is distance from bottom (higher value = higher on page)
+                # In TOPLEFT system, we need distance from top (convert using page_height)
+                t = page_height - bbox.t  # Correct: Top is page_height minus the top in BOTTOMLEFT
+                b = page_height - bbox.b  # Correct: Bottom is page_height minus the bottom in BOTTOMLEFT
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                label = 'figure'  # Default label for pictures
+                normalized_label = 'figure'
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence
+                    'docling_id': pic_elem.self_ref if hasattr(pic_elem, 'self_ref') else None,
+                    'parent_id': pic_elem.parent.self_ref if hasattr(pic_elem, 'parent') and hasattr(pic_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        # Process tables if available
+        if hasattr(doc, 'tables') and doc.tables:
+            self.logger.debug(f"Processing {len(doc.tables)} table elements")
+
+            for table_elem in doc.tables:
+                # Skip if no provenance information
+                if not hasattr(table_elem, 'prov') or not table_elem.prov:
+                    continue
+
+                # Get the bounding box
+                prov = table_elem.prov[0]  # Take first provenance entry
+                if not hasattr(prov, 'bbox') or not prov.bbox:
+                    continue
+
+                bbox = prov.bbox
+
+                # Convert from bottom-left to top-left coordinates
+                page_height = doc.pages.get(prov.page_no).size.height if hasattr(doc, 'pages') else 792
+                # In BOTTOMLEFT system, bbox.t is distance from bottom (higher value = higher on page)
+                # In TOPLEFT system, we need distance from top (convert using page_height)
+                t = page_height - bbox.t  # Correct: Top is page_height minus the top in BOTTOMLEFT
+                b = page_height - bbox.b  # Correct: Bottom is page_height minus the bottom in BOTTOMLEFT
+
+                # Ensure top is always less than bottom for PIL coordinates
+                if t > b:
+                    t, b = b, t
+
+                label = 'table'  # Default label for tables
+                normalized_label = 'table'
+
+                # Skip if filtered by class
+                if classes and normalized_label not in classes:
+                    continue
+                if exclude_classes and normalized_label in exclude_classes:
+                    continue
+
+                # Create detection
+                detection = {
+                    'bbox': (bbox.l, t, bbox.r, b),
+                    'class': label,
+                    'normalized_class': normalized_label,
+                    'confidence': 0.95,  # Default confidence
+                    'docling_id': table_elem.self_ref if hasattr(table_elem, 'self_ref') else None,
+                    'parent_id': table_elem.parent.self_ref if hasattr(table_elem, 'parent') and hasattr(table_elem.parent, 'self_ref') else None,
+                    'model': 'docling'
+                }
+
+                detections.append(detection)
+
+                # Track by ID for hierarchy reconstruction
+                if detection['docling_id']:
+                    id_to_detection[detection['docling_id']] = len(detections) - 1
+
+        self.logger.info(f"Created {len(detections)} detections from Docling document")
+        return detections
+
+    def get_docling_document(self):
+        """Get the original Docling document for advanced usage."""
+        return self._docling_document
+
+
 class PaddleLayoutDetector(LayoutDetector):
     """
     Document layout and table structure detector using PaddlePaddle's PP-Structure.
```
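For orientation, a minimal sketch of driving the new detector directly. It assumes `docling` is installed (`pip install docling`, per the ImportError message above), and the file path is a placeholder; `DoclingLayoutDetector(verbose=...)`, `detect()`, the detection dict keys, and `get_docling_document()` all come from the added code.

```python
from natural_pdf.analyzers.document_layout import DoclingLayoutDetector

detector = DoclingLayoutDetector(verbose=True)

# "paper.pdf" is a placeholder; detect() takes a PDF or image path.
detections = detector.detect("paper.pdf")

for det in detections:
    # Each detection is a plain dict in the format built by
    # _convert_docling_to_detections() above.
    print(det['class'], det['bbox'], det.get('text'), det.get('parent_id'))

# The underlying DoclingDocument stays available for advanced usage.
docling_doc = detector.get_docling_document()
```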
```diff
@@ -708,6 +978,12 @@ def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
         # Extract detection info
         x_min, y_min, x_max, y_max = det['bbox']
 
+        # Ensure coordinates are in proper order (min values are smaller)
+        if x_min > x_max:
+            x_min, x_max = x_max, x_min
+        if y_min > y_max:
+            y_min, y_max = y_max, y_min
+
         # Scale coordinates from image to PDF space
         if scale_factor != 1.0:
             x_min *= scale_factor
```
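The guard added here is plain min/max normalization, needed because Docling's bottom-left-origin boxes can arrive with swapped corners after conversion. A standalone sketch of the same idea:

```python
# Standalone illustration of the normalization added above: swap
# coordinates so (x_min, y_min) is always the top-left corner.
def normalize_bbox(bbox):
    x_min, y_min, x_max, y_max = bbox
    if x_min > x_max:
        x_min, x_max = x_max, x_min
    if y_min > y_max:
        y_min, y_max = y_max, y_min
    return (x_min, y_min, x_max, y_max)

assert normalize_bbox((10, 80, 5, 20)) == (5, 20, 10, 80)
```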
natural_pdf/core/page.py CHANGED

```diff
@@ -17,6 +17,7 @@ from natural_pdf.analyzers.document_layout import (
     YOLODocLayoutDetector,
     TableTransformerDetector,
     PaddleLayoutDetector,
+    DoclingLayoutDetector,
     convert_to_regions
 )
 from natural_pdf.utils.ocr import OCRManager
```
```diff
@@ -808,6 +809,8 @@ class Page:
             except (KeyError, AttributeError, TypeError):
                 pass
 
+            # Add source attribute for native text elements
+            c['source'] = 'native'
             chars.append(TextElement(c, self))
 
         # Create word-level text elements by grouping chars
```
```diff
@@ -872,6 +875,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = []
                 continue
```
```diff
@@ -927,6 +932,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = [char]
                 # If the gap between chars is larger than a threshold, it's a new word
```
```diff
@@ -965,6 +972,8 @@ class Page:
                     if attr in current_word[0]:
                         word_obj[attr] = current_word[0][attr]
 
+                # Add source attribute for native text elements
+                word_obj['source'] = 'native'
                 words.append(TextElement(word_obj, self))
                 current_word = [char]
             else:
```
```diff
@@ -1005,6 +1014,8 @@ class Page:
                 if attr in current_word[0]:
                     word_obj[attr] = current_word[0][attr]
 
+            # Add source attribute for native text elements
+            word_obj['source'] = 'native'
             words.append(TextElement(word_obj, self))
 
         line_groups.extend(words)
```
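All four word-building paths now stamp native text with `source='native'`, and detected layout regions are stamped `source='detected'` further down. Combined with the `selectors/parser.py` change in this release, the source is presumably filterable from selectors. A hedged sketch; the `[source=...]` attribute syntax and the path are assumptions:

```python
from natural_pdf import PDF

page = PDF("doc.pdf").pages[0]  # placeholder path

# Assumed selector syntax: filter elements by the new source attribute.
native_words = page.find_all('text[source=native]')        # text from the PDF itself
detected_regions = page.find_all('region[source=detected]')  # layout-model regions
```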
```diff
@@ -1853,7 +1864,7 @@ class Page:
         return elements
 
     def analyze_layout(self,
-                      model: str = "
+                      model: str = "docling",
                       confidence: float = 0.2,
                       classes: Optional[List[str]] = None,
                       exclude_classes: Optional[List[str]] = None,
```
```diff
@@ -1868,7 +1879,7 @@ class Page:
         Analyze the page layout using a machine learning model.
 
         Args:
-            model: Model type to use ('yolo', 'tatr', or '
+            model: Model type to use ('yolo', 'tatr', 'paddle', or 'docling')
             confidence: Minimum confidence threshold for detections
             classes: Specific classes to detect (None for all supported classes)
             exclude_classes: Classes to exclude from detection
```
```diff
@@ -1878,6 +1889,7 @@ class Page:
                 - YOLO: {"model_path": "...", "image_size": 1024}
                 - TATR: {"model_path": "...", "create_cells": False}
                 - Paddle: {"lang": "en", "use_angle_cls": False, "enable_table": True}
+                - Docling: {"model_name": "ds4sd/SmolDocling-256M-preview", "prompt_text": "...", "verbose": False}
             model_path: (Legacy) Optional path to custom model file
             image_size: (Legacy) Size to resize the image to before detection (YOLO only)
             create_cells: (Legacy) Whether to create cell regions for TATR table regions
```
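Putting the new default model and the documented `model_params` together, a call might look like the sketch below. The path is a placeholder, and the return value of `analyze_layout` is not shown in this diff, so it is not relied on here:

```python
from natural_pdf import PDF

page = PDF("doc.pdf").pages[0]  # placeholder path

# "docling" is now the default model; model_params are forwarded to
# Docling's DocumentConverter, with "verbose" peeled off for the detector.
page.analyze_layout(
    model="docling",
    confidence=0.2,
    model_params={"verbose": True},
)

# The raw DoclingDocument is stored on the page for advanced usage.
docling_doc = page.docling_document
```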
```diff
@@ -1969,8 +1981,32 @@ class Page:
                 exclude_classes=exclude_classes
             )
 
+        elif model.lower() == "docling":
+            # Extract Docling-specific parameters
+            verbose = model_params.get('verbose', False)
+
+            # Pass all other model_params directly to DocumentConverter
+            detector_kwargs = {k: v for k, v in model_params.items() if k != 'verbose'}
+
+            # Initialize DoclingLayoutDetector
+            detector = DoclingLayoutDetector(
+                verbose=verbose,
+                **detector_kwargs
+            )
+
+            # Run detection
+            detections = detector.detect(
+                temp_image_path,
+                confidence=confidence,
+                classes=classes,
+                exclude_classes=exclude_classes
+            )
+
+            # Store the original Docling document for advanced usage
+            self.docling_document = detector.get_docling_document()
+
         else:
-            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle'")
+            raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle', 'docling'")
 
         # Calculate the scale factor to convert from image to PDF coordinates
         # Note: This assumes the image resolution is 150 DPI
```
```diff
@@ -1981,6 +2017,9 @@ class Page:
         layout_regions = []
 
         # Convert detections to regions
+        # First create all regions and track by docling_id if available
+        docling_id_to_region = {}
+
         for detection in detections:
             x_min, y_min, x_max, y_max = detection['bbox']
 
```
```diff
@@ -1998,7 +2037,30 @@ class Page:
             region.model = model  # Store which model detected this region
             region.source = 'detected'  # Set the source for selectors
 
+            # If this is a Docling detection, include text content
+            if model.lower() == 'docling':
+                if 'text' in detection:
+                    region.text_content = detection.get('text')
+
+                # Track by docling_id for building hierarchy later
+                if 'docling_id' in detection:
+                    region.docling_id = detection['docling_id']
+                    docling_id_to_region[detection['docling_id']] = region
+
+                # Store parent ID for hierarchy building
+                if 'parent_id' in detection:
+                    region.parent_id = detection.get('parent_id')
+
             layout_regions.append(region)
+
+        # If using Docling model, build parent-child relationships
+        if model.lower() == 'docling':
+            # Second pass to establish parent-child relationships
+            for region in layout_regions:
+                if hasattr(region, 'parent_id') and region.parent_id:
+                    parent_region = docling_id_to_region.get(region.parent_id)
+                    if parent_region:
+                        parent_region.add_child(region)
 
         # Handle existing regions based on mode
         if existing.lower() == 'append':
```
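The two passes above (create regions, then link children to parents through `docling_id`/`parent_id`) can be illustrated on plain detection dicts, independent of the Region class; the `#/texts/...` ids below are illustrative:

```python
# Re-illustration of the two-pass hierarchy linking over plain dicts
# in the detection format produced by DoclingLayoutDetector.
detections = [
    {'docling_id': '#/texts/0', 'parent_id': None, 'class': 'section_header'},
    {'docling_id': '#/texts/1', 'parent_id': '#/texts/0', 'class': 'text'},
]

# First pass: index every detection by its id.
by_id = {d['docling_id']: d for d in detections if d['docling_id']}

# Second pass: attach each detection to its parent, if the parent exists.
for det in detections:
    parent = by_id.get(det.get('parent_id'))
    if parent is not None:
        parent.setdefault('children', []).append(det)

assert by_id['#/texts/0']['children'][0]['class'] == 'text'
```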
```diff
@@ -2356,21 +2418,10 @@ class Page:
                 "source_elements": list of elements that contain the answer (if found)
             }
         """
-
-
-
-
-
-
-
-            return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
-        except ImportError as e:
-            import logging
-            logger = logging.getLogger("natural_pdf.core.page")
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
+        from natural_pdf.qa.document_qa import get_qa_engine
+
+        # Get or initialize QA engine with specified model
+        qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
+
+        # Ask the question using the QA engine
+        return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
```
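With the defensive wrapper gone, page-level QA is a single call into the shared engine. A usage sketch; the method name `ask` and the path are assumptions, while the result keys checked (`found`, `answer`, `confidence`) come from the docstring above:

```python
from natural_pdf import PDF

page = PDF("report.pdf").pages[0]  # placeholder path

result = page.ask("What is the invoice total?", min_confidence=0.1)
if result.get("found"):
    print(result["answer"], result["confidence"])
```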
natural_pdf/core/pdf.py CHANGED

```diff
@@ -110,8 +110,13 @@ class PDF:
         if HAS_OCR_ENGINES:
             # Handle OCR engine selection
             if ocr_engine is None:
-                # Use default engine (
-
+                # Use default engine (PaddleOCR)
+                try:
+                    self._ocr_engine = PaddleOCREngine()
+                except (ImportError, ValueError) as e:
+                    logger.warning(f"PaddleOCR engine could not be loaded: {e}")
+                    logger.warning("Falling back to EasyOCR engine.")
+                    self._ocr_engine = EasyOCREngine()
             elif isinstance(ocr_engine, str):
                 # String-based engine selection
                 try:
```
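The constructor now prefers PaddleOCR and degrades gracefully to EasyOCR when it cannot be imported or initialized. The string-based branch in the surrounding context suggests an engine can still be pinned explicitly; a hedged sketch, where the accepted engine name strings (e.g. "easyocr") and the path are assumptions:

```python
from natural_pdf import PDF

# Default behaviour per the change above: PaddleOCR if loadable,
# otherwise a logged fallback to EasyOCR.
pdf = PDF("scan.pdf")  # placeholder path

# Explicit string-based selection (branch shown in context above);
# the exact accepted names are an assumption here.
pdf_easy = PDF("scan.pdf", ocr_engine="easyocr")
```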
```diff
@@ -519,74 +524,51 @@ class PDF:
             **kwargs: Additional parameters passed to the QA engine
 
         Returns:
-
+            A dictionary containing the answer, confidence, and other metadata.
+            Result will have an 'answer' key containing the answer text.
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            page
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            else:
-                return {
-                    "answer": "",
-                    "confidence": 0.0,
-                    "found": False,
-                    "message": "No answer found in document"
-                }
+        from natural_pdf.qa import get_qa_engine
+
+        # Initialize or get QA engine
+        qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
+
+        # Determine which pages to query
+        if pages is None:
+            target_pages = list(range(len(self.pages)))
+        elif isinstance(pages, int):
+            # Single page
+            target_pages = [pages]
+        elif isinstance(pages, (list, range)):
+            # List or range of pages
+            target_pages = pages
+        else:
+            raise ValueError(f"Invalid pages parameter: {pages}")
+
+        # Actually query each page and gather results
+        results = []
+        for page_idx in target_pages:
+            if 0 <= page_idx < len(self.pages):
+                page = self.pages[page_idx]
+                page_result = qa_engine.ask_pdf_page(
+                    page=page,
+                    question=question,
+                    min_confidence=min_confidence,
+                    **kwargs
+                )
+
+                # Add to results if it found an answer
+                if page_result.get("found", False):
+                    results.append(page_result)
+
+        # Sort results by confidence
+        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
+
+        # Return the best result, or a default result if none found
+        if results:
+            return results[0]
+        else:
+            return None
 
-        except ImportError as e:
-            logger.warning(f"QA functionality not available: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": "QA functionality not available",
-                "found": False
-            }
-        except Exception as e:
-            logger.error(f"Error in document QA: {e}")
-            return {
-                "answer": "",
-                "confidence": 0.0,
-                "error": str(e),
-                "found": False
-            }
-
     def __len__(self) -> int:
         """Return the number of pages in the PDF."""
         return len(self.pages)
```