natural-pdf 25.3.16.2__py3-none-any.whl → 25.3.17.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -399,6 +399,276 @@ class TableTransformerDetector(LayoutDetector):
399
399
  return all_detections
400
400
 
401
401
 
402
class DoclingLayoutDetector(LayoutDetector):
    """
    Document layout and text recognition using Docling.

    Docling provides a hierarchical document understanding system that can analyze:
    - Document structure (headers, text, figures, tables)
    - Text content via integrated OCR
    - Hierarchical relationships between document elements
    """

    # US Letter height in points; used when the document carries no page info.
    _DEFAULT_PAGE_HEIGHT = 792

    def __init__(self, verbose=False, **kwargs):
        """
        Initialize the Docling document analyzer.

        Args:
            verbose: Whether to enable verbose logging
            **kwargs: Additional parameters to pass to DocumentConverter
        """
        # Set up logger with optional verbose mode
        import logging
        self.logger = logging.getLogger("natural_pdf.analyzers.layout.docling")
        self.original_level = self.logger.level
        if verbose:
            self.logger.setLevel(logging.DEBUG)

        super().__init__()
        self.verbose = verbose
        self.converter_kwargs = kwargs
        self._docling_document = None
        self._converter = None

    def __del__(self):
        # Restore the original logging level when done.
        # Guard with hasattr in case __init__ failed before setting these.
        if hasattr(self, 'logger') and hasattr(self, 'original_level'):
            self.logger.setLevel(self.original_level)

    @property
    def converter(self):
        """Lazy-load the DocumentConverter on first use."""
        if self._converter is None:
            try:
                from docling.document_converter import DocumentConverter
                self.logger.debug("Initializing Docling DocumentConverter")
                self._converter = DocumentConverter(**self.converter_kwargs)
            except ImportError as e:
                # Chain the original error so the real import failure is visible.
                raise ImportError(
                    "Docling integration requires docling. "
                    "Install with: pip install docling"
                ) from e
        return self._converter

    def detect(self, image_path, confidence=0.5, classes=None, exclude_classes=None):
        """
        Detect document structure and text using Docling.

        Args:
            image_path: Path to the image or PDF to analyze
            confidence: Minimum confidence threshold for detections (not used by Docling)
            classes: Specific classes to detect (used for filtering)
            exclude_classes: Classes to exclude from detection (used for filtering)

        Returns:
            List of detection dictionaries with hierarchical information
        """
        self.logger.info(f"Processing {image_path} with Docling")

        try:
            # Convert the document using Docling's DocumentConverter
            result = self.converter.convert(image_path)
            doc = result.document

            # Store for later use via get_docling_document()
            self._docling_document = doc
            self.logger.info(f"Docling document created with {len(doc.body.children)} top-level elements")

            # Convert Docling document to our detection format
            detections = self._convert_docling_to_detections(doc, confidence, classes, exclude_classes)

            return detections
        except Exception as e:
            self.logger.error(f"Error processing with Docling: {e}")
            raise

    def _page_height_for(self, doc, page_no):
        """Return the height of the given page, falling back to US Letter if unknown."""
        if hasattr(doc, 'pages'):
            page = doc.pages.get(page_no)
            # Guard against a missing page entry: the original inline expression
            # would raise AttributeError on None instead of using the fallback.
            if page is not None:
                return page.size.height
        return self._DEFAULT_PAGE_HEIGHT

    def _element_to_detection(self, elem, doc, fixed_label, include_text):
        """
        Build one detection dict from a Docling element.

        Args:
            elem: A Docling text/picture/table element
            doc: The owning DoclingDocument (for page-height lookup)
            fixed_label: Label to force ('figure'/'table'), or None to read elem.label
            include_text: Whether to copy the element's text content into the detection

        Returns:
            A detection dict, or None if the element has no usable bounding box.
        """
        # Skip if no provenance information
        if not getattr(elem, 'prov', None):
            return None
        prov = elem.prov[0]  # Take first provenance entry
        bbox = getattr(prov, 'bbox', None)
        if not bbox:
            return None

        # Docling reports boxes in a BOTTOMLEFT system (bbox.t is distance from
        # the bottom of the page); convert to TOPLEFT using the page height.
        page_height = self._page_height_for(doc, prov.page_no)
        t = page_height - bbox.t
        b = page_height - bbox.b

        # Ensure top is always less than bottom for PIL coordinates
        if t > b:
            t, b = b, t

        if fixed_label is None:
            label = str(elem.label) if hasattr(elem, 'label') else 'text'
            normalized_label = self._normalize_class_name(label)
        else:
            label = fixed_label
            normalized_label = fixed_label

        detection = {
            'bbox': (bbox.l, t, bbox.r, b),
            'class': label,
            'normalized_class': normalized_label,
            'confidence': 0.95,  # Docling exposes no per-element score; use a fixed default
            'docling_id': getattr(elem, 'self_ref', None),
            'parent_id': getattr(getattr(elem, 'parent', None), 'self_ref', None),
            'model': 'docling'
        }
        if include_text:
            detection['text'] = getattr(elem, 'text', None)
        return detection

    def _convert_docling_to_detections(self, doc, confidence, classes, exclude_classes):
        """
        Convert a Docling document to our standard detection format.

        Args:
            doc: DoclingDocument object
            confidence: Confidence threshold to apply (not used by Docling)
            classes: Classes to include (if specified)
            exclude_classes: Classes to exclude

        Returns:
            List of detection dictionaries with hierarchy information
        """
        if not doc or not hasattr(doc, 'body') or not hasattr(doc.body, 'children'):
            self.logger.warning("Invalid or empty Docling document")
            return []

        detections = []
        id_to_detection = {}  # Map from Docling ID to detection index

        # (collection attr, log name, forced label or None, include text content?)
        # Texts carry their own labels and text content; pictures/tables get
        # fixed labels and no text.
        sources = [
            ('texts', 'text', None, True),
            ('pictures', 'picture', 'figure', False),
            ('tables', 'table', 'table', False),
        ]

        for attr, singular, fixed_label, include_text in sources:
            elements = getattr(doc, attr, None)
            if not elements:
                continue
            self.logger.debug(f"Processing {len(elements)} {singular} elements")

            for elem in elements:
                detection = self._element_to_detection(elem, doc, fixed_label, include_text)
                if detection is None:
                    continue

                # Skip if filtered by class
                normalized_label = detection['normalized_class']
                if classes and normalized_label not in classes:
                    continue
                if exclude_classes and normalized_label in exclude_classes:
                    continue

                detections.append(detection)

                # Track by ID for hierarchy reconstruction
                if detection['docling_id']:
                    id_to_detection[detection['docling_id']] = len(detections) - 1

        self.logger.info(f"Created {len(detections)} detections from Docling document")
        return detections

    def get_docling_document(self):
        """Get the original Docling document for advanced usage."""
        return self._docling_document

402
672
  class PaddleLayoutDetector(LayoutDetector):
403
673
  """
404
674
  Document layout and table structure detector using PaddlePaddle's PP-Structure.
@@ -708,6 +978,12 @@ def convert_to_regions(page: Any, detections: List[Dict[str, Any]],
708
978
  # Extract detection info
709
979
  x_min, y_min, x_max, y_max = det['bbox']
710
980
 
981
+ # Ensure coordinates are in proper order (min values are smaller)
982
+ if x_min > x_max:
983
+ x_min, x_max = x_max, x_min
984
+ if y_min > y_max:
985
+ y_min, y_max = y_max, y_min
986
+
711
987
  # Scale coordinates from image to PDF space
712
988
  if scale_factor != 1.0:
713
989
  x_min *= scale_factor
natural_pdf/core/page.py CHANGED
@@ -17,6 +17,7 @@ from natural_pdf.analyzers.document_layout import (
17
17
  YOLODocLayoutDetector,
18
18
  TableTransformerDetector,
19
19
  PaddleLayoutDetector,
20
+ DoclingLayoutDetector,
20
21
  convert_to_regions
21
22
  )
22
23
  from natural_pdf.utils.ocr import OCRManager
@@ -808,6 +809,8 @@ class Page:
808
809
  except (KeyError, AttributeError, TypeError):
809
810
  pass
810
811
 
812
+ # Add source attribute for native text elements
813
+ c['source'] = 'native'
811
814
  chars.append(TextElement(c, self))
812
815
 
813
816
  # Create word-level text elements by grouping chars
@@ -872,6 +875,8 @@ class Page:
872
875
  if attr in current_word[0]:
873
876
  word_obj[attr] = current_word[0][attr]
874
877
 
878
+ # Add source attribute for native text elements
879
+ word_obj['source'] = 'native'
875
880
  words.append(TextElement(word_obj, self))
876
881
  current_word = []
877
882
  continue
@@ -927,6 +932,8 @@ class Page:
927
932
  if attr in current_word[0]:
928
933
  word_obj[attr] = current_word[0][attr]
929
934
 
935
+ # Add source attribute for native text elements
936
+ word_obj['source'] = 'native'
930
937
  words.append(TextElement(word_obj, self))
931
938
  current_word = [char]
932
939
  # If the gap between chars is larger than a threshold, it's a new word
@@ -965,6 +972,8 @@ class Page:
965
972
  if attr in current_word[0]:
966
973
  word_obj[attr] = current_word[0][attr]
967
974
 
975
+ # Add source attribute for native text elements
976
+ word_obj['source'] = 'native'
968
977
  words.append(TextElement(word_obj, self))
969
978
  current_word = [char]
970
979
  else:
@@ -1005,6 +1014,8 @@ class Page:
1005
1014
  if attr in current_word[0]:
1006
1015
  word_obj[attr] = current_word[0][attr]
1007
1016
 
1017
+ # Add source attribute for native text elements
1018
+ word_obj['source'] = 'native'
1008
1019
  words.append(TextElement(word_obj, self))
1009
1020
 
1010
1021
  line_groups.extend(words)
@@ -1853,7 +1864,7 @@ class Page:
1853
1864
  return elements
1854
1865
 
1855
1866
  def analyze_layout(self,
1856
- model: str = "yolo",
1867
+ model: str = "docling",
1857
1868
  confidence: float = 0.2,
1858
1869
  classes: Optional[List[str]] = None,
1859
1870
  exclude_classes: Optional[List[str]] = None,
@@ -1868,7 +1879,7 @@ class Page:
1868
1879
  Analyze the page layout using a machine learning model.
1869
1880
 
1870
1881
  Args:
1871
- model: Model type to use ('yolo', 'tatr', or 'paddle')
1882
+ model: Model type to use ('yolo', 'tatr', 'paddle', or 'docling')
1872
1883
  confidence: Minimum confidence threshold for detections
1873
1884
  classes: Specific classes to detect (None for all supported classes)
1874
1885
  exclude_classes: Classes to exclude from detection
@@ -1878,6 +1889,7 @@ class Page:
1878
1889
  - YOLO: {"model_path": "...", "image_size": 1024}
1879
1890
  - TATR: {"model_path": "...", "create_cells": False}
1880
1891
  - Paddle: {"lang": "en", "use_angle_cls": False, "enable_table": True}
1892
+ - Docling: {"model_name": "ds4sd/SmolDocling-256M-preview", "prompt_text": "...", "verbose": False}
1881
1893
  model_path: (Legacy) Optional path to custom model file
1882
1894
  image_size: (Legacy) Size to resize the image to before detection (YOLO only)
1883
1895
  create_cells: (Legacy) Whether to create cell regions for TATR table regions
@@ -1969,8 +1981,32 @@ class Page:
1969
1981
  exclude_classes=exclude_classes
1970
1982
  )
1971
1983
 
1984
+ elif model.lower() == "docling":
1985
+ # Extract Docling-specific parameters
1986
+ verbose = model_params.get('verbose', False)
1987
+
1988
+ # Pass all other model_params directly to DocumentConverter
1989
+ detector_kwargs = {k: v for k, v in model_params.items() if k != 'verbose'}
1990
+
1991
+ # Initialize DoclingLayoutDetector
1992
+ detector = DoclingLayoutDetector(
1993
+ verbose=verbose,
1994
+ **detector_kwargs
1995
+ )
1996
+
1997
+ # Run detection
1998
+ detections = detector.detect(
1999
+ temp_image_path,
2000
+ confidence=confidence,
2001
+ classes=classes,
2002
+ exclude_classes=exclude_classes
2003
+ )
2004
+
2005
+ # Store the original Docling document for advanced usage
2006
+ self.docling_document = detector.get_docling_document()
2007
+
1972
2008
  else:
1973
- raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle'")
2009
+ raise ValueError(f"Unsupported model type: {model}. Currently supported: 'yolo', 'tatr', 'paddle', 'docling'")
1974
2010
 
1975
2011
  # Calculate the scale factor to convert from image to PDF coordinates
1976
2012
  # Note: This assumes the image resolution is 150 DPI
@@ -1981,6 +2017,9 @@ class Page:
1981
2017
  layout_regions = []
1982
2018
 
1983
2019
  # Convert detections to regions
2020
+ # First create all regions and track by docling_id if available
2021
+ docling_id_to_region = {}
2022
+
1984
2023
  for detection in detections:
1985
2024
  x_min, y_min, x_max, y_max = detection['bbox']
1986
2025
 
@@ -1998,7 +2037,30 @@ class Page:
1998
2037
  region.model = model # Store which model detected this region
1999
2038
  region.source = 'detected' # Set the source for selectors
2000
2039
 
2040
+ # If this is a Docling detection, include text content
2041
+ if model.lower() == 'docling':
2042
+ if 'text' in detection:
2043
+ region.text_content = detection.get('text')
2044
+
2045
+ # Track by docling_id for building hierarchy later
2046
+ if 'docling_id' in detection:
2047
+ region.docling_id = detection['docling_id']
2048
+ docling_id_to_region[detection['docling_id']] = region
2049
+
2050
+ # Store parent ID for hierarchy building
2051
+ if 'parent_id' in detection:
2052
+ region.parent_id = detection.get('parent_id')
2053
+
2001
2054
  layout_regions.append(region)
2055
+
2056
+ # If using Docling model, build parent-child relationships
2057
+ if model.lower() == 'docling':
2058
+ # Second pass to establish parent-child relationships
2059
+ for region in layout_regions:
2060
+ if hasattr(region, 'parent_id') and region.parent_id:
2061
+ parent_region = docling_id_to_region.get(region.parent_id)
2062
+ if parent_region:
2063
+ parent_region.add_child(region)
2002
2064
 
2003
2065
  # Handle existing regions based on mode
2004
2066
  if existing.lower() == 'append':
@@ -2356,21 +2418,10 @@ class Page:
2356
2418
  "source_elements": list of elements that contain the answer (if found)
2357
2419
  }
2358
2420
  """
2359
- try:
2360
- from natural_pdf.qa.document_qa import get_qa_engine
2361
-
2362
- # Get or initialize QA engine with specified model
2363
- qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
2364
-
2365
- # Ask the question using the QA engine
2366
- return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
2367
- except ImportError as e:
2368
- import logging
2369
- logger = logging.getLogger("natural_pdf.core.page")
2370
- logger.warning(f"QA functionality not available: {e}")
2371
- return {
2372
- "answer": "",
2373
- "confidence": 0.0,
2374
- "error": "QA functionality not available",
2375
- "found": False
2376
- }
2421
+ from natural_pdf.qa.document_qa import get_qa_engine
2422
+
2423
+ # Get or initialize QA engine with specified model
2424
+ qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
2425
+
2426
+ # Ask the question using the QA engine
2427
+ return qa_engine.ask_pdf_page(self, question, min_confidence=min_confidence, debug=debug, **kwargs)
natural_pdf/core/pdf.py CHANGED
@@ -110,8 +110,13 @@ class PDF:
110
110
  if HAS_OCR_ENGINES:
111
111
  # Handle OCR engine selection
112
112
  if ocr_engine is None:
113
- # Use default engine (EasyOCR)
114
- self._ocr_engine = EasyOCREngine()
113
+ # Use default engine (PaddleOCR)
114
+ try:
115
+ self._ocr_engine = PaddleOCREngine()
116
+ except (ImportError, ValueError) as e:
117
+ logger.warning(f"PaddleOCR engine could not be loaded: {e}")
118
+ logger.warning("Falling back to EasyOCR engine.")
119
+ self._ocr_engine = EasyOCREngine()
115
120
  elif isinstance(ocr_engine, str):
116
121
  # String-based engine selection
117
122
  try:
@@ -519,74 +524,51 @@ class PDF:
519
524
  **kwargs: Additional parameters passed to the QA engine
520
525
 
521
526
  Returns:
522
- Dictionary with answer and confidence
527
+ A dictionary containing the answer, confidence, and other metadata.
528
+ Result will have an 'answer' key containing the answer text.
523
529
  """
524
- try:
525
- from natural_pdf.qa import get_qa_engine
526
-
527
- # Initialize or get QA engine
528
- qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
529
-
530
- # Determine which pages to query
531
- if pages is None:
532
- # Query all pages by default, prioritizing first few pages
533
- target_pages = list(range(min(10, len(self.pages))))
534
- elif isinstance(pages, int):
535
- # Single page
536
- target_pages = [pages]
537
- elif isinstance(pages, (list, range)):
538
- # List or range of pages
539
- target_pages = pages
540
- else:
541
- raise ValueError(f"Invalid pages parameter: {pages}")
542
-
543
- # Actually query each page and gather results
544
- results = []
545
- for page_idx in target_pages:
546
- if 0 <= page_idx < len(self.pages):
547
- page = self.pages[page_idx]
548
- page_result = qa_engine.ask_pdf_page(
549
- page=page,
550
- question=question,
551
- min_confidence=min_confidence,
552
- **kwargs
553
- )
554
-
555
- # Add to results if it found an answer
556
- if page_result.get("found", False):
557
- results.append(page_result)
558
-
559
- # Sort results by confidence
560
- results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
561
-
562
- # Return the best result, or an empty result if none found
563
- if results:
564
- return results[0]
565
- else:
566
- return {
567
- "answer": "",
568
- "confidence": 0.0,
569
- "found": False,
570
- "message": "No answer found in document"
571
- }
530
+ from natural_pdf.qa import get_qa_engine
531
+
532
+ # Initialize or get QA engine
533
+ qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
534
+
535
+ # Determine which pages to query
536
+ if pages is None:
537
+ target_pages = list(range(len(self.pages)))
538
+ elif isinstance(pages, int):
539
+ # Single page
540
+ target_pages = [pages]
541
+ elif isinstance(pages, (list, range)):
542
+ # List or range of pages
543
+ target_pages = pages
544
+ else:
545
+ raise ValueError(f"Invalid pages parameter: {pages}")
546
+
547
+ # Actually query each page and gather results
548
+ results = []
549
+ for page_idx in target_pages:
550
+ if 0 <= page_idx < len(self.pages):
551
+ page = self.pages[page_idx]
552
+ page_result = qa_engine.ask_pdf_page(
553
+ page=page,
554
+ question=question,
555
+ min_confidence=min_confidence,
556
+ **kwargs
557
+ )
558
+
559
+ # Add to results if it found an answer
560
+ if page_result.get("found", False):
561
+ results.append(page_result)
562
+
563
+ # Sort results by confidence
564
+ results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
565
+
566
+ # Return the best result, or a default result if none found
567
+ if results:
568
+ return results[0]
569
+ else:
570
+ return None
572
571
 
573
- except ImportError as e:
574
- logger.warning(f"QA functionality not available: {e}")
575
- return {
576
- "answer": "",
577
- "confidence": 0.0,
578
- "error": "QA functionality not available",
579
- "found": False
580
- }
581
- except Exception as e:
582
- logger.error(f"Error in document QA: {e}")
583
- return {
584
- "answer": "",
585
- "confidence": 0.0,
586
- "error": str(e),
587
- "found": False
588
- }
589
-
590
572
  def __len__(self) -> int:
591
573
  """Return the number of pages in the PDF."""
592
574
  return len(self.pages)