natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +751 -607
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +120 -23
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.35.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -7,6 +7,7 @@ import tempfile
7
7
  import threading
8
8
  import time
9
9
  import urllib.request
10
+ import weakref
10
11
  from pathlib import Path
11
12
  from typing import (
12
13
  TYPE_CHECKING,
@@ -23,9 +24,7 @@ from typing import (
23
24
  )
24
25
 
25
26
  import pdfplumber
26
- from PIL import Image
27
27
  from tqdm.auto import tqdm
28
- import weakref
29
28
 
30
29
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
31
30
  from natural_pdf.classification.manager import ClassificationError
@@ -73,11 +72,14 @@ except ImportError:
73
72
 
74
73
  logger = logging.getLogger("natural_pdf.core.pdf")
75
74
 
75
+
76
76
  def _get_classification_manager_class():
77
77
  """Lazy import for ClassificationManager."""
78
78
  from natural_pdf.classification.manager import ClassificationManager
79
+
79
80
  return ClassificationManager
80
81
 
82
+
81
83
  DEFAULT_MANAGERS = {
82
84
  "classification": _get_classification_manager_class,
83
85
  "structured_data": StructuredDataManager,
@@ -100,15 +102,45 @@ except ImportError:
100
102
  # --- Lazy Page List Helper --- #
101
103
  from collections.abc import Sequence
102
104
 
105
+
103
106
  class _LazyPageList(Sequence):
104
107
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
105
108
 
109
+ This class implements the Sequence protocol to provide list-like access to PDF pages
110
+ while minimizing memory usage. Pages are only created when accessed, and once created,
111
+ they are cached for subsequent access. This design allows efficient handling of large
112
+ PDF documents without loading all pages into memory immediately.
113
+
106
114
  The sequence holds `None` placeholders until an index is accessed, at which point
107
- a real `Page` object is created, cached, and returned. Slices and iteration are
108
- also supported and will materialise pages on demand.
115
+ a real `Page` object is created, cached, and returned. Slices and iteration are
116
+ also supported and will materialize pages on demand.
117
+
118
+ Attributes:
119
+ _parent_pdf: Reference to the parent PDF object.
120
+ _plumber_pdf: Underlying pdfplumber PDF object.
121
+ _font_attrs: Font attributes to use when creating pages.
122
+ _cache: List of cached Page objects (None until accessed).
123
+ _load_text: Whether to load text layer when creating pages.
124
+
125
+ Example:
126
+ ```python
127
+ # Access is transparent - pages created on demand
128
+ pdf = npdf.PDF("document.pdf")
129
+ first_page = pdf.pages[0] # Creates Page object here
130
+ last_page = pdf.pages[-1] # Creates another Page object
131
+
132
+ # Slicing works too
133
+ first_three = pdf.pages[0:3] # Creates 3 Page objects
134
+
135
+ # Iteration creates all pages
136
+ for page in pdf.pages: # Each page created as needed
137
+ print(f"Page {page.index}")
138
+ ```
109
139
  """
110
140
 
111
- def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True):
141
+ def __init__(
142
+ self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
143
+ ):
112
144
  self._parent_pdf = parent_pdf
113
145
  self._plumber_pdf = plumber_pdf
114
146
  self._font_attrs = font_attrs
@@ -124,7 +156,13 @@ class _LazyPageList(Sequence):
124
156
  from natural_pdf.core.page import Page
125
157
 
126
158
  plumber_page = self._plumber_pdf.pages[index]
127
- cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs, load_text=self._load_text)
159
+ cached = Page(
160
+ plumber_page,
161
+ parent=self._parent_pdf,
162
+ index=index,
163
+ font_attrs=self._font_attrs,
164
+ load_text=self._load_text,
165
+ )
128
166
  self._cache[index] = cached
129
167
  return cached
130
168
 
@@ -153,14 +191,44 @@ class _LazyPageList(Sequence):
153
191
  def __repr__(self) -> str: # pragma: no cover
154
192
  return f"<_LazyPageList(len={len(self)})>"
155
193
 
194
+
156
195
  # --- End Lazy Page List Helper --- #
157
196
 
197
+
158
198
  class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
159
- """
160
- Enhanced PDF wrapper built on top of pdfplumber.
199
+ """Enhanced PDF wrapper built on top of pdfplumber.
161
200
 
162
201
  This class provides a fluent interface for working with PDF documents,
163
- with improved selection, navigation, and extraction capabilities.
202
+ with improved selection, navigation, and extraction capabilities. It integrates
203
+ OCR, layout analysis, and AI-powered data extraction features while maintaining
204
+ compatibility with the underlying pdfplumber API.
205
+
206
+ The PDF class supports loading from files, URLs, or streams, and provides
207
+ spatial navigation, element selection with CSS-like selectors, and advanced
208
+ document processing workflows including multi-page content flows.
209
+
210
+ Attributes:
211
+ pages: Lazy-loaded list of Page objects for document pages.
212
+ path: Resolved path to the PDF file or source identifier.
213
+ source_path: Original path, URL, or stream identifier provided during initialization.
214
+ highlighter: Service for rendering highlighted visualizations of document content.
215
+
216
+ Example:
217
+ Basic usage:
218
+ ```python
219
+ import natural_pdf as npdf
220
+
221
+ pdf = npdf.PDF("document.pdf")
222
+ page = pdf.pages[0]
223
+ text_elements = page.find_all('text:contains("Summary")')
224
+ ```
225
+
226
+ Advanced usage with OCR:
227
+ ```python
228
+ pdf = npdf.PDF("scanned_document.pdf")
229
+ pdf.apply_ocr(engine="easyocr", resolution=144)
230
+ tables = pdf.pages[0].find_all('table')
231
+ ```
164
232
  """
165
233
 
166
234
  def __init__(
@@ -173,18 +241,48 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
173
241
  auto_text_tolerance: bool = True,
174
242
  text_layer: bool = True,
175
243
  ):
176
- """
177
- Initialize the enhanced PDF object.
244
+ """Initialize the enhanced PDF object.
178
245
 
179
246
  Args:
180
- path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
181
- reading_order: Whether to use natural reading order
182
- font_attrs: Font attributes for grouping characters into words
183
- keep_spaces: Whether to include spaces in word elements
184
- text_tolerance: PDFplumber-style tolerance settings
185
- auto_text_tolerance: Whether to automatically scale text tolerance
186
- text_layer: Whether to keep the existing text layer from the PDF (default: True).
187
- If False, removes all existing text elements during initialization.
247
+ path_or_url_or_stream: Path to the PDF file (str/Path), a URL (str),
248
+ or a file-like object (stream). URLs must start with 'http://' or 'https://'.
249
+ reading_order: If True, use natural reading order for text extraction.
250
+ Defaults to True.
251
+ font_attrs: List of font attributes for grouping characters into words.
252
+ Common attributes include ['fontname', 'size']. Defaults to None.
253
+ keep_spaces: If True, include spaces in word elements during text extraction.
254
+ Defaults to True.
255
+ text_tolerance: PDFplumber-style tolerance settings for text grouping.
256
+ Dictionary with keys like 'x_tolerance', 'y_tolerance'. Defaults to None.
257
+ auto_text_tolerance: If True, automatically scale text tolerance based on
258
+ font size and document characteristics. Defaults to True.
259
+ text_layer: If True, preserve existing text layer from the PDF. If False,
260
+ removes all existing text elements during initialization, useful for
261
+ OCR-only workflows. Defaults to True.
262
+
263
+ Raises:
264
+ TypeError: If path_or_url_or_stream is not a valid type.
265
+ IOError: If the PDF file cannot be opened or read.
266
+ ValueError: If URL download fails.
267
+
268
+ Example:
269
+ ```python
270
+ # From file path
271
+ pdf = npdf.PDF("document.pdf")
272
+
273
+ # From URL
274
+ pdf = npdf.PDF("https://example.com/document.pdf")
275
+
276
+ # From stream
277
+ with open("document.pdf", "rb") as f:
278
+ pdf = npdf.PDF(f)
279
+
280
+ # With custom settings
281
+ pdf = npdf.PDF("document.pdf",
282
+ reading_order=False,
283
+ text_layer=False, # For OCR-only processing
284
+ font_attrs=['fontname', 'size', 'flags'])
285
+ ```
188
286
  """
189
287
  self._original_path_or_stream = path_or_url_or_stream
190
288
  self._temp_file = None
@@ -262,7 +360,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
262
360
  self._manager_registry = {}
263
361
 
264
362
  # Lazily instantiate pages only when accessed
265
- self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer)
363
+ self._pages = _LazyPageList(
364
+ self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer
365
+ )
266
366
 
267
367
  self._element_cache = {}
268
368
  self._exclusions = []
@@ -272,13 +372,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
272
372
 
273
373
  self._initialize_managers()
274
374
  self._initialize_highlighter()
275
-
375
+
276
376
  # Remove text layer if requested
277
377
  if not self._text_layer:
278
378
  logger.info("Removing text layer as requested (text_layer=False)")
279
379
  # Text layer is not loaded when text_layer=False, so no need to remove
280
380
  pass
281
-
381
+
282
382
  # Analysis results accessed via self.analyses property (see below)
283
383
 
284
384
  # --- Automatic cleanup when object is garbage-collected ---
@@ -315,7 +415,30 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
315
415
  self._managers = {} # Will hold instantiated managers
316
416
 
317
417
  def get_manager(self, key: str) -> Any:
318
- """Retrieve a manager instance by its key, instantiating it lazily if needed."""
418
+ """Retrieve a manager instance by its key, instantiating it lazily if needed.
419
+
420
+ Managers are specialized components that handle specific functionality like
421
+ classification, structured data extraction, or OCR processing. They are
422
+ instantiated on-demand to minimize memory usage and startup time.
423
+
424
+ Args:
425
+ key: The manager key to retrieve. Common keys include 'classification'
426
+ and 'structured_data'.
427
+
428
+ Returns:
429
+ The manager instance for the specified key.
430
+
431
+ Raises:
432
+ KeyError: If no manager is registered for the given key.
433
+ RuntimeError: If the manager failed to initialize.
434
+
435
+ Example:
436
+ ```python
437
+ pdf = npdf.PDF("document.pdf")
438
+ classification_mgr = pdf.get_manager('classification')
439
+ structured_data_mgr = pdf.get_manager('structured_data')
440
+ ```
441
+ """
319
442
  # Check if already instantiated
320
443
  if key in self._managers:
321
444
  manager_instance = self._managers[key]
@@ -351,12 +474,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
351
474
 
352
475
  @property
353
476
  def metadata(self) -> Dict[str, Any]:
354
- """Access metadata as a dictionary."""
477
+ """Access PDF metadata as a dictionary.
478
+
479
+ Returns document metadata such as title, author, creation date, and other
480
+ properties embedded in the PDF file. The exact keys available depend on
481
+ what metadata was included when the PDF was created.
482
+
483
+ Returns:
484
+ Dictionary containing PDF metadata. Common keys include 'Title',
485
+ 'Author', 'Subject', 'Creator', 'Producer', 'CreationDate', and
486
+ 'ModDate'. May be empty if no metadata is available.
487
+
488
+ Example:
489
+ ```python
490
+ pdf = npdf.PDF("document.pdf")
491
+ print(pdf.metadata.get('Title', 'No title'))
492
+ print(f"Created: {pdf.metadata.get('CreationDate')}")
493
+ ```
494
+ """
355
495
  return self._pdf.metadata
356
496
 
357
497
  @property
358
498
  def pages(self) -> "PageCollection":
359
- """Access pages as a PageCollection object."""
499
+ """Access pages as a PageCollection object.
500
+
501
+ Provides access to individual pages of the PDF document through a
502
+ collection interface that supports indexing, slicing, and iteration.
503
+ Pages are lazy-loaded to minimize memory usage.
504
+
505
+ Returns:
506
+ PageCollection object that provides list-like access to PDF pages.
507
+
508
+ Raises:
509
+ AttributeError: If PDF pages are not yet initialized.
510
+
511
+ Example:
512
+ ```python
513
+ pdf = npdf.PDF("document.pdf")
514
+
515
+ # Access individual pages
516
+ first_page = pdf.pages[0]
517
+ last_page = pdf.pages[-1]
518
+
519
+ # Slice pages
520
+ first_three = pdf.pages[0:3]
521
+
522
+ # Iterate over pages
523
+ for page in pdf.pages:
524
+ print(f"Page {page.index} has {len(page.chars)} characters")
525
+ ```
526
+ """
360
527
  from natural_pdf.elements.collections import PageCollection
361
528
 
362
529
  if not hasattr(self, "_pages"):
@@ -364,11 +531,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
364
531
  return PageCollection(self._pages)
365
532
 
366
533
  def clear_exclusions(self) -> "PDF":
367
- """
368
- Clear all exclusion functions from the PDF.
534
+ """Clear all exclusion functions from the PDF.
535
+
536
+ Removes all previously added exclusion functions that were used to filter
537
+ out unwanted content (like headers, footers, or administrative text) from
538
+ text extraction and analysis operations.
369
539
 
370
540
  Returns:
371
- Self for method chaining
541
+ Self for method chaining.
542
+
543
+ Raises:
544
+ AttributeError: If PDF pages are not yet initialized.
545
+
546
+ Example:
547
+ ```python
548
+ pdf = npdf.PDF("document.pdf")
549
+ pdf.add_exclusion(lambda page: page.find('text:contains("CONFIDENTIAL")').above())
550
+
551
+ # Later, remove all exclusions
552
+ pdf.clear_exclusions()
553
+ ```
372
554
  """
373
555
  if not hasattr(self, "_pages"):
374
556
  raise AttributeError("PDF pages not yet initialized.")
@@ -381,16 +563,46 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
381
563
  def add_exclusion(
382
564
  self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
383
565
  ) -> "PDF":
384
- """
385
- Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
566
+ """Add an exclusion function to the PDF.
567
+
568
+ Exclusion functions define regions of each page that should be ignored during
569
+ text extraction and analysis operations. This is useful for filtering out headers,
570
+ footers, watermarks, or other administrative content that shouldn't be included
571
+ in the main document processing.
386
572
 
387
573
  Args:
388
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None
389
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None
390
- label: Optional label for this exclusion
574
+ exclusion_func: A function that takes a Page object and returns a Region
575
+ to exclude from processing, or None if no exclusion should be applied
576
+ to that page. The function is called once per page.
577
+ label: Optional descriptive label for this exclusion rule, useful for
578
+ debugging and identification.
391
579
 
392
580
  Returns:
393
- Self for method chaining
581
+ Self for method chaining.
582
+
583
+ Raises:
584
+ AttributeError: If PDF pages are not yet initialized.
585
+
586
+ Example:
587
+ ```python
588
+ pdf = npdf.PDF("document.pdf")
589
+
590
+ # Exclude headers (top 50 points of each page)
591
+ pdf.add_exclusion(
592
+ lambda page: page.region(0, 0, page.width, 50),
593
+ label="header_exclusion"
594
+ )
595
+
596
+ # Exclude any text containing "CONFIDENTIAL"
597
+ pdf.add_exclusion(
598
+ lambda page: page.find('text:contains("CONFIDENTIAL")').above(include_source=True)
599
+ if page.find('text:contains("CONFIDENTIAL")') else None,
600
+ label="confidential_exclusion"
601
+ )
602
+
603
+ # Chain multiple exclusions
604
+ pdf.add_exclusion(header_func).add_exclusion(footer_func)
605
+ ```
394
606
  """
395
607
  if not hasattr(self, "_pages"):
396
608
  raise AttributeError("PDF pages not yet initialized.")
@@ -416,23 +628,74 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
416
628
  options: Optional[Any] = None,
417
629
  pages: Optional[Union[Iterable[int], range, slice]] = None,
418
630
  ) -> "PDF":
419
- """
420
- Applies OCR to specified pages of the PDF using batch processing.
631
+ """Apply OCR to specified pages of the PDF using batch processing.
632
+
633
+ Performs optical character recognition on the specified pages, converting
634
+ image-based text into searchable and extractable text elements. This method
635
+ supports multiple OCR engines and provides batch processing for efficiency.
421
636
 
422
637
  Args:
423
- engine: Name of the OCR engine
424
- languages: List of language codes
425
- min_confidence: Minimum confidence threshold
426
- device: Device to run OCR on
427
- resolution: DPI resolution for page images
428
- apply_exclusions: Whether to mask excluded areas
429
- detect_only: If True, only detect text boxes
430
- replace: Whether to replace existing OCR elements
431
- options: Engine-specific options
432
- pages: Page indices to process or None for all pages
638
+ engine: Name of the OCR engine to use. Supported engines include
639
+ 'easyocr' (default), 'surya', 'paddle', and 'doctr'. If None,
640
+ uses the global default from natural_pdf.options.ocr.engine.
641
+ languages: List of language codes for OCR recognition (e.g., ['en', 'es']).
642
+ If None, uses the global default from natural_pdf.options.ocr.languages.
643
+ min_confidence: Minimum confidence threshold (0.0-1.0) for accepting
644
+ OCR results. Text with lower confidence will be filtered out.
645
+ If None, uses the global default.
646
+ device: Device to run OCR on ('cpu', 'cuda', 'mps'). Engine-specific
647
+ availability varies. If None, uses engine defaults.
648
+ resolution: DPI resolution for rendering pages to images before OCR.
649
+ Higher values improve accuracy but increase processing time and memory.
650
+ Typical values: 150 (fast), 300 (balanced), 600 (high quality).
651
+ apply_exclusions: If True, mask excluded regions before OCR to prevent
652
+ processing of headers, footers, or other unwanted content.
653
+ detect_only: If True, only detect text bounding boxes without performing
654
+ character recognition. Useful for layout analysis workflows.
655
+ replace: If True, replace any existing OCR elements on the pages.
656
+ If False, append new OCR results to existing elements.
657
+ options: Engine-specific options object (e.g., EasyOCROptions, SuryaOptions).
658
+ Allows fine-tuning of engine behavior beyond common parameters.
659
+ pages: Page indices to process. Can be:
660
+ - None: Process all pages
661
+ - slice: Process a range of pages (e.g., slice(0, 10))
662
+ - Iterable[int]: Process specific page indices (e.g., [0, 2, 5])
433
663
 
434
664
  Returns:
435
- Self for method chaining
665
+ Self for method chaining.
666
+
667
+ Raises:
668
+ ValueError: If invalid page index is provided.
669
+ TypeError: If pages parameter has invalid type.
670
+ RuntimeError: If OCR engine is not available or fails.
671
+
672
+ Example:
673
+ ```python
674
+ pdf = npdf.PDF("scanned_document.pdf")
675
+
676
+ # Basic OCR on all pages
677
+ pdf.apply_ocr()
678
+
679
+ # High-quality OCR with specific settings
680
+ pdf.apply_ocr(
681
+ engine='easyocr',
682
+ languages=['en', 'es'],
683
+ resolution=300,
684
+ min_confidence=0.8
685
+ )
686
+
687
+ # OCR specific pages only
688
+ pdf.apply_ocr(pages=[0, 1, 2]) # First 3 pages
689
+ pdf.apply_ocr(pages=slice(5, 10)) # Pages 5-9
690
+
691
+ # Detection-only workflow for layout analysis
692
+ pdf.apply_ocr(detect_only=True, resolution=150)
693
+ ```
694
+
695
+ Note:
696
+ OCR processing can be time and memory intensive, especially at high
697
+ resolutions. Consider using exclusions to mask unwanted regions and
698
+ processing pages in batches for large documents.
436
699
  """
437
700
  if not self._ocr_manager:
438
701
  logger.error("OCRManager not available. Cannot apply OCR.")
@@ -1025,10 +1288,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1025
1288
  **kwargs,
1026
1289
  ) -> Dict[str, Any]:
1027
1290
  """
1028
- Ask a question about the document content.
1291
+ Ask a single question about the document content.
1292
+
1293
+ Args:
1294
+ question: Question string to ask about the document
1295
+ mode: "extractive" to extract answer from document, "generative" to generate
1296
+ pages: Specific pages to query (default: all pages)
1297
+ min_confidence: Minimum confidence threshold for answers
1298
+ model: Optional model name for question answering
1299
+ **kwargs: Additional parameters passed to the QA engine
1300
+
1301
+ Returns:
1302
+ Dict containing: answer, confidence, found, page_num, source_elements, etc.
1303
+ """
1304
+ # Delegate to ask_batch and return the first result
1305
+ results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
1306
+ return results[0] if results else {
1307
+ "answer": None,
1308
+ "confidence": 0.0,
1309
+ "found": False,
1310
+ "page_num": None,
1311
+ "source_elements": [],
1312
+ }
1313
+
1314
+ def ask_batch(
1315
+ self,
1316
+ questions: List[str],
1317
+ mode: str = "extractive",
1318
+ pages: Union[int, List[int], range] = None,
1319
+ min_confidence: float = 0.1,
1320
+ model: str = None,
1321
+ **kwargs,
1322
+ ) -> List[Dict[str, Any]]:
1323
+ """
1324
+ Ask multiple questions about the document content using batch processing.
1325
+
1326
+ This method processes multiple questions efficiently in a single batch,
1327
+ avoiding the multiprocessing resource accumulation that can occur with
1328
+ sequential individual question calls.
1029
1329
 
1030
1330
  Args:
1031
- question: Question to ask about the document
1331
+ questions: List of question strings to ask about the document
1032
1332
  mode: "extractive" to extract answer from document, "generative" to generate
1033
1333
  pages: Specific pages to query (default: all pages)
1034
1334
  min_confidence: Minimum confidence threshold for answers
@@ -1036,45 +1336,147 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1036
1336
  **kwargs: Additional parameters passed to the QA engine
1037
1337
 
1038
1338
  Returns:
1039
- A dictionary containing the answer, confidence, and other metadata
1040
- A dictionary containing the answer, confidence, and other metadata
1339
+ List of Dicts, each containing: answer, confidence, found, page_num, source_elements, etc.
1041
1340
  """
1042
1341
  from natural_pdf.qa import get_qa_engine
1043
1342
 
1343
+ if not questions:
1344
+ return []
1345
+
1346
+ if not isinstance(questions, list) or not all(isinstance(q, str) for q in questions):
1347
+ raise TypeError("'questions' must be a list of strings")
1348
+
1044
1349
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
1045
1350
 
1351
+ # Resolve target pages
1046
1352
  if pages is None:
1047
- target_pages = list(range(len(self.pages)))
1353
+ target_pages = self.pages
1048
1354
  elif isinstance(pages, int):
1049
- target_pages = [pages]
1355
+ if 0 <= pages < len(self.pages):
1356
+ target_pages = [self.pages[pages]]
1357
+ else:
1358
+ raise IndexError(f"Page index {pages} out of range (0-{len(self.pages)-1})")
1050
1359
  elif isinstance(pages, (list, range)):
1051
- target_pages = pages
1360
+ target_pages = []
1361
+ for page_idx in pages:
1362
+ if 0 <= page_idx < len(self.pages):
1363
+ target_pages.append(self.pages[page_idx])
1364
+ else:
1365
+ logger.warning(f"Page index {page_idx} out of range, skipping")
1052
1366
  else:
1053
1367
  raise ValueError(f"Invalid pages parameter: {pages}")
1054
1368
 
1055
- results = []
1056
- for page_idx in target_pages:
1057
- if 0 <= page_idx < len(self.pages):
1058
- page = self.pages[page_idx]
1059
- page_result = qa_engine.ask_pdf_page(
1060
- page=page, question=question, min_confidence=min_confidence, **kwargs
1061
- )
1369
+ if not target_pages:
1370
+ logger.warning("No valid pages found for QA processing.")
1371
+ return [
1372
+ {
1373
+ "answer": None,
1374
+ "confidence": 0.0,
1375
+ "found": False,
1376
+ "page_num": None,
1377
+ "source_elements": [],
1378
+ }
1379
+ for _ in questions
1380
+ ]
1062
1381
 
1063
- if page_result and page_result.get("found", False):
1064
- results.append(page_result)
1382
+ logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
1065
1383
 
1066
- results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1384
+ # Collect all page images and metadata for batch processing
1385
+ page_images = []
1386
+ page_word_boxes = []
1387
+ page_metadata = []
1067
1388
 
1068
- if results:
1069
- return results[0]
1070
- else:
1071
- return {
1072
- "answer": None,
1073
- "confidence": 0.0,
1074
- "found": False,
1075
- "page_num": None,
1076
- "source_elements": [],
1077
- }
1389
+ for page in target_pages:
1390
+ # Get page image
1391
+ try:
1392
+ page_image = page.to_image(resolution=150, include_highlights=False)
1393
+ if page_image is None:
1394
+ logger.warning(f"Failed to render image for page {page.number}, skipping")
1395
+ continue
1396
+
1397
+ # Get text elements for word boxes
1398
+ elements = page.find_all("text")
1399
+ if not elements:
1400
+ logger.warning(f"No text elements found on page {page.number}")
1401
+ word_boxes = []
1402
+ else:
1403
+ word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
1404
+
1405
+ page_images.append(page_image)
1406
+ page_word_boxes.append(word_boxes)
1407
+ page_metadata.append({
1408
+ "page_number": page.number,
1409
+ "page_object": page
1410
+ })
1411
+
1412
+ except Exception as e:
1413
+ logger.warning(f"Error processing page {page.number}: {e}")
1414
+ continue
1415
+
1416
+ if not page_images:
1417
+ logger.warning("No page images could be processed for QA.")
1418
+ return [
1419
+ {
1420
+ "answer": None,
1421
+ "confidence": 0.0,
1422
+ "found": False,
1423
+ "page_num": None,
1424
+ "source_elements": [],
1425
+ }
1426
+ for _ in questions
1427
+ ]
1428
+
1429
+ # Process all questions against all pages in batch
1430
+ all_results = []
1431
+
1432
+ for question_text in questions:
1433
+ question_results = []
1434
+
1435
+ # Ask this question against each page (but in batch per page)
1436
+ for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
1437
+ try:
1438
+ # Use the DocumentQA batch interface
1439
+ page_result = qa_engine.ask(
1440
+ image=page_image,
1441
+ question=question_text,
1442
+ word_boxes=word_boxes,
1443
+ min_confidence=min_confidence,
1444
+ **kwargs
1445
+ )
1446
+
1447
+ if page_result and page_result.found:
1448
+ # Add page metadata to result
1449
+ page_result_dict = {
1450
+ "answer": page_result.answer,
1451
+ "confidence": page_result.confidence,
1452
+ "found": page_result.found,
1453
+ "page_num": page_meta["page_number"],
1454
+ "source_elements": getattr(page_result, 'source_elements', []),
1455
+ "start": getattr(page_result, 'start', -1),
1456
+ "end": getattr(page_result, 'end', -1),
1457
+ }
1458
+ question_results.append(page_result_dict)
1459
+
1460
+ except Exception as e:
1461
+ logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
1462
+ continue
1463
+
1464
+ # Sort results by confidence and take the best one for this question
1465
+ question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1466
+
1467
+ if question_results:
1468
+ all_results.append(question_results[0])
1469
+ else:
1470
+ # No results found for this question
1471
+ all_results.append({
1472
+ "answer": None,
1473
+ "confidence": 0.0,
1474
+ "found": False,
1475
+ "page_num": None,
1476
+ "source_elements": [],
1477
+ })
1478
+
1479
+ return all_results
1078
1480
 
1079
1481
  def search_within_index(
1080
1482
  self,
@@ -1519,7 +1921,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1519
1921
 
1520
1922
  if not manager or not manager.is_available():
1521
1923
  from natural_pdf.classification.manager import is_classification_available
1522
-
1924
+
1523
1925
  if not is_classification_available():
1524
1926
  raise ImportError(
1525
1927
  "Classification dependencies missing. "