natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in the supported public registry where they were published. It is provided for informational purposes only.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +188 -82
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +132 -16
  19. natural_pdf/core/pdf.py +486 -71
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +238 -111
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.32.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -7,6 +7,7 @@ import tempfile
7
7
  import threading
8
8
  import time
9
9
  import urllib.request
10
+ import weakref
10
11
  from pathlib import Path
11
12
  from typing import (
12
13
  TYPE_CHECKING,
@@ -23,9 +24,7 @@ from typing import (
23
24
  )
24
25
 
25
26
  import pdfplumber
26
- from PIL import Image
27
27
  from tqdm.auto import tqdm
28
- import weakref
29
28
 
30
29
  from natural_pdf.analyzers.layout.layout_manager import LayoutManager
31
30
  from natural_pdf.classification.manager import ClassificationError
@@ -73,11 +72,14 @@ except ImportError:
73
72
 
74
73
  logger = logging.getLogger("natural_pdf.core.pdf")
75
74
 
75
+
76
76
  def _get_classification_manager_class():
77
77
  """Lazy import for ClassificationManager."""
78
78
  from natural_pdf.classification.manager import ClassificationManager
79
+
79
80
  return ClassificationManager
80
81
 
82
+
81
83
  DEFAULT_MANAGERS = {
82
84
  "classification": _get_classification_manager_class,
83
85
  "structured_data": StructuredDataManager,
@@ -100,20 +102,51 @@ except ImportError:
100
102
  # --- Lazy Page List Helper --- #
101
103
  from collections.abc import Sequence
102
104
 
105
+
103
106
  class _LazyPageList(Sequence):
104
107
  """A lightweight, list-like object that lazily instantiates natural-pdf Page objects.
105
108
 
109
+ This class implements the Sequence protocol to provide list-like access to PDF pages
110
+ while minimizing memory usage. Pages are only created when accessed, and once created,
111
+ they are cached for subsequent access. This design allows efficient handling of large
112
+ PDF documents without loading all pages into memory immediately.
113
+
106
114
  The sequence holds `None` placeholders until an index is accessed, at which point
107
- a real `Page` object is created, cached, and returned. Slices and iteration are
108
- also supported and will materialise pages on demand.
115
+ a real `Page` object is created, cached, and returned. Slices and iteration are
116
+ also supported and will materialize pages on demand.
117
+
118
+ Attributes:
119
+ _parent_pdf: Reference to the parent PDF object.
120
+ _plumber_pdf: Underlying pdfplumber PDF object.
121
+ _font_attrs: Font attributes to use when creating pages.
122
+ _cache: List of cached Page objects (None until accessed).
123
+ _load_text: Whether to load text layer when creating pages.
124
+
125
+ Example:
126
+ ```python
127
+ # Access is transparent - pages created on demand
128
+ pdf = npdf.PDF("document.pdf")
129
+ first_page = pdf.pages[0] # Creates Page object here
130
+ last_page = pdf.pages[-1] # Creates another Page object
131
+
132
+ # Slicing works too
133
+ first_three = pdf.pages[0:3] # Creates 3 Page objects
134
+
135
+ # Iteration creates all pages
136
+ for page in pdf.pages: # Each page created as needed
137
+ print(f"Page {page.index}")
138
+ ```
109
139
  """
110
140
 
111
- def __init__(self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None):
141
+ def __init__(
142
+ self, parent_pdf: "PDF", plumber_pdf: "pdfplumber.PDF", font_attrs=None, load_text=True
143
+ ):
112
144
  self._parent_pdf = parent_pdf
113
145
  self._plumber_pdf = plumber_pdf
114
146
  self._font_attrs = font_attrs
115
147
  # One slot per pdfplumber page – initially all None
116
148
  self._cache: List[Optional["Page"]] = [None] * len(self._plumber_pdf.pages)
149
+ self._load_text = load_text
117
150
 
118
151
  # Internal helper -----------------------------------------------------
119
152
  def _create_page(self, index: int) -> "Page":
@@ -123,7 +156,13 @@ class _LazyPageList(Sequence):
123
156
  from natural_pdf.core.page import Page
124
157
 
125
158
  plumber_page = self._plumber_pdf.pages[index]
126
- cached = Page(plumber_page, parent=self._parent_pdf, index=index, font_attrs=self._font_attrs)
159
+ cached = Page(
160
+ plumber_page,
161
+ parent=self._parent_pdf,
162
+ index=index,
163
+ font_attrs=self._font_attrs,
164
+ load_text=self._load_text,
165
+ )
127
166
  self._cache[index] = cached
128
167
  return cached
129
168
 
@@ -152,14 +191,44 @@ class _LazyPageList(Sequence):
152
191
  def __repr__(self) -> str: # pragma: no cover
153
192
  return f"<_LazyPageList(len={len(self)})>"
154
193
 
194
+
155
195
  # --- End Lazy Page List Helper --- #
156
196
 
197
+
157
198
  class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
158
- """
159
- Enhanced PDF wrapper built on top of pdfplumber.
199
+ """Enhanced PDF wrapper built on top of pdfplumber.
160
200
 
161
201
  This class provides a fluent interface for working with PDF documents,
162
- with improved selection, navigation, and extraction capabilities.
202
+ with improved selection, navigation, and extraction capabilities. It integrates
203
+ OCR, layout analysis, and AI-powered data extraction features while maintaining
204
+ compatibility with the underlying pdfplumber API.
205
+
206
+ The PDF class supports loading from files, URLs, or streams, and provides
207
+ spatial navigation, element selection with CSS-like selectors, and advanced
208
+ document processing workflows including multi-page content flows.
209
+
210
+ Attributes:
211
+ pages: Lazy-loaded list of Page objects for document pages.
212
+ path: Resolved path to the PDF file or source identifier.
213
+ source_path: Original path, URL, or stream identifier provided during initialization.
214
+ highlighter: Service for rendering highlighted visualizations of document content.
215
+
216
+ Example:
217
+ Basic usage:
218
+ ```python
219
+ import natural_pdf as npdf
220
+
221
+ pdf = npdf.PDF("document.pdf")
222
+ page = pdf.pages[0]
223
+ text_elements = page.find_all('text:contains("Summary")')
224
+ ```
225
+
226
+ Advanced usage with OCR:
227
+ ```python
228
+ pdf = npdf.PDF("scanned_document.pdf")
229
+ pdf.apply_ocr(engine="easyocr", resolution=144)
230
+ tables = pdf.pages[0].find_all('table')
231
+ ```
163
232
  """
164
233
 
165
234
  def __init__(
@@ -170,22 +239,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
170
239
  keep_spaces: bool = True,
171
240
  text_tolerance: Optional[dict] = None,
172
241
  auto_text_tolerance: bool = True,
242
+ text_layer: bool = True,
173
243
  ):
174
- """
175
- Initialize the enhanced PDF object.
244
+ """Initialize the enhanced PDF object.
176
245
 
177
246
  Args:
178
- path_or_url_or_stream: Path to the PDF file, a URL, or a file-like object (stream).
179
- reading_order: Whether to use natural reading order
180
- font_attrs: Font attributes for grouping characters into words
181
- keep_spaces: Whether to include spaces in word elements
182
- text_tolerance: PDFplumber-style tolerance settings
183
- auto_text_tolerance: Whether to automatically scale text tolerance
247
+ path_or_url_or_stream: Path to the PDF file (str/Path), a URL (str),
248
+ or a file-like object (stream). URLs must start with 'http://' or 'https://'.
249
+ reading_order: If True, use natural reading order for text extraction.
250
+ Defaults to True.
251
+ font_attrs: List of font attributes for grouping characters into words.
252
+ Common attributes include ['fontname', 'size']. Defaults to None.
253
+ keep_spaces: If True, include spaces in word elements during text extraction.
254
+ Defaults to True.
255
+ text_tolerance: PDFplumber-style tolerance settings for text grouping.
256
+ Dictionary with keys like 'x_tolerance', 'y_tolerance'. Defaults to None.
257
+ auto_text_tolerance: If True, automatically scale text tolerance based on
258
+ font size and document characteristics. Defaults to True.
259
+ text_layer: If True, preserve existing text layer from the PDF. If False,
260
+ removes all existing text elements during initialization, useful for
261
+ OCR-only workflows. Defaults to True.
262
+
263
+ Raises:
264
+ TypeError: If path_or_url_or_stream is not a valid type.
265
+ IOError: If the PDF file cannot be opened or read.
266
+ ValueError: If URL download fails.
267
+
268
+ Example:
269
+ ```python
270
+ # From file path
271
+ pdf = npdf.PDF("document.pdf")
272
+
273
+ # From URL
274
+ pdf = npdf.PDF("https://example.com/document.pdf")
275
+
276
+ # From stream
277
+ with open("document.pdf", "rb") as f:
278
+ pdf = npdf.PDF(f)
279
+
280
+ # With custom settings
281
+ pdf = npdf.PDF("document.pdf",
282
+ reading_order=False,
283
+ text_layer=False, # For OCR-only processing
284
+ font_attrs=['fontname', 'size', 'flags'])
285
+ ```
184
286
  """
185
287
  self._original_path_or_stream = path_or_url_or_stream
186
288
  self._temp_file = None
187
289
  self._resolved_path = None
188
290
  self._is_stream = False
291
+ self._text_layer = text_layer
189
292
  stream_to_open = None
190
293
 
191
294
  if hasattr(path_or_url_or_stream, "read"): # Check if it's file-like
@@ -257,7 +360,9 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
257
360
  self._manager_registry = {}
258
361
 
259
362
  # Lazily instantiate pages only when accessed
260
- self._pages = _LazyPageList(self, self._pdf, font_attrs=font_attrs)
363
+ self._pages = _LazyPageList(
364
+ self, self._pdf, font_attrs=font_attrs, load_text=self._text_layer
365
+ )
261
366
 
262
367
  self._element_cache = {}
263
368
  self._exclusions = []
@@ -267,6 +372,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
267
372
 
268
373
  self._initialize_managers()
269
374
  self._initialize_highlighter()
375
+
376
+ # Remove text layer if requested
377
+ if not self._text_layer:
378
+ logger.info("Removing text layer as requested (text_layer=False)")
379
+ # Text layer is not loaded when text_layer=False, so no need to remove
380
+ pass
381
+
270
382
  # Analysis results accessed via self.analyses property (see below)
271
383
 
272
384
  # --- Automatic cleanup when object is garbage-collected ---
@@ -303,7 +415,30 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
303
415
  self._managers = {} # Will hold instantiated managers
304
416
 
305
417
  def get_manager(self, key: str) -> Any:
306
- """Retrieve a manager instance by its key, instantiating it lazily if needed."""
418
+ """Retrieve a manager instance by its key, instantiating it lazily if needed.
419
+
420
+ Managers are specialized components that handle specific functionality like
421
+ classification, structured data extraction, or OCR processing. They are
422
+ instantiated on-demand to minimize memory usage and startup time.
423
+
424
+ Args:
425
+ key: The manager key to retrieve. Common keys include 'classification'
426
+ and 'structured_data'.
427
+
428
+ Returns:
429
+ The manager instance for the specified key.
430
+
431
+ Raises:
432
+ KeyError: If no manager is registered for the given key.
433
+ RuntimeError: If the manager failed to initialize.
434
+
435
+ Example:
436
+ ```python
437
+ pdf = npdf.PDF("document.pdf")
438
+ classification_mgr = pdf.get_manager('classification')
439
+ structured_data_mgr = pdf.get_manager('structured_data')
440
+ ```
441
+ """
307
442
  # Check if already instantiated
308
443
  if key in self._managers:
309
444
  manager_instance = self._managers[key]
@@ -339,12 +474,56 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
339
474
 
340
475
  @property
341
476
  def metadata(self) -> Dict[str, Any]:
342
- """Access metadata as a dictionary."""
477
+ """Access PDF metadata as a dictionary.
478
+
479
+ Returns document metadata such as title, author, creation date, and other
480
+ properties embedded in the PDF file. The exact keys available depend on
481
+ what metadata was included when the PDF was created.
482
+
483
+ Returns:
484
+ Dictionary containing PDF metadata. Common keys include 'Title',
485
+ 'Author', 'Subject', 'Creator', 'Producer', 'CreationDate', and
486
+ 'ModDate'. May be empty if no metadata is available.
487
+
488
+ Example:
489
+ ```python
490
+ pdf = npdf.PDF("document.pdf")
491
+ print(pdf.metadata.get('Title', 'No title'))
492
+ print(f"Created: {pdf.metadata.get('CreationDate')}")
493
+ ```
494
+ """
343
495
  return self._pdf.metadata
344
496
 
345
497
  @property
346
498
  def pages(self) -> "PageCollection":
347
- """Access pages as a PageCollection object."""
499
+ """Access pages as a PageCollection object.
500
+
501
+ Provides access to individual pages of the PDF document through a
502
+ collection interface that supports indexing, slicing, and iteration.
503
+ Pages are lazy-loaded to minimize memory usage.
504
+
505
+ Returns:
506
+ PageCollection object that provides list-like access to PDF pages.
507
+
508
+ Raises:
509
+ AttributeError: If PDF pages are not yet initialized.
510
+
511
+ Example:
512
+ ```python
513
+ pdf = npdf.PDF("document.pdf")
514
+
515
+ # Access individual pages
516
+ first_page = pdf.pages[0]
517
+ last_page = pdf.pages[-1]
518
+
519
+ # Slice pages
520
+ first_three = pdf.pages[0:3]
521
+
522
+ # Iterate over pages
523
+ for page in pdf.pages:
524
+ print(f"Page {page.index} has {len(page.chars)} characters")
525
+ ```
526
+ """
348
527
  from natural_pdf.elements.collections import PageCollection
349
528
 
350
529
  if not hasattr(self, "_pages"):
@@ -352,11 +531,26 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
352
531
  return PageCollection(self._pages)
353
532
 
354
533
  def clear_exclusions(self) -> "PDF":
355
- """
356
- Clear all exclusion functions from the PDF.
534
+ """Clear all exclusion functions from the PDF.
535
+
536
+ Removes all previously added exclusion functions that were used to filter
537
+ out unwanted content (like headers, footers, or administrative text) from
538
+ text extraction and analysis operations.
357
539
 
358
540
  Returns:
359
- Self for method chaining
541
+ Self for method chaining.
542
+
543
+ Raises:
544
+ AttributeError: If PDF pages are not yet initialized.
545
+
546
+ Example:
547
+ ```python
548
+ pdf = npdf.PDF("document.pdf")
549
+ pdf.add_exclusion(lambda page: page.find('text:contains("CONFIDENTIAL")').above())
550
+
551
+ # Later, remove all exclusions
552
+ pdf.clear_exclusions()
553
+ ```
360
554
  """
361
555
  if not hasattr(self, "_pages"):
362
556
  raise AttributeError("PDF pages not yet initialized.")
@@ -369,16 +563,46 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
369
563
  def add_exclusion(
370
564
  self, exclusion_func: Callable[["Page"], Optional["Region"]], label: str = None
371
565
  ) -> "PDF":
372
- """
373
- Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.
566
+ """Add an exclusion function to the PDF.
567
+
568
+ Exclusion functions define regions of each page that should be ignored during
569
+ text extraction and analysis operations. This is useful for filtering out headers,
570
+ footers, watermarks, or other administrative content that shouldn't be included
571
+ in the main document processing.
374
572
 
375
573
  Args:
376
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None
377
- exclusion_func: A function that takes a Page and returns a Region to exclude, or None
378
- label: Optional label for this exclusion
574
+ exclusion_func: A function that takes a Page object and returns a Region
575
+ to exclude from processing, or None if no exclusion should be applied
576
+ to that page. The function is called once per page.
577
+ label: Optional descriptive label for this exclusion rule, useful for
578
+ debugging and identification.
379
579
 
380
580
  Returns:
381
- Self for method chaining
581
+ Self for method chaining.
582
+
583
+ Raises:
584
+ AttributeError: If PDF pages are not yet initialized.
585
+
586
+ Example:
587
+ ```python
588
+ pdf = npdf.PDF("document.pdf")
589
+
590
+ # Exclude headers (top 50 points of each page)
591
+ pdf.add_exclusion(
592
+ lambda page: page.region(0, 0, page.width, 50),
593
+ label="header_exclusion"
594
+ )
595
+
596
+ # Exclude any text containing "CONFIDENTIAL"
597
+ pdf.add_exclusion(
598
+ lambda page: page.find('text:contains("CONFIDENTIAL")').above(include_source=True)
599
+ if page.find('text:contains("CONFIDENTIAL")') else None,
600
+ label="confidential_exclusion"
601
+ )
602
+
603
+ # Chain multiple exclusions
604
+ pdf.add_exclusion(header_func).add_exclusion(footer_func)
605
+ ```
382
606
  """
383
607
  if not hasattr(self, "_pages"):
384
608
  raise AttributeError("PDF pages not yet initialized.")
@@ -404,23 +628,74 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
404
628
  options: Optional[Any] = None,
405
629
  pages: Optional[Union[Iterable[int], range, slice]] = None,
406
630
  ) -> "PDF":
407
- """
408
- Applies OCR to specified pages of the PDF using batch processing.
631
+ """Apply OCR to specified pages of the PDF using batch processing.
632
+
633
+ Performs optical character recognition on the specified pages, converting
634
+ image-based text into searchable and extractable text elements. This method
635
+ supports multiple OCR engines and provides batch processing for efficiency.
409
636
 
410
637
  Args:
411
- engine: Name of the OCR engine
412
- languages: List of language codes
413
- min_confidence: Minimum confidence threshold
414
- device: Device to run OCR on
415
- resolution: DPI resolution for page images
416
- apply_exclusions: Whether to mask excluded areas
417
- detect_only: If True, only detect text boxes
418
- replace: Whether to replace existing OCR elements
419
- options: Engine-specific options
420
- pages: Page indices to process or None for all pages
638
+ engine: Name of the OCR engine to use. Supported engines include
639
+ 'easyocr' (default), 'surya', 'paddle', and 'doctr'. If None,
640
+ uses the global default from natural_pdf.options.ocr.engine.
641
+ languages: List of language codes for OCR recognition (e.g., ['en', 'es']).
642
+ If None, uses the global default from natural_pdf.options.ocr.languages.
643
+ min_confidence: Minimum confidence threshold (0.0-1.0) for accepting
644
+ OCR results. Text with lower confidence will be filtered out.
645
+ If None, uses the global default.
646
+ device: Device to run OCR on ('cpu', 'cuda', 'mps'). Engine-specific
647
+ availability varies. If None, uses engine defaults.
648
+ resolution: DPI resolution for rendering pages to images before OCR.
649
+ Higher values improve accuracy but increase processing time and memory.
650
+ Typical values: 150 (fast), 300 (balanced), 600 (high quality).
651
+ apply_exclusions: If True, mask excluded regions before OCR to prevent
652
+ processing of headers, footers, or other unwanted content.
653
+ detect_only: If True, only detect text bounding boxes without performing
654
+ character recognition. Useful for layout analysis workflows.
655
+ replace: If True, replace any existing OCR elements on the pages.
656
+ If False, append new OCR results to existing elements.
657
+ options: Engine-specific options object (e.g., EasyOCROptions, SuryaOptions).
658
+ Allows fine-tuning of engine behavior beyond common parameters.
659
+ pages: Page indices to process. Can be:
660
+ - None: Process all pages
661
+ - slice: Process a range of pages (e.g., slice(0, 10))
662
+ - Iterable[int]: Process specific page indices (e.g., [0, 2, 5])
421
663
 
422
664
  Returns:
423
- Self for method chaining
665
+ Self for method chaining.
666
+
667
+ Raises:
668
+ ValueError: If invalid page index is provided.
669
+ TypeError: If pages parameter has invalid type.
670
+ RuntimeError: If OCR engine is not available or fails.
671
+
672
+ Example:
673
+ ```python
674
+ pdf = npdf.PDF("scanned_document.pdf")
675
+
676
+ # Basic OCR on all pages
677
+ pdf.apply_ocr()
678
+
679
+ # High-quality OCR with specific settings
680
+ pdf.apply_ocr(
681
+ engine='easyocr',
682
+ languages=['en', 'es'],
683
+ resolution=300,
684
+ min_confidence=0.8
685
+ )
686
+
687
+ # OCR specific pages only
688
+ pdf.apply_ocr(pages=[0, 1, 2]) # First 3 pages
689
+ pdf.apply_ocr(pages=slice(5, 10)) # Pages 5-9
690
+
691
+ # Detection-only workflow for layout analysis
692
+ pdf.apply_ocr(detect_only=True, resolution=150)
693
+ ```
694
+
695
+ Note:
696
+ OCR processing can be time and memory intensive, especially at high
697
+ resolutions. Consider using exclusions to mask unwanted regions and
698
+ processing pages in batches for large documents.
424
699
  """
425
700
  if not self._ocr_manager:
426
701
  logger.error("OCRManager not available. Cannot apply OCR.")
@@ -1013,10 +1288,47 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1013
1288
  **kwargs,
1014
1289
  ) -> Dict[str, Any]:
1015
1290
  """
1016
- Ask a question about the document content.
1291
+ Ask a single question about the document content.
1292
+
1293
+ Args:
1294
+ question: Question string to ask about the document
1295
+ mode: "extractive" to extract answer from document, "generative" to generate
1296
+ pages: Specific pages to query (default: all pages)
1297
+ min_confidence: Minimum confidence threshold for answers
1298
+ model: Optional model name for question answering
1299
+ **kwargs: Additional parameters passed to the QA engine
1300
+
1301
+ Returns:
1302
+ Dict containing: answer, confidence, found, page_num, source_elements, etc.
1303
+ """
1304
+ # Delegate to ask_batch and return the first result
1305
+ results = self.ask_batch([question], mode=mode, pages=pages, min_confidence=min_confidence, model=model, **kwargs)
1306
+ return results[0] if results else {
1307
+ "answer": None,
1308
+ "confidence": 0.0,
1309
+ "found": False,
1310
+ "page_num": None,
1311
+ "source_elements": [],
1312
+ }
1313
+
1314
+ def ask_batch(
1315
+ self,
1316
+ questions: List[str],
1317
+ mode: str = "extractive",
1318
+ pages: Union[int, List[int], range] = None,
1319
+ min_confidence: float = 0.1,
1320
+ model: str = None,
1321
+ **kwargs,
1322
+ ) -> List[Dict[str, Any]]:
1323
+ """
1324
+ Ask multiple questions about the document content using batch processing.
1325
+
1326
+ This method processes multiple questions efficiently in a single batch,
1327
+ avoiding the multiprocessing resource accumulation that can occur with
1328
+ sequential individual question calls.
1017
1329
 
1018
1330
  Args:
1019
- question: Question to ask about the document
1331
+ questions: List of question strings to ask about the document
1020
1332
  mode: "extractive" to extract answer from document, "generative" to generate
1021
1333
  pages: Specific pages to query (default: all pages)
1022
1334
  min_confidence: Minimum confidence threshold for answers
@@ -1024,45 +1336,147 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1024
1336
  **kwargs: Additional parameters passed to the QA engine
1025
1337
 
1026
1338
  Returns:
1027
- A dictionary containing the answer, confidence, and other metadata
1028
- A dictionary containing the answer, confidence, and other metadata
1339
+ List of Dicts, each containing: answer, confidence, found, page_num, source_elements, etc.
1029
1340
  """
1030
1341
  from natural_pdf.qa import get_qa_engine
1031
1342
 
1343
+ if not questions:
1344
+ return []
1345
+
1346
+ if not isinstance(questions, list) or not all(isinstance(q, str) for q in questions):
1347
+ raise TypeError("'questions' must be a list of strings")
1348
+
1032
1349
  qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)
1033
1350
 
1351
+ # Resolve target pages
1034
1352
  if pages is None:
1035
- target_pages = list(range(len(self.pages)))
1353
+ target_pages = self.pages
1036
1354
  elif isinstance(pages, int):
1037
- target_pages = [pages]
1355
+ if 0 <= pages < len(self.pages):
1356
+ target_pages = [self.pages[pages]]
1357
+ else:
1358
+ raise IndexError(f"Page index {pages} out of range (0-{len(self.pages)-1})")
1038
1359
  elif isinstance(pages, (list, range)):
1039
- target_pages = pages
1360
+ target_pages = []
1361
+ for page_idx in pages:
1362
+ if 0 <= page_idx < len(self.pages):
1363
+ target_pages.append(self.pages[page_idx])
1364
+ else:
1365
+ logger.warning(f"Page index {page_idx} out of range, skipping")
1040
1366
  else:
1041
1367
  raise ValueError(f"Invalid pages parameter: {pages}")
1042
1368
 
1043
- results = []
1044
- for page_idx in target_pages:
1045
- if 0 <= page_idx < len(self.pages):
1046
- page = self.pages[page_idx]
1047
- page_result = qa_engine.ask_pdf_page(
1048
- page=page, question=question, min_confidence=min_confidence, **kwargs
1049
- )
1369
+ if not target_pages:
1370
+ logger.warning("No valid pages found for QA processing.")
1371
+ return [
1372
+ {
1373
+ "answer": None,
1374
+ "confidence": 0.0,
1375
+ "found": False,
1376
+ "page_num": None,
1377
+ "source_elements": [],
1378
+ }
1379
+ for _ in questions
1380
+ ]
1050
1381
 
1051
- if page_result and page_result.get("found", False):
1052
- results.append(page_result)
1382
+ logger.info(f"Processing {len(questions)} question(s) across {len(target_pages)} page(s) using batch QA...")
1053
1383
 
1054
- results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1384
+ # Collect all page images and metadata for batch processing
1385
+ page_images = []
1386
+ page_word_boxes = []
1387
+ page_metadata = []
1055
1388
 
1056
- if results:
1057
- return results[0]
1058
- else:
1059
- return {
1060
- "answer": None,
1061
- "confidence": 0.0,
1062
- "found": False,
1063
- "page_num": None,
1064
- "source_elements": [],
1065
- }
1389
+ for page in target_pages:
1390
+ # Get page image
1391
+ try:
1392
+ page_image = page.to_image(resolution=150, include_highlights=False)
1393
+ if page_image is None:
1394
+ logger.warning(f"Failed to render image for page {page.number}, skipping")
1395
+ continue
1396
+
1397
+ # Get text elements for word boxes
1398
+ elements = page.find_all("text")
1399
+ if not elements:
1400
+ logger.warning(f"No text elements found on page {page.number}")
1401
+ word_boxes = []
1402
+ else:
1403
+ word_boxes = qa_engine._get_word_boxes_from_elements(elements, offset_x=0, offset_y=0)
1404
+
1405
+ page_images.append(page_image)
1406
+ page_word_boxes.append(word_boxes)
1407
+ page_metadata.append({
1408
+ "page_number": page.number,
1409
+ "page_object": page
1410
+ })
1411
+
1412
+ except Exception as e:
1413
+ logger.warning(f"Error processing page {page.number}: {e}")
1414
+ continue
1415
+
1416
+ if not page_images:
1417
+ logger.warning("No page images could be processed for QA.")
1418
+ return [
1419
+ {
1420
+ "answer": None,
1421
+ "confidence": 0.0,
1422
+ "found": False,
1423
+ "page_num": None,
1424
+ "source_elements": [],
1425
+ }
1426
+ for _ in questions
1427
+ ]
1428
+
1429
+ # Process all questions against all pages in batch
1430
+ all_results = []
1431
+
1432
+ for question_text in questions:
1433
+ question_results = []
1434
+
1435
+ # Ask this question against each page (but in batch per page)
1436
+ for i, (page_image, word_boxes, page_meta) in enumerate(zip(page_images, page_word_boxes, page_metadata)):
1437
+ try:
1438
+ # Use the DocumentQA batch interface
1439
+ page_result = qa_engine.ask(
1440
+ image=page_image,
1441
+ question=question_text,
1442
+ word_boxes=word_boxes,
1443
+ min_confidence=min_confidence,
1444
+ **kwargs
1445
+ )
1446
+
1447
+ if page_result and page_result.found:
1448
+ # Add page metadata to result
1449
+ page_result_dict = {
1450
+ "answer": page_result.answer,
1451
+ "confidence": page_result.confidence,
1452
+ "found": page_result.found,
1453
+ "page_num": page_meta["page_number"],
1454
+ "source_elements": getattr(page_result, 'source_elements', []),
1455
+ "start": getattr(page_result, 'start', -1),
1456
+ "end": getattr(page_result, 'end', -1),
1457
+ }
1458
+ question_results.append(page_result_dict)
1459
+
1460
+ except Exception as e:
1461
+ logger.warning(f"Error processing question '{question_text}' on page {page_meta['page_number']}: {e}")
1462
+ continue
1463
+
1464
+ # Sort results by confidence and take the best one for this question
1465
+ question_results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
1466
+
1467
+ if question_results:
1468
+ all_results.append(question_results[0])
1469
+ else:
1470
+ # No results found for this question
1471
+ all_results.append({
1472
+ "answer": None,
1473
+ "confidence": 0.0,
1474
+ "found": False,
1475
+ "page_num": None,
1476
+ "source_elements": [],
1477
+ })
1478
+
1479
+ return all_results
1066
1480
 
1067
1481
  def search_within_index(
1068
1482
  self,
@@ -1463,6 +1877,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1463
1877
  reading_order=self._reading_order,
1464
1878
  font_attrs=self._font_attrs,
1465
1879
  keep_spaces=self._config.get("keep_spaces", True),
1880
+ text_layer=self._text_layer,
1466
1881
  )
1467
1882
  return new_pdf
1468
1883
  except Exception as e:
@@ -1506,7 +1921,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1506
1921
 
1507
1922
  if not manager or not manager.is_available():
1508
1923
  from natural_pdf.classification.manager import is_classification_available
1509
-
1924
+
1510
1925
  if not is_classification_available():
1511
1926
  raise ImportError(
1512
1927
  "Classification dependencies missing. "