natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,572 @@
1
+ import pdfplumber
2
+ import logging
3
+ from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type
4
+
5
+ from natural_pdf.core.page import Page
6
+ from natural_pdf.selectors.parser import parse_selector
7
+ from natural_pdf.elements.collections import ElementCollection
8
+ from natural_pdf.elements.region import Region
9
+ from natural_pdf.utils.ocr import OCRManager
10
+
11
# Logger shared by everything defined in this module.
logger = logging.getLogger("natural_pdf.core.pdf")

# The OCR engine classes are optional. Record whether they could be imported
# so that PDF can fall back to the legacy OCR manager when they could not.
try:
    from natural_pdf.ocr import OCREngine, EasyOCREngine, PaddleOCREngine, get_engine
except ImportError:
    HAS_OCR_ENGINES = False
else:
    HAS_OCR_ENGINES = True
21
+
22
+
23
+ class PDF:
24
+ """
25
+ Enhanced PDF wrapper built on top of pdfplumber.
26
+
27
+ This class provides a fluent interface for working with PDF documents,
28
+ with improved selection, navigation, and extraction capabilities.
29
+ """
30
+
31
def __init__(self, path: str, reading_order: bool = True,
             ocr: Optional[Union[bool, str, List, Dict]] = None,
             ocr_engine: Optional[Union[str, Any]] = None,
             font_attrs: Optional[List[str]] = None,
             keep_spaces: bool = True):
    """
    Initialize the enhanced PDF object.

    Args:
        path: Path to the PDF file
        reading_order: Whether to use natural reading order
        ocr: OCR configuration:
            - None or False: OCR disabled
            - True: OCR enabled with defaults
            - {"languages": ["en"]}: Detailed configuration. When an OCR
              engine is in use, a dict without an "enabled" key is treated
              as disabled. The caller's dict is never modified.
            - Any other value (e.g. "auto" or ["en", "fr"]) is handed
              unchanged to the engine's / manager's normalize_config.
        ocr_engine: OCR engine to use:
            - None: Use the default engine (EasyOCR)
            - "easyocr" / "paddleocr": Engine looked up by name via get_engine
            - Engine instance: any object exposing process_image and
              is_available is used as-is
            Anything else, or a name that cannot be loaded, falls back to
            EasyOCR with a logged warning.
        font_attrs: Font attributes to consider when grouping characters
            into words (e.g., ['fontname', 'size', 'color']). The value,
            including the default None, is forwarded unchanged to every Page.
        keep_spaces: Whether to include spaces in word elements (default: True).
            True: Spaces are part of words, better for multi-word searching
            False: Break text at spaces, each word is separate (legacy behavior)
    """
    logger.info(f"Initializing PDF from {path}")
    logger.debug(f"Parameters: reading_order={reading_order}, ocr={ocr}, ocr_engine={ocr_engine}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")

    self._pdf = pdfplumber.open(path)
    try:
        self._path = path
        self._reading_order = reading_order
        self._config = {
            'keep_spaces': keep_spaces
        }

        # --- Select the OCR engine ---------------------------------------
        if not HAS_OCR_ENGINES:
            # Engine classes could not be imported: the legacy OCR manager
            # is used further down.
            self._ocr_engine = None
        elif ocr_engine is None:
            # Default engine (EasyOCR)
            self._ocr_engine = EasyOCREngine()
        elif isinstance(ocr_engine, str):
            # String-based engine selection
            try:
                self._ocr_engine = get_engine(ocr_engine)
            except (ImportError, ValueError) as e:
                # Reported through the module logger, as with_ocr() does.
                logger.warning(f"OCR engine '{ocr_engine}' could not be loaded: {e}")
                logger.warning("Falling back to default OCR engine.")
                self._ocr_engine = EasyOCREngine()
        elif hasattr(ocr_engine, 'process_image') and hasattr(ocr_engine, 'is_available'):
            # Duck-typed engine instance
            self._ocr_engine = ocr_engine
        else:
            logger.warning("Invalid OCR engine provided. Using default engine.")
            self._ocr_engine = EasyOCREngine()

        # --- Normalize the OCR configuration -----------------------------
        if self._ocr_engine:
            if ocr is None or ocr is False:
                # OCR is off unless explicitly requested
                ocr = {"enabled": False}
            elif ocr is True:
                ocr = {"enabled": True}
            elif isinstance(ocr, dict) and "enabled" not in ocr:
                # Work on a copy so the caller's dict is left untouched.
                ocr = {**ocr, "enabled": False}

            self._ocr_config = self._ocr_engine.normalize_config(ocr)
            logger.info(f"Initialized PDF with OCR engine: {self._ocr_engine.__class__.__name__}, enabled: {self._ocr_config.get('enabled')}")

            # Surface a normalization step that flips the enabled flag.
            if isinstance(ocr, dict) and "enabled" in ocr:
                if ocr["enabled"] != self._ocr_config.get("enabled"):
                    logger.warning(f"OCR enabled status changed during normalization: {ocr['enabled']} -> {self._ocr_config.get('enabled')}")
        else:
            # Fallback to legacy OCR manager
            self._ocr_manager = OCRManager.get_instance()
            if ocr is None:
                ocr = {"enabled": False}
            elif ocr is True:
                ocr = {"enabled": True}

            self._ocr_config = self._ocr_manager.normalize_config(ocr)

        self._font_attrs = font_attrs  # Store the font attribute configuration
        self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
        self._element_cache = {}
        self._exclusions = []  # List to store exclusion functions/regions
        self._regions = []  # List to store region functions/definitions
    except Exception:
        # Do not leak the open file handle when setup fails part-way.
        self._pdf.close()
        raise
136
+
137
@property
def pages(self) -> 'PageCollection':
    """Return the document's pages wrapped in a newly built PageCollection."""
    # Imported at call time rather than at module load.
    from natural_pdf.elements.collections import PageCollection

    wrapped = PageCollection(self._pages)
    return wrapped
142
+
143
def with_ocr(self, enabled: bool = False, languages: List[str] = None,
             engine: str = None, min_confidence: float = None) -> 'PDF':
    """
    Replace the document's OCR configuration (builder style).

    Args:
        enabled: Whether OCR is enabled (default: False)
        languages: List of language codes (e.g., ["en", "fr"]); omitted
            from the configuration when empty or None
        engine: Name of the OCR engine to switch to ("easyocr" or
            "paddleocr"); the current engine is kept when not given
        min_confidence: Minimum confidence threshold for OCR results;
            omitted from the configuration when None

    Returns:
        Self for method chaining
    """
    settings = {"enabled": enabled}
    if languages:
        settings["languages"] = languages
    if min_confidence is not None:
        settings["min_confidence"] = min_confidence

    if engine:
        # Drop the current engine before trying to load the requested one.
        self._ocr_engine = None
        try:
            from natural_pdf.ocr import get_engine
            self._ocr_engine = get_engine(engine)
        except (ImportError, ValueError) as e:
            logger.warning(f"OCR engine '{engine}' could not be loaded: {e}")
            logger.warning("Falling back to default OCR engine.")
            from natural_pdf.ocr import EasyOCREngine
            self._ocr_engine = EasyOCREngine()

    # Normalize with the engine when there is one, else the legacy manager.
    normalizer = self._ocr_engine
    if not normalizer:
        from natural_pdf.utils.ocr import OCRManager
        self._ocr_manager = OCRManager.get_instance()
        normalizer = self._ocr_manager
    self._ocr_config = normalizer.normalize_config(settings)

    return self
187
+
188
def add_exclusion(self, exclusion_func: Callable[[Page], Region], label: str = None) -> 'PDF':
    """
    Register a document-wide exclusion and attach it to every page.

    Args:
        exclusion_func: A function that takes a Page and returns a Region to exclude
        label: Optional label, stored with the function in the PDF-level
            exclusion list

    Returns:
        Self for method chaining
    """
    # Remember the exclusion (and its label) at PDF level.
    self._exclusions.append((exclusion_func, label))

    def _guarded(page):
        # Evaluate the user function for one page; a failure is reported
        # and turned into "no region" instead of propagating.
        try:
            return exclusion_func(page)
        except Exception as e:
            print(f"Error in PDF-level exclusion for page {page.index}: {e}")
        return None

    # The same guarded callable is handed to every page.
    for pg in self._pages:
        pg.add_exclusion(_guarded)

    return self
217
+
218
def add_region(self, region_func: Callable[[Page], Region], name: str = None) -> 'PDF':
    """
    Register a document-wide region function and create its region on every page.

    Args:
        region_func: A function that takes a Page and returns a Region
        name: Optional name; when given it is written to each produced
            region, whose source is then set to 'named'

    Returns:
        Self for method chaining
    """
    # Remember the region function (and its name) at PDF level.
    self._regions.append((region_func, name))

    def _build(page):
        # Evaluate the user function for one page; failures are reported
        # and yield no region.
        try:
            built = region_func(page)
            if not built:
                return None
            if name:
                built.name = name
                built.source = 'named'
            return built
        except Exception as e:
            print(f"Error in PDF-level region for page {page.index}: {e}")
            return None

    # Attach whatever region was produced to its page.
    for pg in self._pages:
        try:
            made = _build(pg)
            if made:
                pg.add_region(made, name=name)
        except Exception as e:
            print(f"Error adding region to page {pg.index}: {e}")

    return self
257
+
258
def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
    """
    Return the first element in the document that matches the selector.

    Args:
        selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        Element object or None if not found
    """
    parsed = parse_selector(selector)

    # The regex/case flags travel to the selector machinery as plain kwargs.
    kwargs.update(regex=regex, case=case)

    matches = self._apply_selector(parsed, apply_exclusions=apply_exclusions, **kwargs)
    if not matches:
        return None
    return matches.first
280
+
281
def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> ElementCollection:
    """
    Return every element in the document that matches the selector.

    Args:
        selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Additional filter parameters

    Returns:
        ElementCollection with matching elements
    """
    # The regex/case flags travel to the selector machinery as plain kwargs.
    options = {**kwargs, 'regex': regex, 'case': case}
    return self._apply_selector(
        parse_selector(selector),
        apply_exclusions=apply_exclusions,
        **options
    )
303
+
304
def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> ElementCollection:
    """
    Apply selector to PDF elements across all pages.

    Args:
        selector_obj: Parsed selector dictionary
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        **kwargs: Additional filter parameters, forwarded to each page.
            Two keys are also read here:
            - pages: int, slice or iterable of page indices to search
              (default: every page). Slices follow normal Python slice
              semantics, including negative bounds. Indices outside
              0..len-1 are skipped.
            - document_order: sort the result by (page index, top, x0)
              (default: True)

    Returns:
        ElementCollection of matching elements
    """
    from natural_pdf.elements.collections import ElementCollection

    # The pages property builds a new collection on each access; read it once.
    pages = self.pages
    page_count = len(pages)

    # Determine page range to search
    page_range = kwargs.get('pages', range(page_count))
    if isinstance(page_range, int):
        page_range = [page_range]
    elif isinstance(page_range, slice):
        # slice.indices resolves None and negative bounds against the page
        # count. A zero step is treated as 1, as it was before.
        bounded = slice(page_range.start, page_range.stop, page_range.step or 1)
        page_range = range(*bounded.indices(page_count))

    # Cross-page pseudo-classes are not supported yet.
    cross_page = any(
        pseudo.get('name') in ('spans', 'continues')
        for pseudo in selector_obj.get('pseudo_classes', [])
    )
    if cross_page:
        # TODO: Implement cross-page element matching
        return ElementCollection([])

    # Regular case: collect elements from each page
    all_elements = []
    for page_idx in page_range:
        if 0 <= page_idx < page_count:
            page_elements = pages[page_idx]._apply_selector(
                selector_obj, apply_exclusions=apply_exclusions, **kwargs
            )
            all_elements.extend(page_elements.elements)

    combined = ElementCollection(all_elements)

    # Sort in document order if requested
    if kwargs.get('document_order', True):
        combined.sort(key=lambda el: (el.page.index, el.top, el.x0))

    return combined
358
+
359
def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
                 use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
    """
    Extract text from the entire document or matching elements.

    Args:
        selector: Optional selector to filter elements
        preserve_whitespace: Whether to keep blank characters (default: True)
        use_exclusions: Whether to apply exclusion regions (default: True).
            Honoured both for whole-document extraction and when a
            selector is given.
        debug_exclusions: Whether to output detailed debugging for exclusions (default: False)
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text as string; page texts are joined with newlines
    """
    # If selector is provided, find elements first
    if selector:
        # Pass the exclusion flag on; find_all would otherwise always
        # apply exclusions.
        elements = self.find_all(selector, apply_exclusions=use_exclusions)
        return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

    # Otherwise extract from all pages
    pages = self.pages
    if debug_exclusions:
        print(f"PDF: Extracting text with exclusions from {len(pages)} pages")
        print(f"PDF: Found {len(self._exclusions)} document-level exclusions")

    texts = [
        page.extract_text(
            preserve_whitespace=preserve_whitespace,
            use_exclusions=use_exclusions,
            debug_exclusions=debug_exclusions,
            **kwargs
        )
        for page in pages
    ]

    if debug_exclusions:
        print(f"PDF: Combined {len(texts)} pages of text")

    return "\n".join(texts)
397
+
398
+ # Note: extract_text_compat method removed
399
+
400
def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
    """
    Find the elements matching a selector and return their text.

    Thin convenience wrapper that delegates to extract_text.

    Args:
        selector: CSS-like selector string
        preserve_whitespace: Whether to keep blank characters (default: True)
        **kwargs: Additional extraction parameters

    Returns:
        Extracted text from matching elements
    """
    options = dict(kwargs, preserve_whitespace=preserve_whitespace)
    return self.extract_text(selector, **options)
413
+
414
def debug_ocr(self, output_path, pages=None):
    """
    Write an HTML debug report of OCR results for the chosen pages.

    Page selection happens here; the report itself is produced by
    natural_pdf.utils.ocr.debug_ocr_to_html.

    Args:
        output_path: Path to save the HTML report
        pages: Pages to include in the report (default: all pages)
               Can be a page index, slice, or list of page indices

    Returns:
        Self for method chaining
    """
    from natural_pdf.utils.ocr import debug_ocr_to_html

    if pages is None:
        # Every page
        selected = self.pages
    elif isinstance(pages, (int, slice)):
        picked = self.pages[pages]
        # A single index yields one page, which is wrapped in a list;
        # a slice result is passed through as returned.
        selected = [picked] if isinstance(pages, int) else picked
    else:
        # Any other value is treated as an iterable of page indices
        selected = [self.pages[i] for i in pages]

    debug_ocr_to_html(selected, output_path)
    return self
450
+
451
def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
    """
    Extract tables from the document or matching elements.

    Not implemented yet: every argument is currently ignored and an empty
    list is returned.

    Args:
        selector: Optional selector to filter tables
        merge_across_pages: Whether to merge tables that span across pages
        **kwargs: Additional extraction parameters

    Returns:
        List of extracted tables (currently always empty)
    """
    # TODO: Implement table extraction
    tables: List[Any] = []  # Placeholder
    return tables
465
+
466
def ask(self, question: str,
        mode: str = "extractive",
        pages: Union[int, List[int], range] = None,
        min_confidence: float = 0.1,
        model: str = None,
        **kwargs) -> Dict[str, Any]:
    """
    Ask a question about the document content.

    Each selected page is queried on its own and the answer with the
    highest confidence is returned. Failures are reported in the returned
    dictionary rather than raised.

    Args:
        question: Question to ask about the document
        mode: "extractive" to extract answer from document, "generative" to generate
            (accepted but not read by this method)
        pages: Specific pages to query: a single index, a list or a range.
            When omitted, only the first 10 pages (or fewer) are queried.
            Out-of-range indices are skipped.
        min_confidence: Minimum confidence threshold for answers
        model: Optional model name for question answering
        **kwargs: Additional parameters passed to the QA engine

    Returns:
        Dictionary with answer and confidence; "found" is False when no
        page produced an answer, and "error" is set when QA is
        unavailable or an exception occurred
    """
    try:
        from natural_pdf.qa import get_qa_engine

        # Default engine unless a model name was requested
        qa_engine = get_qa_engine(model_name=model) if model is not None else get_qa_engine()

        # Work out which page indices to query
        if pages is None:
            target_pages = list(range(min(10, len(self.pages))))
        elif isinstance(pages, int):
            target_pages = [pages]
        elif isinstance(pages, (list, range)):
            target_pages = pages
        else:
            raise ValueError(f"Invalid pages parameter: {pages}")

        # Query page by page, keeping only pages that report an answer
        hits = []
        for page_idx in target_pages:
            if not (0 <= page_idx < len(self.pages)):
                continue
            outcome = qa_engine.ask_pdf_page(
                page=self.pages[page_idx],
                question=question,
                min_confidence=min_confidence,
                **kwargs
            )
            if outcome.get("found", False):
                hits.append(outcome)

        if not hits:
            return {
                "answer": "",
                "confidence": 0.0,
                "found": False,
                "message": "No answer found in document"
            }

        # Highest confidence first; ties keep page order
        ranked = sorted(hits, key=lambda x: x.get("confidence", 0), reverse=True)
        return ranked[0]

    except ImportError as e:
        logger.warning(f"QA functionality not available: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": "QA functionality not available",
            "found": False
        }
    except Exception as e:
        logger.error(f"Error in document QA: {e}")
        return {
            "answer": "",
            "confidence": 0.0,
            "error": str(e),
            "found": False
        }
551
+
552
def __len__(self) -> int:
    """Return how many pages the PDF has, as reported by its page collection."""
    page_count = len(self.pages)
    return page_count
555
+
556
def __getitem__(self, key) -> Union[Page, List[Page]]:
    """Look up pages by index or slice, delegating to the page collection."""
    collection = self.pages
    return collection[key]
559
+
560
def close(self):
    """Close the underlying PDF file; does nothing if it is absent or already closed."""
    handle = getattr(self, '_pdf', None)
    if handle is None:
        return
    handle.close()
    # Drop the reference so a second close() is a no-op.
    self._pdf = None
565
+
566
def __enter__(self):
    """Enter a ``with`` block; the PDF object itself is the managed value."""
    managed = self
    return managed
569
+
570
def __exit__(self, exc_type, exc_val, exc_tb):
    """Leave a ``with`` block by closing the PDF; exceptions are not suppressed."""
    # The exception details are not inspected. Returning None lets any
    # exception raised inside the block propagate.
    self.close()
@@ -0,0 +1,3 @@
1
+ """
2
+ Element classes for Natural PDF.
3
+ """