natural-pdf 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. natural_pdf/__init__.py +55 -0
  2. natural_pdf/analyzers/__init__.py +6 -0
  3. natural_pdf/analyzers/layout/__init__.py +1 -0
  4. natural_pdf/analyzers/layout/base.py +151 -0
  5. natural_pdf/analyzers/layout/docling.py +247 -0
  6. natural_pdf/analyzers/layout/layout_analyzer.py +166 -0
  7. natural_pdf/analyzers/layout/layout_manager.py +200 -0
  8. natural_pdf/analyzers/layout/layout_options.py +78 -0
  9. natural_pdf/analyzers/layout/paddle.py +240 -0
  10. natural_pdf/analyzers/layout/surya.py +151 -0
  11. natural_pdf/analyzers/layout/tatr.py +251 -0
  12. natural_pdf/analyzers/layout/yolo.py +165 -0
  13. natural_pdf/analyzers/text_options.py +60 -0
  14. natural_pdf/analyzers/text_structure.py +270 -0
  15. natural_pdf/analyzers/utils.py +57 -0
  16. natural_pdf/core/__init__.py +3 -0
  17. natural_pdf/core/element_manager.py +457 -0
  18. natural_pdf/core/highlighting_service.py +698 -0
  19. natural_pdf/core/page.py +1444 -0
  20. natural_pdf/core/pdf.py +653 -0
  21. natural_pdf/elements/__init__.py +3 -0
  22. natural_pdf/elements/base.py +761 -0
  23. natural_pdf/elements/collections.py +1345 -0
  24. natural_pdf/elements/line.py +140 -0
  25. natural_pdf/elements/rect.py +122 -0
  26. natural_pdf/elements/region.py +1793 -0
  27. natural_pdf/elements/text.py +304 -0
  28. natural_pdf/ocr/__init__.py +56 -0
  29. natural_pdf/ocr/engine.py +104 -0
  30. natural_pdf/ocr/engine_easyocr.py +179 -0
  31. natural_pdf/ocr/engine_paddle.py +204 -0
  32. natural_pdf/ocr/engine_surya.py +171 -0
  33. natural_pdf/ocr/ocr_manager.py +191 -0
  34. natural_pdf/ocr/ocr_options.py +114 -0
  35. natural_pdf/qa/__init__.py +3 -0
  36. natural_pdf/qa/document_qa.py +396 -0
  37. natural_pdf/selectors/__init__.py +4 -0
  38. natural_pdf/selectors/parser.py +354 -0
  39. natural_pdf/templates/__init__.py +1 -0
  40. natural_pdf/templates/ocr_debug.html +517 -0
  41. natural_pdf/utils/__init__.py +3 -0
  42. natural_pdf/utils/highlighting.py +12 -0
  43. natural_pdf/utils/reading_order.py +227 -0
  44. natural_pdf/utils/visualization.py +223 -0
  45. natural_pdf/widgets/__init__.py +4 -0
  46. natural_pdf/widgets/frontend/viewer.js +88 -0
  47. natural_pdf/widgets/viewer.py +765 -0
  48. natural_pdf-0.1.0.dist-info/METADATA +295 -0
  49. natural_pdf-0.1.0.dist-info/RECORD +52 -0
  50. natural_pdf-0.1.0.dist-info/WHEEL +5 -0
  51. natural_pdf-0.1.0.dist-info/licenses/LICENSE +21 -0
  52. natural_pdf-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,653 @@
1
+ import pdfplumber
2
+ import logging
3
+ import tempfile
4
+ import os
5
+ import re
6
+ import urllib.request
7
+ from typing import List, Optional, Union, Any, Dict, Callable, Tuple, Type, Iterable # Added Iterable
8
+ from PIL import Image
9
+
10
+ from natural_pdf.core.page import Page
11
+ from natural_pdf.selectors.parser import parse_selector
12
+ from natural_pdf.elements.collections import ElementCollection
13
+ from natural_pdf.elements.region import Region
14
+ from natural_pdf.ocr import OCRManager, OCROptions
15
+ from natural_pdf.analyzers.layout.layout_manager import LayoutManager # Import the new LayoutManager
16
+ from natural_pdf.core.highlighting_service import HighlightingService # <-- Import the new service
17
+
18
+ # Set up module logger
19
+ logger = logging.getLogger("natural_pdf.core.pdf")
20
+
21
+
22
class PDF:
    """
    Enhanced PDF wrapper built on top of pdfplumber.

    This class provides a fluent interface for working with PDF documents,
    with improved selection, navigation, and extraction capabilities.

    Instances hold an open pdfplumber document (and, for URLs, a downloaded
    temporary file); call close() or use the instance as a context manager
    to release them.
    """

    def __init__(self, path_or_url: str, reading_order: bool = True,
                 font_attrs: Optional[List[str]] = None,
                 keep_spaces: bool = True):
        """
        Initialize the enhanced PDF object.

        Args:
            path_or_url: Path to the PDF file or an http(s) URL to a PDF.
                URLs are downloaded to a temporary file that is removed by close().
            reading_order: Whether to use natural reading order
            font_attrs: Font attributes to consider when grouping characters into words.
                Passed through unchanged to every Page.
                None: Only consider spatial relationships
                List: Custom attributes to consider (e.g., ['fontname', 'size', 'color'])
            keep_spaces: Whether to include spaces in word elements (default: True).
                True: Spaces are part of words, better for multi-word searching
                False: Break text at spaces, each word is separate (legacy behavior)

        Raises:
            ValueError: If path_or_url is a URL and the download fails.
        """
        # Initialize path-related attributes
        self._original_path = path_or_url
        self._temp_file = None

        if path_or_url.startswith(('http://', 'https://')):
            path = self._download_to_temp_file(path_or_url)
        else:
            # Use the provided path directly
            path = path_or_url

        logger.info(f"Initializing PDF from {path}")
        logger.debug(f"Parameters: reading_order={reading_order}, font_attrs={font_attrs}, keep_spaces={keep_spaces}")

        try:
            self._pdf = pdfplumber.open(path)
        except Exception:
            # The caller never receives an instance to close(), so a downloaded
            # temp file would otherwise be left behind.
            self._remove_temp_file()
            raise

        self._path = path
        self._reading_order = reading_order
        self._config = {
            'keep_spaces': keep_spaces
        }
        self.path = path

        self._font_attrs = font_attrs  # Store the font attribute configuration

        if OCRManager:
            self._ocr_manager = OCRManager()
            logger.info(f"Initialized OCRManager. Available engines: {self._ocr_manager.get_available_engines()}")
        else:
            self._ocr_manager = None
            logger.warning("OCRManager could not be imported. OCR functionality disabled.")

        if LayoutManager:
            self._layout_manager = LayoutManager()
            logger.info(f"Initialized LayoutManager. Available engines: {self._layout_manager.get_available_engines()}")
        else:
            self._layout_manager = None
            logger.warning("LayoutManager could not be imported. Layout analysis disabled.")

        self._pages = [Page(p, parent=self, index=i, font_attrs=font_attrs) for i, p in enumerate(self._pdf.pages)]
        self._element_cache = {}
        self._exclusions = []  # List of (exclusion_func, label) tuples
        self._regions = []  # List of (region_func, name) tuples

        # Initialize the Highlighting Service
        self.highlighter = HighlightingService(self)
        logger.info("Initialized HighlightingService.")

    def _download_to_temp_file(self, url: str) -> str:
        """Download *url* into a named temporary '.pdf' file and return its path.

        The file object is kept on self._temp_file so close() can delete it.

        Raises:
            ValueError: If the download or the write fails (original error chained).
        """
        logger.info(f"Downloading PDF from URL: {url}")
        try:
            # delete=False: the file must outlive this handle so pdfplumber can open it by name
            self._temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
            with self._temp_file as tmp, urllib.request.urlopen(url) as response:
                tmp.write(response.read())
        except Exception as e:
            self._remove_temp_file()
            logger.error(f"Failed to download PDF from URL: {e}")
            raise ValueError(f"Failed to download PDF from URL: {e}") from e

        path = self._temp_file.name
        logger.info(f"PDF downloaded to temporary file: {path}")
        return path

    def _remove_temp_file(self) -> None:
        """Best-effort removal of the downloaded temporary file, if any."""
        temp_file = getattr(self, '_temp_file', None)
        self._temp_file = None
        if temp_file is None:
            return

        try:
            # Close first: an open handle blocks deletion on some platforms.
            temp_file.close()
            name = getattr(temp_file, 'name', None)
            if name and os.path.exists(name):
                os.unlink(name)
                logger.debug(f"Removed temporary PDF file: {name}")
        except Exception as e:
            # Cleanup must never mask the caller's own error or break close()
            logger.warning(f"Failed to clean up temporary PDF file: {e}")

    def _page_indices(self, pages) -> Iterable[int]:
        """Normalize a page specification into an iterable of page indices.

        Args:
            pages: None (all pages), a single int, a slice, or an iterable of ints.

        Returns:
            An iterable of indices. Slices are resolved against the page count
            with standard Python slice semantics (negative bounds and steps
            included). Ints and iterables are returned unvalidated.
        """
        if pages is None:
            return range(len(self._pages))
        if isinstance(pages, int):
            return [pages]
        if isinstance(pages, slice):
            return range(*pages.indices(len(self._pages)))
        return pages

    def _select_pages(self, pages) -> List['Page']:
        """Resolve the 'pages' argument of apply_ocr_to_pages into Page objects.

        Raises:
            ValueError: If an index is out of range.
            TypeError: If 'pages' is not None, a slice, or an iterable of ints.
        """
        if pages is None:
            return self._pages
        if isinstance(pages, slice):
            return self._pages[pages]
        if not hasattr(pages, '__iter__'):
            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")

        try:
            return [self._pages[i] for i in pages]
        except IndexError:
            raise ValueError("Invalid page index provided in 'pages' iterable.")
        except TypeError:
            raise TypeError("'pages' must be None, a slice, or an iterable of page indices (int).")

    @property
    def metadata(self) -> Dict[str, Any]:
        """Access metadata as a dictionary."""
        return self._pdf.metadata

    @property
    def pages(self) -> 'PageCollection':
        """Access pages as a PageCollection object (a new collection per access)."""
        from natural_pdf.elements.collections import PageCollection
        return PageCollection(self._pages)

    def add_exclusion(self, exclusion_func: Callable[['Page'], 'Region'], label: Optional[str] = None) -> 'PDF':
        """
        Add an exclusion function to the PDF. Text from these regions will be excluded from extraction.

        Args:
            exclusion_func: A function that takes a Page and returns a Region to exclude
            label: Optional label for this exclusion (stored at PDF level only)

        Returns:
            Self for method chaining
        """
        # Store exclusion with its label at PDF level
        self._exclusions.append((exclusion_func, label))

        # Wrapper so that a failing exclusion on one page yields "no exclusion"
        # for that page instead of aborting extraction.
        def exclusion_wrapper(page):
            try:
                return exclusion_func(page)
            except Exception as e:
                logger.error(f"Error in PDF-level exclusion for page {page.index}: {e}", exc_info=True)
                return None

        # Apply this exclusion to all pages using the wrapper
        for page in self._pages:
            page.add_exclusion(exclusion_wrapper)

        return self

    def apply_ocr_to_pages(
        self,
        pages: Optional[Union[Iterable[int], range, slice]] = None,
        engine: Optional[str] = None,
        options: Optional['OCROptions'] = None,
        languages: Optional[List[str]] = None,
        min_confidence: Optional[float] = None,
        device: Optional[str] = None,
        # Add other simple mode args if needed
    ) -> 'PDF':
        """
        Applies OCR to specified pages (or all pages) of the PDF using batch processing.

        This method renders the specified pages to images, sends them as a batch
        to the OCRManager, and adds the resulting TextElements to each respective page.

        Args:
            pages: An iterable of 0-based page indices (list, range, tuple),
                   a slice object, or None to process all pages.
            engine: Name of the engine (e.g., 'easyocr', 'paddleocr', 'surya').
                    Forwarded to the OCR manager together with 'options'.
            options: A specific Options object (e.g., EasyOCROptions) for
                     advanced configuration.
            languages: List of language codes for simple mode.
            min_confidence: Minimum confidence threshold for simple mode.
            device: Device string ('cpu', 'cuda', etc.) for simple mode.

        Returns:
            Self for method chaining. If the OCR manager is unavailable, no pages
            are selected, or the batch OCR call fails or returns a malformed
            result, the problem is logged and self is returned with no elements added.

        Raises:
            ValueError: If a page index is out of range.
            TypeError: If 'pages' is not None, a slice, or an iterable of ints.
            RuntimeError: If a page cannot be rendered to an image.
        """
        if not self._ocr_manager:
            logger.error("OCRManager not available. Cannot apply OCR.")
            return self

        # --- Determine Target Pages ---
        target_pages = self._select_pages(pages)
        if not target_pages:
            logger.warning("No pages selected for OCR processing.")
            return self

        page_numbers = [p.number for p in target_pages]
        logger.info(f"Applying batch OCR to pages: {page_numbers}...")

        # --- Render Images for Batch ---
        logger.info(f"Rendering {len(target_pages)} pages to images...")
        ocr_scale = getattr(self, '_config', {}).get('ocr_image_scale', 2.0)
        page_image_map: List[Tuple[Page, Image.Image]] = []  # Each page with its rendered image
        for page in target_pages:
            logger.debug(f"  Rendering page {page.number} (index {page.index})...")
            try:
                # Highlights must be off so they are not OCR'd as page content
                img = page.to_image(scale=ocr_scale, include_highlights=False)
            except Exception as e:
                logger.error(f"Failed to render one or more pages for batch OCR: {e}", exc_info=True)
                # Fail the whole batch if any page rendering fails.
                raise RuntimeError(f"Failed to render page {page.number} for OCR.") from e
            page_image_map.append((page, img))

        images_pil = [img for _, img in page_image_map]

        # --- Prepare Arguments for Manager ---
        manager_args = {'images': images_pil, 'options': options, 'engine': engine}
        simple_args = (('languages', languages), ('min_confidence', min_confidence), ('device', device))
        # Only forward simple-mode arguments the caller actually supplied
        manager_args.update({key: value for key, value in simple_args if value is not None})

        # --- Call OCR Manager for Batch Processing ---
        logger.info(f"Calling OCR Manager for batch processing {len(images_pil)} images...")
        try:
            # Expected to return one list of result dicts per input image
            batch_results = self._ocr_manager.apply_ocr(**manager_args)
        except Exception as e:
            logger.error(f"Batch OCR processing failed: {e}", exc_info=True)
            return self  # Return self without adding elements

        if not isinstance(batch_results, list) or len(batch_results) != len(images_pil):
            logger.error(f"OCR Manager returned unexpected result format or length for batch processing. "
                         f"Expected list of length {len(images_pil)}, got {type(batch_results)} "
                         f"with length {len(batch_results) if isinstance(batch_results, list) else 'N/A'}.")
            return self  # Return self without adding elements

        logger.info("OCR Manager batch processing complete.")

        # --- Distribute Results and Add Elements to Pages ---
        logger.info("Adding OCR results to respective pages...")
        total_elements_added = 0
        for (page, img), results_for_page in zip(page_image_map, batch_results):
            if not isinstance(results_for_page, list):
                logger.warning(f"Skipping results for page {page.number}: Expected list, got {type(results_for_page)}")
                continue

            logger.debug(f"  Processing {len(results_for_page)} results for page {page.number}...")
            # The page's element manager builds the elements; per the original
            # author's note it also registers them on the page.
            elements = page._element_mgr.create_text_elements_from_ocr(results_for_page, img.width, img.height)

            if elements:
                total_elements_added += len(elements)
                logger.debug(f"  Added {len(elements)} OCR TextElements to page {page.number}.")
            else:
                logger.debug(f"  No valid TextElements created for page {page.number}.")

        logger.info(f"Finished adding OCR results. Total elements added across {len(target_pages)} pages: {total_elements_added}")
        return self

    def add_region(self, region_func: Callable[['Page'], 'Region'], name: Optional[str] = None) -> 'PDF':
        """
        Add a region function to the PDF. This creates regions on all pages using the provided function.

        The function is evaluated immediately for every page; pages for which it
        raises or returns a falsy value get no region.

        Args:
            region_func: A function that takes a Page and returns a Region
            name: Optional name for the region

        Returns:
            Self for method chaining
        """
        # Store region with its name at PDF level
        self._regions.append((region_func, name))

        for page in self._pages:
            try:
                region = region_func(page)
            except Exception as e:
                logger.error(f"Error in PDF-level region for page {page.index}: {e}", exc_info=True)
                continue

            if not region:
                continue

            try:
                # Apply name if provided
                if name:
                    region.name = name
                    region.source = 'named'
                page.add_region(region, name=name)
            except Exception as e:
                logger.error(f"Error adding region to page {page.index}: {e}", exc_info=True)

        return self

    def find(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> Any:
        """
        Find the first element matching the selector.

        Args:
            selector: CSS-like selector string (e.g., 'text:contains("Annual Report")')
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            regex: Whether to use regex for text search in :contains (default: False)
            case: Whether to do case-sensitive text search (default: True)
            **kwargs: Additional filter parameters

        Returns:
            Element object or None if not found
        """
        selector_obj = parse_selector(selector)

        # Pass regex and case flags to selector function
        kwargs['regex'] = regex
        kwargs['case'] = case

        results = self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)
        return results.first if results else None

    def find_all(self, selector: str, apply_exclusions=True, regex=False, case=True, **kwargs) -> 'ElementCollection':
        """
        Find all elements matching the selector.

        Args:
            selector: CSS-like selector string (e.g., 'text[color=(1,0,0)]')
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            regex: Whether to use regex for text search in :contains (default: False)
            case: Whether to do case-sensitive text search (default: True)
            **kwargs: Additional filter parameters

        Returns:
            ElementCollection with matching elements
        """
        selector_obj = parse_selector(selector)

        # Pass regex and case flags to selector function
        kwargs['regex'] = regex
        kwargs['case'] = case

        return self._apply_selector(selector_obj, apply_exclusions=apply_exclusions, **kwargs)

    def _apply_selector(self, selector_obj: Dict, apply_exclusions=True, **kwargs) -> 'ElementCollection':
        """
        Apply selector to PDF elements across all pages.

        Args:
            selector_obj: Parsed selector dictionary
            apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
            **kwargs: Additional filter parameters. Recognized here:
                pages: int, slice, or iterable of page indices to search (default: all).
                    Out-of-range indices are skipped.
                document_order: Sort results by (page index, top, x0) (default: True).
                All kwargs are also forwarded to each page's _apply_selector.

        Returns:
            ElementCollection of matching elements
        """
        # Cross-page pseudo-classes are not implemented yet
        pseudo_names = {pseudo.get('name') for pseudo in selector_obj.get('pseudo_classes', [])}
        if pseudo_names & {'spans', 'continues'}:
            # TODO: Implement cross-page element matching
            return ElementCollection([])

        # Regular case: collect elements from each page
        page_count = len(self._pages)
        all_elements = []
        for page_idx in self._page_indices(kwargs.get('pages')):
            if not 0 <= page_idx < page_count:
                continue
            page_elements = self._pages[page_idx]._apply_selector(
                selector_obj, apply_exclusions=apply_exclusions, **kwargs
            )
            all_elements.extend(page_elements.elements)

        # Create a combined collection
        combined = ElementCollection(all_elements)

        # Sort in document order if requested
        if kwargs.get('document_order', True):
            sortable = all(
                hasattr(el, 'page') and hasattr(el, 'top') and hasattr(el, 'x0')
                for el in combined.elements
            )
            if sortable:
                combined.sort(key=lambda el: (el.page.index, el.top, el.x0))
            else:
                logger.warning("Cannot sort elements in document order: Missing required attributes (page, top, x0).")

        return combined

    def extract_text(self, selector: Optional[str] = None, preserve_whitespace=True,
                     use_exclusions=True, debug_exclusions=False, **kwargs) -> str:
        """
        Extract text from the entire document or matching elements.

        Args:
            selector: Optional selector to filter elements
            preserve_whitespace: Whether to keep blank characters (default: True)
            use_exclusions: Whether to apply exclusion regions (default: True)
            debug_exclusions: Whether to print detailed debugging for exclusions
                (default: False). Only used when no selector is given.
            **kwargs: Additional extraction parameters

        Returns:
            Extracted text as string. Without a selector, page texts are joined
            with newlines.
        """
        # If selector is provided, find elements first
        if selector:
            # Honor use_exclusions here as well, so exclusions can be disabled
            # for selector-based extraction.
            elements = self.find_all(selector, apply_exclusions=use_exclusions)
            return elements.extract_text(preserve_whitespace=preserve_whitespace, **kwargs)

        # Otherwise extract from all pages
        if debug_exclusions:
            print(f"PDF: Extracting text with exclusions from {len(self._pages)} pages")
            print(f"PDF: Found {len(self._exclusions)} document-level exclusions")

        texts = [
            page.extract_text(
                preserve_whitespace=preserve_whitespace,
                use_exclusions=use_exclusions,
                debug_exclusions=debug_exclusions,
                **kwargs
            )
            for page in self._pages
        ]

        if debug_exclusions:
            print(f"PDF: Combined {len(texts)} pages of text")

        return "\n".join(texts)

    def extract(self, selector: str, preserve_whitespace=True, **kwargs) -> str:
        """
        Shorthand for finding elements and extracting their text.

        Args:
            selector: CSS-like selector string
            preserve_whitespace: Whether to keep blank characters (default: True)
            **kwargs: Additional extraction parameters

        Returns:
            Extracted text from matching elements
        """
        return self.extract_text(selector, preserve_whitespace=preserve_whitespace, **kwargs)

    # NOTE: debug_ocr (interactive HTML report of OCR results) is currently disabled.

    def extract_tables(self, selector: Optional[str] = None, merge_across_pages: bool = False, **kwargs) -> List[Any]:
        """
        Extract tables from the document or matching elements.

        Not implemented yet: always returns an empty list.

        Args:
            selector: Optional selector to filter tables
            merge_across_pages: Whether to merge tables that span across pages
            **kwargs: Additional extraction parameters

        Returns:
            List of extracted tables
        """
        # TODO: Implement table extraction
        return []  # Placeholder

    def ask(self, question: str,
            mode: str = "extractive",
            pages: Union[int, List[int], range] = None,
            min_confidence: float = 0.1,
            model: str = None,
            **kwargs) -> Dict[str, Any]:
        """
        Ask a question about the document content.

        Each selected page is queried independently and the answer with the
        highest confidence is returned.

        Args:
            question: Question to ask about the document
            mode: "extractive" to extract answer from document, "generative" to generate.
                Currently not used by this method.
            pages: Specific pages to query (default: all pages). Out-of-range
                indices are skipped.
            min_confidence: Minimum confidence threshold for answers
            model: Optional model name for question answering
            **kwargs: Additional parameters passed to the QA engine

        Returns:
            A dictionary containing the answer, confidence, and other metadata.
            Result will have an 'answer' key containing the answer text
            (None, with 'found' False, when no page produced an answer).

        Raises:
            ValueError: If 'pages' is not None, an int, a list, or a range.
        """
        from natural_pdf.qa import get_qa_engine

        # Initialize or get QA engine
        qa_engine = get_qa_engine() if model is None else get_qa_engine(model_name=model)

        # Determine which pages to query
        page_count = len(self._pages)
        if pages is None:
            target_pages = range(page_count)
        elif isinstance(pages, int):
            target_pages = [pages]
        elif isinstance(pages, (list, range)):
            target_pages = pages
        else:
            raise ValueError(f"Invalid pages parameter: {pages}")

        # Query each page and keep only the ones that found an answer
        results = []
        for page_idx in target_pages:
            if not 0 <= page_idx < page_count:
                continue
            page_result = qa_engine.ask_pdf_page(
                page=self._pages[page_idx],
                question=question,
                min_confidence=min_confidence,
                **kwargs
            )
            if page_result and page_result.get("found", False):
                results.append(page_result)

        if not results:
            # Return a structure indicating no answer found
            return {
                "answer": None,
                "confidence": 0.0,
                "found": False,
                "page_num": None,  # Or maybe the pages searched?
                "source_elements": []
            }

        # Best result first; the sort is stable, so ties keep page order
        results.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        return results[0]

    def __len__(self) -> int:
        """Return the number of pages in the PDF."""
        return len(self._pages)

    def __getitem__(self, key) -> Union['Page', 'PageCollection']:
        """Access pages by index (returns a Page) or slice (returns a PageCollection)."""
        # __init__ may have failed before the page list was built
        if not hasattr(self, '_pages'):
            raise AttributeError("PDF pages not initialized yet.")
        if isinstance(key, slice):
            from natural_pdf.elements.collections import PageCollection
            return PageCollection(self._pages[key])
        return self._pages[key]

    def close(self):
        """Close the underlying PDF file and clean up any temporary files.

        Safe to call more than once.
        """
        if getattr(self, '_pdf', None) is not None:
            self._pdf.close()
            self._pdf = None

        # Clean up temporary file if it exists
        self._remove_temp_file()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()
@@ -0,0 +1,3 @@
1
+ """
2
+ Element classes for Natural PDF.
3
+ """