natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -4,12 +4,24 @@ import logging
4
4
  import os
5
5
  import re # Added for safe path generation
6
6
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
7
+ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
8
+ import concurrent.futures # Import concurrent.futures
9
+ import time # Import time for logging timestamps
10
+ import threading # Import threading for logging thread information
8
11
 
9
12
  from PIL import Image
10
13
  from tqdm import tqdm
14
+ from tqdm.auto import tqdm as auto_tqdm
15
+ from tqdm.notebook import tqdm as notebook_tqdm
16
+
17
+ from natural_pdf.utils.tqdm_utils import get_tqdm
18
+
19
+ # Get the appropriate tqdm class once
20
+ tqdm = get_tqdm()
11
21
 
12
22
  # Set up logger early
23
+ # Configure logging to include thread information
24
+ # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(threadName)s - %(name)s - %(levelname)s - %(message)s')
13
25
  logger = logging.getLogger(__name__)
14
26
 
15
27
  from natural_pdf.core.pdf import PDF
@@ -36,9 +48,11 @@ except ImportError as e:
36
48
  SearchServiceProtocol, SearchOptions, Indexable = object, object, object
37
49
 
38
50
  from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
51
+ # Import the ApplyMixin
52
+ from natural_pdf.collections.mixins import ApplyMixin
39
53
 
40
54
 
41
- class PDFCollection(SearchableMixin): # Inherit from the mixin
55
+ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
42
56
  def __init__(
43
57
  self,
44
58
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -237,30 +251,214 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
237
251
 
238
252
  def __repr__(self) -> str:
239
253
  # Report only how many PDFs the collection holds; search status is no longer part of the repr.
240
- return f"<PDFCollection(count={len(self)})>"
254
+ return f"<PDFCollection(count={len(self._pdfs)})>"
241
255
 
242
256
  @property
243
257
  def pdfs(self) -> List["PDF"]:
244
258
  """Return the collection's internal list of PDF objects (the list itself, not a copy)."""
245
259
  return self._pdfs
246
260
 
247
- def apply_ocr(self, *args, **kwargs):
248
- PDF = self._get_pdf_class()
249
- # Delegate to individual PDF objects
250
- logger.info("Applying OCR to relevant PDFs in collection...")
251
- results = []
261
def find_all(
    self,
    selector: str,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs
) -> "ElementCollection":
    """
    Run a selector against every PDF in the collection and merge the matches.

    The returned ElementCollection may contain elements from several PDFs;
    some ElementCollection methods have limitations in that situation.

    Args:
        selector: CSS-like selector string to query elements
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
        regex: Whether to use regex for text search in :contains (default: False)
        case: Whether to do case-sensitive text search (default: True)
        **kwargs: Extra keyword arguments forwarded to each PDF's find_all

    Returns:
        ElementCollection holding the matches from every PDF that could be
        searched. A PDF whose search raises is logged and skipped.
    """
    from natural_pdf.elements.collections import ElementCollection

    # The same search options are forwarded verbatim to every PDF.
    search_options = dict(
        apply_exclusions=apply_exclusions,
        regex=regex,
        case=case,
        **kwargs
    )

    matches = []
    for document in self._pdfs:
        try:
            found = document.find_all(selector, **search_options)
            matches.extend(found.elements)
        except Exception as exc:
            # Best effort: one failing PDF must not abort the whole search.
            logger.error(f"Error finding elements in {document.path}: {exc}", exc_info=True)

    return ElementCollection(matches)
304
+
305
def apply_ocr(
    self,
    engine: Optional[str] = None,
    languages: Optional[List[str]] = None,
    min_confidence: Optional[float] = None,
    device: Optional[str] = None,
    resolution: Optional[int] = None,
    apply_exclusions: bool = True,
    detect_only: bool = False,
    replace: bool = True,
    options: Optional[Any] = None,
    pages: Optional[Union[slice, List[int]]] = None,
    max_workers: Optional[int] = None,
) -> "PDFCollection":
    """
    Apply OCR to all PDFs in the collection, potentially in parallel.

    Each PDF is processed by calling its own ``apply_ocr``. A failure on one
    PDF is logged and shown on the progress bar, but does not stop the
    remaining PDFs from being processed.

    Args:
        engine: OCR engine to use (e.g., 'easyocr', 'paddleocr', 'surya')
        languages: List of language codes for OCR
        min_confidence: Minimum confidence threshold for text detection
        device: Device to use for OCR (e.g., 'cpu', 'cuda')
        resolution: DPI resolution for page rendering
        apply_exclusions: Whether to apply exclusion regions
        detect_only: If True, only detect text regions without extracting text
        replace: If True, replace existing OCR elements
        options: Engine-specific options
        pages: Specific pages to process (None for all pages)
        max_workers: Maximum number of threads to process PDFs concurrently.
                     If None or 1, processing is sequential. (default: None)

    Returns:
        Self for method chaining
    """
    PDF = self._get_pdf_class()
    logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")

    def _process_pdf(pdf: PDF):
        """Apply OCR to one PDF; return ``(path, None)`` or ``(path, error)``."""
        thread_id = threading.current_thread().name  # Thread name for log correlation
        pdf_path = pdf.path
        logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
        start_time = time.monotonic()
        try:
            pdf.apply_ocr(
                pages=pages,
                engine=engine,
                languages=languages,
                min_confidence=min_confidence,
                device=device,
                resolution=resolution,
                apply_exclusions=apply_exclusions,
                detect_only=detect_only,
                replace=replace,
                options=options,
                # Note: We might want a max_workers here too for page rendering?
                # For now, PDF.apply_ocr doesn't have it.
            )
        except Exception as e:
            end_time = time.monotonic()
            logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
            return pdf_path, e  # Errors are returned, not raised, so other PDFs continue
        end_time = time.monotonic()
        logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
        return pdf_path, None

    if max_workers is not None and max_workers > 1:
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers, thread_name_prefix="OCRWorker"
        ) as executor:
            futures = [executor.submit(_process_pdf, pdf) for pdf in self._pdfs]

            # Consume as_completed while the executor is still open: leaving the
            # `with` block waits for every future, so iterating afterwards would
            # make the bar jump straight to 100%.
            progress_bar = tqdm(
                concurrent.futures.as_completed(futures),
                total=len(futures),
                desc="Applying OCR (Parallel)",
                unit="pdf",
            )
            for future in progress_bar:
                pdf_path, error = future.result()
                if error is not None:
                    progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
    else:  # Sequential processing (max_workers is None or <= 1)
        logger.info("Applying OCR sequentially...")
        progress_bar = tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf")
        for pdf in progress_bar:
            pdf_path, error = _process_pdf(pdf)
            # Surface failures on the bar exactly as the parallel path does.
            if error is not None:
                progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)

    logger.info("Finished applying OCR across the collection.")
    return self
261
403
 
262
- # --- Advanced Method Placeholders ---
263
- # Placeholder for categorize removed as find_relevant is now implemented
404
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    max_workers: Optional[int] = None,
    progress_callback: Optional[Callable[[], None]] = None,
) -> "PDFCollection":
    """
    Apply OCR correction to all relevant elements across all pages and PDFs
    in the collection using a single progress bar.

    The work itself is delegated to each PDF's ``correct_ocr``; this method
    counts the OCR elements up front so that one bar can cover the whole
    collection. A failure on one PDF is logged and the remaining PDFs are
    still processed.

    Args:
        correction_callback: Function to apply to each OCR element.
                             It receives the element and should return
                             the corrected text (str) or None.
        max_workers: Max threads to use for parallel execution within each page.
        progress_callback: Optional zero-argument callable invoked each time the
                           shared progress bar is advanced.
                           NOTE(review): it may run on worker threads if
                           PDF.correct_ocr reports progress from them - confirm.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If `correction_callback` is not callable.
    """
    self._get_pdf_class()  # Ensure PDF class is available
    if not callable(correction_callback):
        raise TypeError("`correction_callback` must be a callable function.")

    logger.info(f"Gathering OCR elements from {len(self._pdfs)} PDFs for correction...")

    # 1. Gather all target elements using the collection's find_all.
    #    apply_exclusions=False so elements in headers/footers etc. are counted too.
    all_ocr_elements = self.find_all("text[source=ocr]", apply_exclusions=False).elements

    if not all_ocr_elements:
        logger.info("No OCR elements found in the collection to correct.")
        return self

    total_elements = len(all_ocr_elements)
    logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")

    # 2. One progress bar shared by every PDF in the collection.
    progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")

    def _advance(*args, **kwargs):
        """Advance the shared bar, then notify the caller's callback if one was given."""
        progress_bar.update(*args, **kwargs)
        if progress_callback is not None:
            progress_callback()

    # 3. Delegate to PDF.correct_ocr, which handles page iteration and passes
    #    the progress callback down.
    try:
        for pdf in self._pdfs:
            if not pdf.pages:
                continue
            try:
                pdf.correct_ocr(
                    correction_callback=correction_callback,
                    max_workers=max_workers,
                    progress_callback=_advance,
                )
            except Exception as e:
                # Keep going: one broken PDF should not abort the whole collection.
                logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
    finally:
        # Close the bar even on KeyboardInterrupt or other non-Exception exits.
        progress_bar.close()

    return self
264
462
 
265
463
  def categorize(self, categories: List[str], **kwargs):
266
464
  """Categorizes PDFs in the collection based on content or features."""
@@ -309,3 +507,111 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
309
507
  # logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
310
508
  # continue
311
509
  yield page
510
+
511
+ # --- Classification Method --- #
512
def classify_all(
    self,
    categories: List[str],
    model: str = "text",
    max_workers: Optional[int] = None,
    **kwargs,
) -> "PDFCollection":
    """
    Classify all pages across all PDFs in the collection, potentially in parallel.

    Page classification is delegated to each PDF's `classify_pages` method.
    A single progress bar tracks individual pages across the whole collection.

    Args:
        categories: A list of string category names.
        model: Model identifier ('text', 'vision', or specific HF ID).
        max_workers: Maximum number of threads to process PDFs concurrently.
                     If None or 1, processing is sequential.
        **kwargs: Additional arguments passed down to `pdf.classify_pages` and
                  subsequently to `page.classify` (e.g., device,
                  confidence_threshold, resolution).

    Returns:
        Self for method chaining.

    Raises:
        ValueError: If categories list is empty.
        ClassificationError: If classification fails for any page (will stop processing).
        ImportError: If classification dependencies are missing.
    """
    PDF = self._get_pdf_class()
    if not categories:
        raise ValueError("Categories list cannot be empty.")

    logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")

    # Total page count drives the progress bar; PDFs without pages contribute nothing.
    total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
    if total_pages == 0:
        logger.warning("No pages found in the PDF collection to classify.")
        return self

    progress_bar = tqdm(
        total=total_pages,
        desc=f"Classifying Pages (model: {model})",
        unit="page",
    )

    def _process_pdf_classification(pdf: PDF):
        """Classify one PDF's pages; log a summary and re-raise on failure."""
        thread_id = threading.current_thread().name
        pdf_path = pdf.path
        logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
        start_time = time.monotonic()
        try:
            pdf.classify_pages(
                categories=categories,
                model=model,
                progress_callback=progress_bar.update,
                **kwargs
            )
        except Exception as e:
            end_time = time.monotonic()
            # Error is logged within classify_pages, but log summary here
            logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
            # The shared bar is closed once, by the caller's `finally`, rather than
            # here where other workers could still be updating it.
            raise
        end_time = time.monotonic()
        logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
        return pdf_path, None

    try:
        if max_workers is not None and max_workers > 1:
            logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers, thread_name_prefix="ClassifyWorker"
            ) as executor:
                futures = [
                    executor.submit(_process_pdf_classification, pdf) for pdf in self._pdfs
                ]
                try:
                    # Progress is updated by the callback; result() re-raises
                    # the first worker failure.
                    for future in concurrent.futures.as_completed(futures):
                        future.result()
                except BaseException:
                    # Stop PDFs that have not started yet; running ones finish
                    # while the executor shuts down.
                    for pending in futures:
                        pending.cancel()
                    raise
        else:  # Sequential processing
            logger.info("Classifying PDFs sequentially.")
            for pdf in self._pdfs:
                _process_pdf_classification(pdf)

        logger.info("Finished classification across the collection.")
    finally:
        # Safe to call unconditionally: close() is a no-op on a disabled or closed bar.
        progress_bar.close()

    return self
616
+
617
+ # --- End Classification Method --- #
@@ -539,3 +539,70 @@ class ElementManager:
539
539
  """Get all region elements."""
540
540
  self.load_elements()
541
541
  return self._elements.get("regions", [])
542
+
543
def remove_ocr_elements(self):
    """
    Remove all elements with source="ocr" from the elements dictionary.
    This should be called before adding new OCR elements if replacement is desired.

    Both the "words" and "chars" lists are filtered with the same rule. An
    entry counts as OCR when its ``source`` (dict key for dict entries,
    attribute otherwise) equals "ocr". Lists missing from the dictionary are
    left absent.

    Returns:
        int: Number of OCR elements removed
    """
    # Load elements if not already loaded
    self.load_elements()

    def _is_ocr(item) -> bool:
        """Return True when a dict entry or element object is marked source="ocr"."""
        if isinstance(item, dict):
            return item.get("source") == "ocr"
        return getattr(item, "source", None) == "ocr"

    removed_count = 0
    for key in ("words", "chars"):
        if key not in self._elements:
            continue
        current = self._elements[key]
        kept = [item for item in current if not _is_ocr(item)]
        removed_count += len(current) - len(kept)
        self._elements[key] = kept

    logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
    return removed_count
577
+
578
def remove_element(self, element, element_type="words"):
    """
    Remove a specific element from the managed elements.

    Only the first entry equal to ``element`` is removed, which is how
    ``list.remove`` behaves.

    Args:
        element: The element to remove
        element_type: The type of element ('words', 'chars', etc.)

    Returns:
        bool: True if removed successfully, False otherwise (unknown
        collection, element not present, or an unexpected error, which is logged)
    """
    # Load elements if not already loaded
    self.load_elements()

    # Check if the collection exists
    if element_type not in self._elements:
        logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
        return False

    try:
        # Remove directly and handle the miss: a prior `in` check would scan the list twice.
        self._elements[element_type].remove(element)
    except ValueError:
        logger.debug("Element not found in %s: %s", element_type, element)
        return False
    except Exception as e:
        logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
        return False

    logger.debug("Removed element from %s: %s", element_type, element)
    return True