natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/collections/mixins.py
@@ -0,0 +1,111 @@
+ import logging
+ from typing import Any, Callable, Iterable, TypeVar
+
+ from tqdm.auto import tqdm
+
+ logger = logging.getLogger(__name__)
+
+ T = TypeVar("T") # Generic type for items in the collection
+
+
+ class DirectionalCollectionMixin:
+     """
+     Mixin providing directional methods for collections of elements/regions.
+     """
+
+     def below(self, **kwargs) -> "ElementCollection":
+         """Find regions below all elements in this collection."""
+         return self.apply(lambda element: element.below(**kwargs))
+
+     def above(self, **kwargs) -> "ElementCollection":
+         """Find regions above all elements in this collection."""
+         return self.apply(lambda element: element.above(**kwargs))
+
+     def left(self, **kwargs) -> "ElementCollection":
+         """Find regions to the left of all elements in this collection."""
+         return self.apply(lambda element: element.left(**kwargs))
+
+     def right(self, **kwargs) -> "ElementCollection":
+         """Find regions to the right of all elements in this collection."""
+         return self.apply(lambda element: element.right(**kwargs))
+
+     def expand(self, **kwargs) -> "ElementCollection":
+         """Expand all elements in this collection."""
+         return self.apply(lambda element: element.expand(**kwargs))
+
+
+ class ApplyMixin:
+     """
+     Mixin class providing an `.apply()` method for collections.
+
+     Assumes the inheriting class implements `__iter__` and `__len__` appropriately
+     for the items to be processed by `apply`.
+     """
+
+     def _get_items_for_apply(self) -> Iterable[Any]:
+         """
+         Returns the iterable of items to apply the function to.
+         Defaults to iterating over `self`. Subclasses should override this
+         if the default iteration is not suitable for the apply operation.
+         """
+         # Default to standard iteration over the collection itself
+         return iter(self)
+
+     def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> Iterable[Any]:
+         """
+         Applies a function to each item in the collection.
+
+         Args:
+             func: The function to apply to each item. The item itself
+                 will be passed as the first argument to the function.
+             *args: Additional positional arguments to pass to func.
+             **kwargs: Additional keyword arguments to pass to func.
+                 A special keyword argument 'show_progress' (bool, default=False)
+                 can be used to display a progress bar.
+         """
+         show_progress = kwargs.pop("show_progress", False)
+         # Derive unit name from class name
+         unit_name = self.__class__.__name__.lower()
+         items_iterable = self._get_items_for_apply()
+
+         # Need total count for tqdm, assumes __len__ is implemented by the inheriting class
+         total_items = 0
+         try:
+             total_items = len(self)
+         except TypeError: # Handle cases where __len__ might not be defined on self
+             logger.warning(f"Could not determine collection length for progress bar.")
+
+         if show_progress and total_items > 0:
+             items_iterable = tqdm(
+                 items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name
+             )
+         elif show_progress:
+             logger.info(
+                 f"Applying {func.__name__} (progress bar disabled for zero/unknown length)."
+             )
+
+         results = [func(item, *args, **kwargs) for item in items_iterable]
+
+         # If results is empty, return an empty list
+         if not results:
+             return []
+
+         # Import here to avoid circular imports
+         from natural_pdf import PDF, Page
+         from natural_pdf.collections.pdf_collection import PDFCollection
+         from natural_pdf.elements.base import Element
+         from natural_pdf.elements.collections import ElementCollection, PageCollection
+         from natural_pdf.elements.region import Region
+
+         first_non_none = next((r for r in results if r is not None), None)
+         first_type = type(first_non_none) if first_non_none is not None else None
+
+         # Return the appropriate collection based on result type (...generally)
+         if issubclass(first_type, Element) or issubclass(first_type, Region):
+             return ElementCollection(results)
+         elif first_type == PDF:
+             return PDFCollection(results)
+         elif first_type == Page:
+             return PageCollection(results)
+
+         return results
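
Both mixins route their work through .apply(): any class that iterates over its items and reports a length gains a generic .apply() with an optional progress bar, and results are re-wrapped as an ElementCollection, PDFCollection, or PageCollection when their type allows. A minimal sketch of that behaviour on a toy collection (assumes natural-pdf 0.1.9 is installed; the Numbers class is illustrative, not part of the package):

from natural_pdf.collections.mixins import ApplyMixin

class Numbers(ApplyMixin):
    """Toy collection: just enough __iter__/__len__ for ApplyMixin.apply()."""

    def __init__(self):
        self._items = [1, 2, 3]

    def __iter__(self):
        return iter(self._items)

    def __len__(self):
        return len(self._items)

# Plain results stay a list; Element/PDF/Page results would be re-wrapped.
doubled = Numbers().apply(lambda n: n * 2, show_progress=True)
print(doubled)  # [2, 4, 6]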
natural_pdf/collections/pdf_collection.py
@@ -1,19 +1,47 @@
+ import concurrent.futures # Import concurrent.futures
  import copy # Added for copying options
  import glob as py_glob
  import logging
  import os
  import re # Added for safe path generation
+ import threading # Import threading for logging thread information
+ import time # Import time for logging timestamps
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     Callable,
+     Dict,
+     Generic,
+     Iterable,
+     Iterator,
+     List,
+     Optional,
+     Set,
+     Type,
+     TypeVar,
+     Union,
+     overload,
+ )

  from PIL import Image
  from tqdm import tqdm
+ from tqdm.auto import tqdm as auto_tqdm
+ from tqdm.notebook import tqdm as notebook_tqdm
+
+ from natural_pdf.utils.tqdm_utils import get_tqdm
+
+ # Get the appropriate tqdm class once
+ tqdm = get_tqdm()

  # Set up logger early
+ # Configure logging to include thread information
+ # logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(threadName)s - %(name)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

  from natural_pdf.core.pdf import PDF
  from natural_pdf.elements.region import Region
+ from natural_pdf.export.mixin import ExportMixin

  # --- Search Imports ---
  try:
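
Every progress bar in this module is now routed through natural_pdf.utils.tqdm_utils.get_tqdm() (a new 51-line helper added in this release; its body is not part of this diff). A helper like that typically just picks between the notebook and console tqdm front-ends once at import time. A hedged sketch of that idea only, not the shipped implementation:

def get_tqdm_sketch():
    """Hypothetical stand-in for natural_pdf.utils.tqdm_utils.get_tqdm()."""
    try:
        # Prefer the notebook widget when running under an IPython kernel.
        from IPython import get_ipython

        shell = get_ipython()
        if shell is not None and "IPKernelApp" in getattr(shell, "config", {}):
            from tqdm.notebook import tqdm as notebook_tqdm
            return notebook_tqdm
    except ImportError:
        pass
    # Fall back to the plain console progress bar.
    from tqdm import tqdm as std_tqdm
    return std_tqdm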
@@ -35,10 +63,12 @@ except ImportError as e:

      SearchServiceProtocol, SearchOptions, Indexable = object, object, object

+ # Import the ApplyMixin
+ from natural_pdf.collections.mixins import ApplyMixin
  from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin


- class PDFCollection(SearchableMixin): # Inherit from the mixin
+ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
      def __init__(
          self,
          source: Union[str, Iterable[Union[str, "PDF"]]],
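
With ApplyMixin and ExportMixin joining SearchableMixin in the base list, a PDFCollection now exposes .apply() and the export helpers alongside search. A short usage sketch (the glob path is illustrative; per the __init__ signature above, the constructor accepts a path/glob string or an iterable of paths or PDF objects):

from natural_pdf.collections.pdf_collection import PDFCollection

collection = PDFCollection("reports/*.pdf")  # glob string, resolved by the collection
# ApplyMixin at work: len(pdf.pages) returns ints, so the result stays a plain list.
page_counts = collection.apply(lambda pdf: len(pdf.pages), show_progress=True)
print(sum(page_counts))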
@@ -237,30 +267,257 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin

      def __repr__(self) -> str:
          # Removed search status
-         return f"<PDFCollection(count={len(self)})>"
+         return f"<PDFCollection(count={len(self._pdfs)})>"
+         return f"<PDFCollection(count={len(self._pdfs)})>"

      @property
      def pdfs(self) -> List["PDF"]:
          """Returns the list of PDF objects held by the collection."""
          return self._pdfs

-     def apply_ocr(self, *args, **kwargs):
-         PDF = self._get_pdf_class()
-         # Delegate to individual PDF objects
-         logger.info("Applying OCR to relevant PDFs in collection...")
-         results = []
+     @overload
+     def find_all(
+         self,
+         *,
+         text: str,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     @overload
+     def find_all(
+         self,
+         selector: str,
+         *,
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection": ...
+
+     def find_all(
+         self,
+         selector: Optional[str] = None, # Now optional
+         *,
+         text: Optional[str] = None, # New text parameter
+         apply_exclusions: bool = True,
+         regex: bool = False,
+         case: bool = True,
+         **kwargs,
+     ) -> "ElementCollection":
+         """
+         Find all elements matching the selector OR text across all PDFs in the collection.
+
+         Provide EITHER `selector` OR `text`, but not both.
+
+         This creates an ElementCollection that can span multiple PDFs. Note that
+         some ElementCollection methods have limitations when spanning PDFs.
+
+         Args:
+             selector: CSS-like selector string to query elements.
+             text: Text content to search for (equivalent to 'text:contains(...)').
+             apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
+             regex: Whether to use regex for text search (`selector` or `text`) (default: False).
+             case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
+             **kwargs: Additional keyword arguments passed to the find_all method of each PDF.
+
+         Returns:
+             ElementCollection containing all matching elements across all PDFs.
+         """
+         # Validation happens within pdf.find_all
+
+         # Collect elements from all PDFs
+         all_elements = []
          for pdf in self._pdfs:
-             # We need to figure out which pages belong to which PDF if batching here
-             # For now, simpler to call on each PDF
              try:
-                 # Assume apply_ocr exists on PDF and accepts similar args
-                 pdf.apply_ocr(*args, **kwargs)
+                 # Pass the relevant arguments down to each PDF's find_all
+                 elements = pdf.find_all(
+                     selector=selector,
+                     text=text,
+                     apply_exclusions=apply_exclusions,
+                     regex=regex,
+                     case=case,
+                     **kwargs,
+                 )
+                 all_elements.extend(elements.elements)
+             except Exception as e:
+                 logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
+
+         return ElementCollection(all_elements)
+
+     def apply_ocr(
+         self,
+         engine: Optional[str] = None,
+         languages: Optional[List[str]] = None,
+         min_confidence: Optional[float] = None,
+         device: Optional[str] = None,
+         resolution: Optional[int] = None,
+         apply_exclusions: bool = True,
+         detect_only: bool = False,
+         replace: bool = True,
+         options: Optional[Any] = None,
+         pages: Optional[Union[slice, List[int]]] = None,
+         max_workers: Optional[int] = None,
+     ) -> "PDFCollection":
+         """
+         Apply OCR to all PDFs in the collection, potentially in parallel.
+
+         Args:
+             engine: OCR engine to use (e.g., 'easyocr', 'paddleocr', 'surya')
+             languages: List of language codes for OCR
+             min_confidence: Minimum confidence threshold for text detection
+             device: Device to use for OCR (e.g., 'cpu', 'cuda')
+             resolution: DPI resolution for page rendering
+             apply_exclusions: Whether to apply exclusion regions
+             detect_only: If True, only detect text regions without extracting text
+             replace: If True, replace existing OCR elements
+             options: Engine-specific options
+             pages: Specific pages to process (None for all pages)
+             max_workers: Maximum number of threads to process PDFs concurrently.
+                 If None or 1, processing is sequential. (default: None)
+
+         Returns:
+             Self for method chaining
+         """
+         PDF = self._get_pdf_class()
+         logger.info(
+             f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
+         )
+
+         # Worker function takes PDF object again
+         def _process_pdf(pdf: PDF):
+             """Helper function to apply OCR to a single PDF, handling errors."""
+             thread_id = threading.current_thread().name # Get thread name for logging
+             pdf_path = pdf.path # Get path for logging
+             logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
+             start_time = time.monotonic()
+             try:
+                 pdf.apply_ocr( # Call apply_ocr on the original PDF object
+                     pages=pages,
+                     engine=engine,
+                     languages=languages,
+                     min_confidence=min_confidence,
+                     device=device,
+                     resolution=resolution,
+                     apply_exclusions=apply_exclusions,
+                     detect_only=detect_only,
+                     replace=replace,
+                     options=options,
+                     # Note: We might want a max_workers here too for page rendering?
+                     # For now, PDF.apply_ocr doesn't have it.
+                 )
+                 end_time = time.monotonic()
+                 logger.debug(
+                     f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                 )
+                 return pdf_path, None
              except Exception as e:
-                 logger.error(f"Failed applying OCR to {pdf.path}: {e}", exc_info=True)
+                 end_time = time.monotonic()
+                 logger.error(
+                     f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                     exc_info=False,
+                 )
+                 return pdf_path, e # Return path and error
+
+         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+         if max_workers is not None and max_workers > 1:
+             futures = []
+             with concurrent.futures.ThreadPoolExecutor(
+                 max_workers=max_workers, thread_name_prefix="OCRWorker"
+             ) as executor:
+                 for pdf in self._pdfs:
+                     # Submit the PDF object to the worker function
+                     futures.append(executor.submit(_process_pdf, pdf))
+
+             # Use the selected tqdm class with as_completed for progress tracking
+             progress_bar = tqdm(
+                 concurrent.futures.as_completed(futures),
+                 total=len(self._pdfs),
+                 desc="Applying OCR (Parallel)",
+                 unit="pdf",
+             )
+
+             for future in progress_bar:
+                 pdf_path, error = future.result() # Get result (or exception)
+                 if error:
+                     progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
+                 # Progress is updated automatically by tqdm
+
+         else: # Sequential processing (max_workers is None or 1)
+             logger.info("Applying OCR sequentially...")
+             # Use the selected tqdm class for sequential too for consistency
+             # Iterate over PDF objects directly for sequential
+             for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
+                 _process_pdf(pdf) # Call helper directly with PDF object
+
+         logger.info("Finished applying OCR across the collection.")
          return self

-     # --- Advanced Method Placeholders ---
-     # Placeholder for categorize removed as find_relevant is now implemented
+     def correct_ocr(
+         self,
+         correction_callback: Callable[[Any], Optional[str]],
+         max_workers: Optional[int] = None,
+         progress_callback: Optional[Callable[[], None]] = None,
+     ) -> "PDFCollection":
+         """
+         Apply OCR correction to all relevant elements across all pages and PDFs
+         in the collection using a single progress bar.
+
+         Args:
+             correction_callback: Function to apply to each OCR element.
+                 It receives the element and should return
+                 the corrected text (str) or None.
+             max_workers: Max threads to use for parallel execution within each page.
+             progress_callback: Optional callback function to call after processing each element.
+
+         Returns:
+             Self for method chaining.
+         """
+         PDF = self._get_pdf_class() # Ensure PDF class is available
+         if not callable(correction_callback):
+             raise TypeError("`correction_callback` must be a callable function.")
+
+         logger.info(f"Gathering OCR elements from {len(self._pdfs)} PDFs for correction...")
+
+         # 1. Gather all target elements using the collection's find_all
+         # Crucially, set apply_exclusions=False to include elements in headers/footers etc.
+         all_ocr_elements = self.find_all("text[source=ocr]", apply_exclusions=False).elements
+
+         if not all_ocr_elements:
+             logger.info("No OCR elements found in the collection to correct.")
+             return self
+
+         total_elements = len(all_ocr_elements)
+         logger.info(
+             f"Found {total_elements} OCR elements across the collection. Starting correction process..."
+         )
+
+         # 2. Initialize the progress bar
+         progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
+
+         # 3. Iterate through PDFs and delegate to PDF.correct_ocr
+         # PDF.correct_ocr handles page iteration and passing the progress callback down.
+         for pdf in self._pdfs:
+             if not pdf.pages:
+                 continue
+             try:
+                 pdf.correct_ocr(
+                     correction_callback=correction_callback,
+                     max_workers=max_workers,
+                     progress_callback=progress_bar.update, # Pass the bar's update method
+                 )
+             except Exception as e:
+                 logger.error(
+                     f"Error occurred during correction process for PDF {pdf.path}: {e}",
+                     exc_info=True,
+                 )
+                 # Decide if we should stop or continue? For now, continue.
+
+         progress_bar.close()
+
+         return self

      def categorize(self, categories: List[str], **kwargs):
          """Categorizes PDFs in the collection based on content or features."""
@@ -309,3 +566,165 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
              # logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
              # continue
              yield page
+
+     # --- Classification Method --- #
+     def classify_all(
+         self,
+         categories: List[str],
+         model: str = "text",
+         max_workers: Optional[int] = None,
+         **kwargs,
+     ) -> "PDFCollection":
+         """
+         Classify all pages across all PDFs in the collection, potentially in parallel.
+
+         This method uses the unified `classify_all` approach, delegating page
+         classification to each PDF's `classify_pages` method.
+         It displays a progress bar tracking individual pages.
+
+         Args:
+             categories: A list of string category names.
+             model: Model identifier ('text', 'vision', or specific HF ID).
+             max_workers: Maximum number of threads to process PDFs concurrently.
+                 If None or 1, processing is sequential.
+             **kwargs: Additional arguments passed down to `pdf.classify_pages` and
+                 subsequently to `page.classify` (e.g., device,
+                 confidence_threshold, resolution).
+
+         Returns:
+             Self for method chaining.
+
+         Raises:
+             ValueError: If categories list is empty.
+             ClassificationError: If classification fails for any page (will stop processing).
+             ImportError: If classification dependencies are missing.
+         """
+         PDF = self._get_pdf_class()
+         if not categories:
+             raise ValueError("Categories list cannot be empty.")
+
+         logger.info(
+             f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
+         )
+
+         # Calculate total pages for the progress bar
+         total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
+         if total_pages == 0:
+             logger.warning("No pages found in the PDF collection to classify.")
+             return self
+
+         progress_bar = tqdm(
+             total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
+         )
+
+         # Worker function
+         def _process_pdf_classification(pdf: PDF):
+             thread_id = threading.current_thread().name
+             pdf_path = pdf.path
+             logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+             start_time = time.monotonic()
+             try:
+                 # Call classify_pages on the PDF, passing the progress callback
+                 pdf.classify_pages(
+                     categories=categories,
+                     model=model,
+                     progress_callback=progress_bar.update,
+                     **kwargs,
+                 )
+                 end_time = time.monotonic()
+                 logger.debug(
+                     f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                 )
+                 return pdf_path, None # Return path and no error
+             except Exception as e:
+                 end_time = time.monotonic()
+                 # Error is logged within classify_pages, but log summary here
+                 logger.error(
+                     f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                     exc_info=False,
+                 )
+                 # Close progress bar immediately on error to avoid hanging
+                 progress_bar.close()
+                 # Re-raise the exception to stop the entire collection processing
+                 raise
+
+         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+         try:
+             if max_workers is not None and max_workers > 1:
+                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
+                 futures = []
+                 with concurrent.futures.ThreadPoolExecutor(
+                     max_workers=max_workers, thread_name_prefix="ClassifyWorker"
+                 ) as executor:
+                     for pdf in self._pdfs:
+                         futures.append(executor.submit(_process_pdf_classification, pdf))
+
+                 # Wait for all futures to complete (progress updated by callback)
+                 # Exceptions are raised by future.result() if worker failed
+                 for future in concurrent.futures.as_completed(futures):
+                     future.result() # Raise exception if worker failed
+
+             else: # Sequential processing
+                 logger.info("Classifying PDFs sequentially.")
+                 for pdf in self._pdfs:
+                     _process_pdf_classification(pdf)
+
+             logger.info("Finished classification across the collection.")
+
+         finally:
+             # Ensure progress bar is closed even if errors occurred elsewhere
+             if not progress_bar.disable and progress_bar.n < progress_bar.total:
+                 progress_bar.close()
+             elif progress_bar.disable is False:
+                 progress_bar.close()
+
+         return self
+
+     # --- End Classification Method --- #
+
+     def _gather_analysis_data(
+         self,
+         analysis_keys: List[str],
+         include_content: bool,
+         include_images: bool,
+         image_dir: Optional[Path],
+         image_format: str,
+         image_resolution: int,
+     ) -> List[Dict[str, Any]]:
+         """
+         Gather analysis data from all PDFs in the collection.
+
+         Args:
+             analysis_keys: Keys in the analyses dictionary to export
+             include_content: Whether to include extracted text
+             include_images: Whether to export images
+             image_dir: Directory to save images
+             image_format: Format to save images
+             image_resolution: Resolution for exported images
+
+         Returns:
+             List of dictionaries containing analysis data
+         """
+         if not self._pdfs:
+             logger.warning("No PDFs found in collection")
+             return []
+
+         all_data = []
+
+         for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
+             # PDF level data
+             pdf_data = {
+                 "pdf_path": pdf.path,
+                 "pdf_filename": Path(pdf.path).name,
+                 "total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
+             }
+
+             # Add metadata if available
+             if hasattr(pdf, "metadata") and pdf.metadata:
+                 for k, v in pdf.metadata.items():
+                     if v: # Only add non-empty metadata
+                         pdf_data[f"metadata.{k}"] = str(v)
+
+             all_data.append(pdf_data)
+
+         return all_data
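
classify_all fans page classification out across the collection, tracks progress per page, and delegates the real work to each PDF's classify_pages. Continuing with the collection from the previous sketch (category names are illustrative):

collection.classify_all(
    categories=["invoice", "report", "correspondence"],
    model="text",   # or "vision", or a specific Hugging Face model ID
    max_workers=2,  # >1 classifies PDFs in parallel threads; a failure stops the whole run
)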
natural_pdf/core/element_manager.py
@@ -539,3 +539,86 @@ class ElementManager:
          """Get all region elements."""
          self.load_elements()
          return self._elements.get("regions", [])
+
+     def remove_ocr_elements(self):
+         """
+         Remove all elements with source="ocr" from the elements dictionary.
+         This should be called before adding new OCR elements if replacement is desired.
+
+         Returns:
+             int: Number of OCR elements removed
+         """
+         # Load elements if not already loaded
+         self.load_elements()
+
+         removed_count = 0
+
+         # Filter out OCR elements from words
+         if "words" in self._elements:
+             original_len = len(self._elements["words"])
+             self._elements["words"] = [
+                 word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
+             ]
+             removed_count += original_len - len(self._elements["words"])
+
+         # Filter out OCR elements from chars
+         if "chars" in self._elements:
+             original_len = len(self._elements["chars"])
+             self._elements["chars"] = [
+                 char
+                 for char in self._elements["chars"]
+                 if (isinstance(char, dict) and char.get("source") != "ocr")
+                 or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
+             ]
+             removed_count += original_len - len(self._elements["chars"])
+
+         logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
+         return removed_count
+
+     def remove_element(self, element, element_type="words"):
+         """
+         Remove a specific element from the managed elements.
+
+         Args:
+             element: The element to remove
+             element_type: The type of element ('words', 'chars', etc.)
+
+         Returns:
+             bool: True if removed successfully, False otherwise
+         """
+         # Load elements if not already loaded
+         self.load_elements()
+
+         # Check if the collection exists
+         if element_type not in self._elements:
+             logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
+             return False
+
+         # Try to remove the element
+         try:
+             if element in self._elements[element_type]:
+                 self._elements[element_type].remove(element)
+                 logger.debug(f"Removed element from {element_type}: {element}")
+                 return True
+             else:
+                 logger.debug(f"Element not found in {element_type}: {element}")
+                 return False
+         except Exception as e:
+             logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
+             return False
+
+     def has_elements(self) -> bool:
+         """
+         Check if any significant elements (words, rects, lines, regions)
+         have been loaded or added.
+
+         Returns:
+             True if any elements exist, False otherwise.
+         """
+         self.load_elements()
+
+         for key in ["words", "rects", "lines", "regions"]:
+             if self._elements.get(key):
+                 return True
+
+         return False
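
These ElementManager additions are page-level plumbing rather than user-facing API: remove_ocr_elements is what lets replace=True in apply_ocr drop previously added OCR words/chars (anything whose source attribute is "ocr") before new results are written, and that same source attribute is what the text[source=ocr] selector used by correct_ocr above matches. From user code the effect is most visible through the selector. A hedged sketch (the file path is illustrative, and the single-argument PDF(...) constructor call is an assumption not shown in this diff):

from natural_pdf.core.pdf import PDF

pdf = PDF("scanned-report.pdf")                # illustrative path; constructor signature assumed
pdf.apply_ocr(engine="easyocr", replace=True)  # replace=True clears earlier source="ocr" elements first
ocr_text = pdf.find_all("text[source=ocr]", apply_exclusions=False)
print(f"{len(ocr_text.elements)} OCR text elements found")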