natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,28 @@
1
+ import concurrent.futures # Import concurrent.futures
1
2
  import copy # Added for copying options
2
3
  import glob as py_glob
3
4
  import logging
4
5
  import os
5
6
  import re # Added for safe path generation
7
+ import threading # Import threading for logging thread information
8
+ import time # Import time for logging timestamps
6
9
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
8
- import concurrent.futures # Import concurrent.futures
9
- import time # Import time for logging timestamps
10
- import threading # Import threading for logging thread information
10
+ from typing import (
11
+ TYPE_CHECKING,
12
+ Any,
13
+ Callable,
14
+ Dict,
15
+ Generic,
16
+ Iterable,
17
+ Iterator,
18
+ List,
19
+ Optional,
20
+ Set,
21
+ Type,
22
+ TypeVar,
23
+ Union,
24
+ overload,
25
+ )
11
26
 
12
27
  from PIL import Image
13
28
  from tqdm import tqdm
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
26
41
 
27
42
  from natural_pdf.core.pdf import PDF
28
43
  from natural_pdf.elements.region import Region
44
+ from natural_pdf.export.mixin import ExportMixin
29
45
 
30
46
  # --- Search Imports ---
31
47
  try:
@@ -47,12 +63,12 @@ except ImportError as e:
47
63
 
48
64
  SearchServiceProtocol, SearchOptions, Indexable = object, object, object
49
65
 
50
- from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
51
66
  # Import the ApplyMixin
52
67
  from natural_pdf.collections.mixins import ApplyMixin
68
+ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
53
69
 
54
70
 
55
- class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
71
+ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
56
72
  def __init__(
57
73
  self,
58
74
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
252
268
  def __repr__(self) -> str:
253
269
  # Removed search status
254
270
  return f"<PDFCollection(count={len(self._pdfs)})>"
271
+ return f"<PDFCollection(count={len(self._pdfs)})>"
255
272
 
256
273
  @property
257
274
  def pdfs(self) -> List["PDF"]:
258
275
  """Returns the list of PDF objects held by the collection."""
259
276
  return self._pdfs
260
277
 
278
+ @overload
279
+ def find_all(
280
+ self,
281
+ *,
282
+ text: str,
283
+ apply_exclusions: bool = True,
284
+ regex: bool = False,
285
+ case: bool = True,
286
+ **kwargs,
287
+ ) -> "ElementCollection": ...
288
+
289
+ @overload
290
+ def find_all(
291
+ self,
292
+ selector: str,
293
+ *,
294
+ apply_exclusions: bool = True,
295
+ regex: bool = False,
296
+ case: bool = True,
297
+ **kwargs,
298
+ ) -> "ElementCollection": ...
299
+
261
300
  def find_all(
262
- self,
263
- selector: str,
264
- apply_exclusions: bool = True, # Added explicit parameter
265
- regex: bool = False, # Added explicit parameter
266
- case: bool = True, # Added explicit parameter
267
- **kwargs
301
+ self,
302
+ selector: Optional[str] = None, # Now optional
303
+ *,
304
+ text: Optional[str] = None, # New text parameter
305
+ apply_exclusions: bool = True,
306
+ regex: bool = False,
307
+ case: bool = True,
308
+ **kwargs,
268
309
  ) -> "ElementCollection":
269
310
  """
270
- Find all elements matching the selector across all PDFs in the collection.
271
-
311
+ Find all elements matching the selector OR text across all PDFs in the collection.
312
+
313
+ Provide EITHER `selector` OR `text`, but not both.
314
+
272
315
  This creates an ElementCollection that can span multiple PDFs. Note that
273
316
  some ElementCollection methods have limitations when spanning PDFs.
274
-
317
+
275
318
  Args:
276
- selector: CSS-like selector string to query elements
277
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
278
- regex: Whether to use regex for text search in :contains (default: False)
279
- case: Whether to do case-sensitive text search (default: True)
280
- **kwargs: Additional keyword arguments passed to the find_all method of each PDF
281
-
319
+ selector: CSS-like selector string to query elements.
320
+ text: Text content to search for (equivalent to 'text:contains(...)').
321
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
322
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
323
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
324
+ **kwargs: Additional keyword arguments passed to the find_all method of each PDF.
325
+
282
326
  Returns:
283
- ElementCollection containing all matching elements across all PDFs
327
+ ElementCollection containing all matching elements across all PDFs.
284
328
  """
285
- from natural_pdf.elements.collections import ElementCollection
286
-
329
+ # Validation happens within pdf.find_all
330
+
287
331
  # Collect elements from all PDFs
288
332
  all_elements = []
289
333
  for pdf in self._pdfs:
290
334
  try:
291
- # Explicitly pass the relevant arguments down
335
+ # Pass the relevant arguments down to each PDF's find_all
292
336
  elements = pdf.find_all(
293
- selector,
337
+ selector=selector,
338
+ text=text,
294
339
  apply_exclusions=apply_exclusions,
295
340
  regex=regex,
296
341
  case=case,
297
- **kwargs
342
+ **kwargs,
298
343
  )
299
344
  all_elements.extend(elements.elements)
300
345
  except Exception as e:
301
346
  logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
302
-
347
+
303
348
  return ElementCollection(all_elements)
304
349
 
305
350
  def apply_ocr(
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
330
375
  replace: If True, replace existing OCR elements
331
376
  options: Engine-specific options
332
377
  pages: Specific pages to process (None for all pages)
333
- max_workers: Maximum number of threads to process PDFs concurrently.
378
+ max_workers: Maximum number of threads to process PDFs concurrently.
334
379
  If None or 1, processing is sequential. (default: None)
335
380
 
336
381
  Returns:
337
382
  Self for method chaining
338
383
  """
339
384
  PDF = self._get_pdf_class()
340
- logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")
385
+ logger.info(
386
+ f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
387
+ )
341
388
 
342
389
  # Worker function takes PDF object again
343
390
  def _process_pdf(pdf: PDF):
344
391
  """Helper function to apply OCR to a single PDF, handling errors."""
345
- thread_id = threading.current_thread().name # Get thread name for logging
346
- pdf_path = pdf.path # Get path for logging
392
+ thread_id = threading.current_thread().name # Get thread name for logging
393
+ pdf_path = pdf.path # Get path for logging
347
394
  logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
348
395
  start_time = time.monotonic()
349
396
  try:
350
- pdf.apply_ocr( # Call apply_ocr on the original PDF object
397
+ pdf.apply_ocr( # Call apply_ocr on the original PDF object
351
398
  pages=pages,
352
399
  engine=engine,
353
400
  languages=languages,
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
362
409
  # For now, PDF.apply_ocr doesn't have it.
363
410
  )
364
411
  end_time = time.monotonic()
365
- logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
412
+ logger.debug(
413
+ f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
414
+ )
366
415
  return pdf_path, None
367
416
  except Exception as e:
368
417
  end_time = time.monotonic()
369
- logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
370
- return pdf_path, e # Return path and error
418
+ logger.error(
419
+ f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
420
+ exc_info=False,
421
+ )
422
+ return pdf_path, e # Return path and error
371
423
 
372
424
  # Use ThreadPoolExecutor for parallel processing if max_workers > 1
373
425
  if max_workers is not None and max_workers > 1:
374
426
  futures = []
375
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="OCRWorker") as executor:
427
+ with concurrent.futures.ThreadPoolExecutor(
428
+ max_workers=max_workers, thread_name_prefix="OCRWorker"
429
+ ) as executor:
376
430
  for pdf in self._pdfs:
377
431
  # Submit the PDF object to the worker function
378
432
  futures.append(executor.submit(_process_pdf, pdf))
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
382
436
  concurrent.futures.as_completed(futures),
383
437
  total=len(self._pdfs),
384
438
  desc="Applying OCR (Parallel)",
385
- unit="pdf"
439
+ unit="pdf",
386
440
  )
387
-
441
+
388
442
  for future in progress_bar:
389
- pdf_path, error = future.result() # Get result (or exception)
443
+ pdf_path, error = future.result() # Get result (or exception)
390
444
  if error:
391
445
  progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
392
446
  # Progress is updated automatically by tqdm
393
447
 
394
- else: # Sequential processing (max_workers is None or 1)
448
+ else: # Sequential processing (max_workers is None or 1)
395
449
  logger.info("Applying OCR sequentially...")
396
450
  # Use the selected tqdm class for sequential too for consistency
397
451
  # Iterate over PDF objects directly for sequential
398
452
  for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
399
- _process_pdf(pdf) # Call helper directly with PDF object
400
-
453
+ _process_pdf(pdf) # Call helper directly with PDF object
454
+
401
455
  logger.info("Finished applying OCR across the collection.")
402
456
  return self
403
457
 
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
421
475
  Returns:
422
476
  Self for method chaining.
423
477
  """
424
- PDF = self._get_pdf_class() # Ensure PDF class is available
478
+ PDF = self._get_pdf_class() # Ensure PDF class is available
425
479
  if not callable(correction_callback):
426
480
  raise TypeError("`correction_callback` must be a callable function.")
427
481
 
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
436
490
  return self
437
491
 
438
492
  total_elements = len(all_ocr_elements)
439
- logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")
493
+ logger.info(
494
+ f"Found {total_elements} OCR elements across the collection. Starting correction process..."
495
+ )
440
496
 
441
497
  # 2. Initialize the progress bar
442
498
  progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
@@ -450,11 +506,14 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
450
506
  pdf.correct_ocr(
451
507
  correction_callback=correction_callback,
452
508
  max_workers=max_workers,
453
- progress_callback=progress_bar.update # Pass the bar's update method
509
+ progress_callback=progress_bar.update, # Pass the bar's update method
454
510
  )
455
511
  except Exception as e:
456
- logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
457
- # Decide if we should stop or continue? For now, continue.
512
+ logger.error(
513
+ f"Error occurred during correction process for PDF {pdf.path}: {e}",
514
+ exc_info=True,
515
+ )
516
+ # Decide if we should stop or continue? For now, continue.
458
517
 
459
518
  progress_bar.close()
460
519
 
@@ -544,7 +603,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
544
603
  if not categories:
545
604
  raise ValueError("Categories list cannot be empty.")
546
605
 
547
- logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
606
+ logger.info(
607
+ f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
608
+ )
548
609
 
549
610
  # Calculate total pages for the progress bar
550
611
  total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
@@ -553,9 +614,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
553
614
  return self
554
615
 
555
616
  progress_bar = tqdm(
556
- total=total_pages,
557
- desc=f"Classifying Pages (model: {model})",
558
- unit="page"
617
+ total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
559
618
  )
560
619
 
561
620
  # Worker function
@@ -570,15 +629,20 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
570
629
  categories=categories,
571
630
  model=model,
572
631
  progress_callback=progress_bar.update,
573
- **kwargs
632
+ **kwargs,
574
633
  )
575
634
  end_time = time.monotonic()
576
- logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
577
- return pdf_path, None # Return path and no error
635
+ logger.debug(
636
+ f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
637
+ )
638
+ return pdf_path, None # Return path and no error
578
639
  except Exception as e:
579
640
  end_time = time.monotonic()
580
641
  # Error is logged within classify_pages, but log summary here
581
- logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
642
+ logger.error(
643
+ f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
644
+ exc_info=False,
645
+ )
582
646
  # Close progress bar immediately on error to avoid hanging
583
647
  progress_bar.close()
584
648
  # Re-raise the exception to stop the entire collection processing
@@ -589,16 +653,18 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
589
653
  if max_workers is not None and max_workers > 1:
590
654
  logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
591
655
  futures = []
592
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="ClassifyWorker") as executor:
656
+ with concurrent.futures.ThreadPoolExecutor(
657
+ max_workers=max_workers, thread_name_prefix="ClassifyWorker"
658
+ ) as executor:
593
659
  for pdf in self._pdfs:
594
660
  futures.append(executor.submit(_process_pdf_classification, pdf))
595
661
 
596
662
  # Wait for all futures to complete (progress updated by callback)
597
663
  # Exceptions are raised by future.result() if worker failed
598
664
  for future in concurrent.futures.as_completed(futures):
599
- future.result() # Raise exception if worker failed
665
+ future.result() # Raise exception if worker failed
600
666
 
601
- else: # Sequential processing
667
+ else: # Sequential processing
602
668
  logger.info("Classifying PDFs sequentially.")
603
669
  for pdf in self._pdfs:
604
670
  _process_pdf_classification(pdf)
@@ -606,12 +672,59 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
606
672
  logger.info("Finished classification across the collection.")
607
673
 
608
674
  finally:
609
- # Ensure progress bar is closed even if errors occurred elsewhere
610
- if not progress_bar.disable and progress_bar.n < progress_bar.total:
611
- progress_bar.close()
612
- elif progress_bar.disable is False:
613
- progress_bar.close()
675
+ # Ensure progress bar is closed even if errors occurred elsewhere
676
+ if not progress_bar.disable and progress_bar.n < progress_bar.total:
677
+ progress_bar.close()
678
+ elif progress_bar.disable is False:
679
+ progress_bar.close()
614
680
 
615
681
  return self
616
682
 
617
683
  # --- End Classification Method --- #
684
+
685
+ def _gather_analysis_data(
686
+ self,
687
+ analysis_keys: List[str],
688
+ include_content: bool,
689
+ include_images: bool,
690
+ image_dir: Optional[Path],
691
+ image_format: str,
692
+ image_resolution: int,
693
+ ) -> List[Dict[str, Any]]:
694
+ """
695
+ Gather analysis data from all PDFs in the collection.
696
+
697
+ Args:
698
+ analysis_keys: Keys in the analyses dictionary to export
699
+ include_content: Whether to include extracted text
700
+ include_images: Whether to export images
701
+ image_dir: Directory to save images
702
+ image_format: Format to save images
703
+ image_resolution: Resolution for exported images
704
+
705
+ Returns:
706
+ List of dictionaries containing analysis data
707
+ """
708
+ if not self._pdfs:
709
+ logger.warning("No PDFs found in collection")
710
+ return []
711
+
712
+ all_data = []
713
+
714
+ for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
715
+ # PDF level data
716
+ pdf_data = {
717
+ "pdf_path": pdf.path,
718
+ "pdf_filename": Path(pdf.path).name,
719
+ "total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
720
+ }
721
+
722
+ # Add metadata if available
723
+ if hasattr(pdf, "metadata") and pdf.metadata:
724
+ for k, v in pdf.metadata.items():
725
+ if v: # Only add non-empty metadata
726
+ pdf_data[f"metadata.{k}"] = str(v)
727
+
728
+ all_data.append(pdf_data)
729
+
730
+ return all_data
@@ -544,56 +544,56 @@ class ElementManager:
544
544
  """
545
545
  Remove all elements with source="ocr" from the elements dictionary.
546
546
  This should be called before adding new OCR elements if replacement is desired.
547
-
547
+
548
548
  Returns:
549
549
  int: Number of OCR elements removed
550
550
  """
551
551
  # Load elements if not already loaded
552
552
  self.load_elements()
553
-
553
+
554
554
  removed_count = 0
555
-
555
+
556
556
  # Filter out OCR elements from words
557
557
  if "words" in self._elements:
558
558
  original_len = len(self._elements["words"])
559
559
  self._elements["words"] = [
560
- word for word in self._elements["words"]
561
- if getattr(word, "source", None) != "ocr"
560
+ word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
562
561
  ]
563
562
  removed_count += original_len - len(self._elements["words"])
564
-
563
+
565
564
  # Filter out OCR elements from chars
566
565
  if "chars" in self._elements:
567
566
  original_len = len(self._elements["chars"])
568
567
  self._elements["chars"] = [
569
- char for char in self._elements["chars"]
570
- if (isinstance(char, dict) and char.get("source") != "ocr") or
571
- (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
568
+ char
569
+ for char in self._elements["chars"]
570
+ if (isinstance(char, dict) and char.get("source") != "ocr")
571
+ or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
572
572
  ]
573
573
  removed_count += original_len - len(self._elements["chars"])
574
-
574
+
575
575
  logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
576
576
  return removed_count
577
577
 
578
578
  def remove_element(self, element, element_type="words"):
579
579
  """
580
580
  Remove a specific element from the managed elements.
581
-
581
+
582
582
  Args:
583
583
  element: The element to remove
584
584
  element_type: The type of element ('words', 'chars', etc.)
585
-
585
+
586
586
  Returns:
587
587
  bool: True if removed successfully, False otherwise
588
588
  """
589
589
  # Load elements if not already loaded
590
590
  self.load_elements()
591
-
591
+
592
592
  # Check if the collection exists
593
593
  if element_type not in self._elements:
594
594
  logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
595
595
  return False
596
-
596
+
597
597
  # Try to remove the element
598
598
  try:
599
599
  if element in self._elements[element_type]:
@@ -606,3 +606,19 @@ class ElementManager:
606
606
  except Exception as e:
607
607
  logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
608
608
  return False
609
+
610
+ def has_elements(self) -> bool:
611
+ """
612
+ Check if any significant elements (words, rects, lines, regions)
613
+ have been loaded or added.
614
+
615
+ Returns:
616
+ True if any elements exist, False otherwise.
617
+ """
618
+ self.load_elements()
619
+
620
+ for key in ["words", "rects", "lines", "regions"]:
621
+ if self._elements.get(key):
622
+ return True
623
+
624
+ return False
@@ -18,7 +18,12 @@ except ImportError:
18
18
  Page = Any # Fallback if circular import issue arises during type checking
19
19
 
20
20
  # Import ColorManager and related utils
21
- from natural_pdf.utils.visualization import ColorManager, create_legend, merge_images_with_legend
21
+ from natural_pdf.utils.visualization import (
22
+ ColorManager,
23
+ create_legend,
24
+ merge_images_with_legend,
25
+ render_plain_page,
26
+ )
22
27
 
23
28
  # Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
24
29
  BORDER_ALPHA = 180 # Default alpha for highlight border
@@ -622,28 +627,14 @@ class HighlightingService:
622
627
  return None
623
628
 
624
629
  page = self._pdf[page_index]
625
- highlights_on_page = self.get_highlights_for_page(
626
- page_index
627
- ) # This list will be empty if clear_page was called
630
+ highlights_on_page = self.get_highlights_for_page(page_index)
628
631
 
629
- # --- Get Base Image ---
630
- try:
631
- render_resolution = resolution if resolution is not None else scale * 72
632
- img_object = page._page.to_image(resolution=render_resolution, **kwargs)
633
- base_image = img_object.annotated
634
- if not isinstance(base_image, Image.Image):
635
- png_data = img_object._repr_png_()
636
- if png_data:
637
- base_image = Image.open(io.BytesIO(png_data)).convert("RGB")
638
- else:
639
- raise ValueError("Could not extract base PIL image from pdfplumber.")
640
- base_image = base_image.convert("RGBA")
641
- logger.debug(
642
- f"Base image for page {page_index} rendered with resolution {render_resolution}."
643
- )
644
- except Exception as e:
645
- logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
646
- return None
632
+ render_resolution = resolution if resolution is not None else scale * 72
633
+ base_image = render_plain_page(page, render_resolution)
634
+ base_image = base_image.convert("RGBA")
635
+ logger.debug(
636
+ f"Base image for page {page_index} rendered with resolution {render_resolution}."
637
+ )
647
638
 
648
639
  # --- Render Highlights ---
649
640
  rendered_image: Image.Image