natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +241 -158
  13. natural_pdf/classification/mixin.py +52 -38
  14. natural_pdf/classification/results.py +71 -45
  15. natural_pdf/collections/mixins.py +85 -20
  16. natural_pdf/collections/pdf_collection.py +245 -100
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +694 -195
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +610 -134
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.10.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,28 @@
1
+ import concurrent.futures # Import concurrent.futures
1
2
  import copy # Added for copying options
2
3
  import glob as py_glob
3
4
  import logging
4
5
  import os
5
6
  import re # Added for safe path generation
7
+ import threading # Import threading for logging thread information
8
+ import time # Import time for logging timestamps
6
9
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Type, Union, Callable
8
- import concurrent.futures # Import concurrent.futures
9
- import time # Import time for logging timestamps
10
- import threading # Import threading for logging thread information
10
+ from typing import (
11
+ TYPE_CHECKING,
12
+ Any,
13
+ Callable,
14
+ Dict,
15
+ Generic,
16
+ Iterable,
17
+ Iterator,
18
+ List,
19
+ Optional,
20
+ Set,
21
+ Type,
22
+ TypeVar,
23
+ Union,
24
+ overload,
25
+ )
11
26
 
12
27
  from PIL import Image
13
28
  from tqdm import tqdm
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
26
41
 
27
42
  from natural_pdf.core.pdf import PDF
28
43
  from natural_pdf.elements.region import Region
44
+ from natural_pdf.export.mixin import ExportMixin
29
45
 
30
46
  # --- Search Imports ---
31
47
  try:
@@ -47,12 +63,12 @@ except ImportError as e:
47
63
 
48
64
  SearchServiceProtocol, SearchOptions, Indexable = object, object, object
49
65
 
50
- from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
51
66
  # Import the ApplyMixin
52
67
  from natural_pdf.collections.mixins import ApplyMixin
68
+ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
53
69
 
54
70
 
55
- class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
71
+ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
56
72
  def __init__(
57
73
  self,
58
74
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
252
268
  def __repr__(self) -> str:
253
269
  # Removed search status
254
270
  return f"<PDFCollection(count={len(self._pdfs)})>"
271
+ return f"<PDFCollection(count={len(self._pdfs)})>"
255
272
 
256
273
  @property
257
274
  def pdfs(self) -> List["PDF"]:
258
275
  """Returns the list of PDF objects held by the collection."""
259
276
  return self._pdfs
260
277
 
278
+ @overload
279
+ def find_all(
280
+ self,
281
+ *,
282
+ text: str,
283
+ apply_exclusions: bool = True,
284
+ regex: bool = False,
285
+ case: bool = True,
286
+ **kwargs,
287
+ ) -> "ElementCollection": ...
288
+
289
+ @overload
261
290
  def find_all(
262
- self,
263
- selector: str,
264
- apply_exclusions: bool = True, # Added explicit parameter
265
- regex: bool = False, # Added explicit parameter
266
- case: bool = True, # Added explicit parameter
267
- **kwargs
291
+ self,
292
+ selector: str,
293
+ *,
294
+ apply_exclusions: bool = True,
295
+ regex: bool = False,
296
+ case: bool = True,
297
+ **kwargs,
298
+ ) -> "ElementCollection": ...
299
+
300
+ def find_all(
301
+ self,
302
+ selector: Optional[str] = None, # Now optional
303
+ *,
304
+ text: Optional[str] = None, # New text parameter
305
+ apply_exclusions: bool = True,
306
+ regex: bool = False,
307
+ case: bool = True,
308
+ **kwargs,
268
309
  ) -> "ElementCollection":
269
310
  """
270
- Find all elements matching the selector across all PDFs in the collection.
271
-
311
+ Find all elements matching the selector OR text across all PDFs in the collection.
312
+
313
+ Provide EITHER `selector` OR `text`, but not both.
314
+
272
315
  This creates an ElementCollection that can span multiple PDFs. Note that
273
316
  some ElementCollection methods have limitations when spanning PDFs.
274
-
317
+
275
318
  Args:
276
- selector: CSS-like selector string to query elements
277
- apply_exclusions: Whether to exclude elements in exclusion regions (default: True)
278
- regex: Whether to use regex for text search in :contains (default: False)
279
- case: Whether to do case-sensitive text search (default: True)
280
- **kwargs: Additional keyword arguments passed to the find_all method of each PDF
281
-
319
+ selector: CSS-like selector string to query elements.
320
+ text: Text content to search for (equivalent to 'text:contains(...)').
321
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
322
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
323
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
324
+ **kwargs: Additional keyword arguments passed to the find_all method of each PDF.
325
+
282
326
  Returns:
283
- ElementCollection containing all matching elements across all PDFs
327
+ ElementCollection containing all matching elements across all PDFs.
284
328
  """
285
- from natural_pdf.elements.collections import ElementCollection
286
-
329
+ # Validation happens within pdf.find_all
330
+
287
331
  # Collect elements from all PDFs
288
332
  all_elements = []
289
333
  for pdf in self._pdfs:
290
334
  try:
291
- # Explicitly pass the relevant arguments down
335
+ # Pass the relevant arguments down to each PDF's find_all
292
336
  elements = pdf.find_all(
293
- selector,
337
+ selector=selector,
338
+ text=text,
294
339
  apply_exclusions=apply_exclusions,
295
340
  regex=regex,
296
341
  case=case,
297
- **kwargs
342
+ **kwargs,
298
343
  )
299
344
  all_elements.extend(elements.elements)
300
345
  except Exception as e:
301
346
  logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
302
-
347
+
303
348
  return ElementCollection(all_elements)
304
349
 
305
350
  def apply_ocr(
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
330
375
  replace: If True, replace existing OCR elements
331
376
  options: Engine-specific options
332
377
  pages: Specific pages to process (None for all pages)
333
- max_workers: Maximum number of threads to process PDFs concurrently.
378
+ max_workers: Maximum number of threads to process PDFs concurrently.
334
379
  If None or 1, processing is sequential. (default: None)
335
380
 
336
381
  Returns:
337
382
  Self for method chaining
338
383
  """
339
384
  PDF = self._get_pdf_class()
340
- logger.info(f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})...")
385
+ logger.info(
386
+ f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
387
+ )
341
388
 
342
389
  # Worker function takes PDF object again
343
390
  def _process_pdf(pdf: PDF):
344
391
  """Helper function to apply OCR to a single PDF, handling errors."""
345
- thread_id = threading.current_thread().name # Get thread name for logging
346
- pdf_path = pdf.path # Get path for logging
392
+ thread_id = threading.current_thread().name # Get thread name for logging
393
+ pdf_path = pdf.path # Get path for logging
347
394
  logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
348
395
  start_time = time.monotonic()
349
396
  try:
350
- pdf.apply_ocr( # Call apply_ocr on the original PDF object
397
+ pdf.apply_ocr( # Call apply_ocr on the original PDF object
351
398
  pages=pages,
352
399
  engine=engine,
353
400
  languages=languages,
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
362
409
  # For now, PDF.apply_ocr doesn't have it.
363
410
  )
364
411
  end_time = time.monotonic()
365
- logger.debug(f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
412
+ logger.debug(
413
+ f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
414
+ )
366
415
  return pdf_path, None
367
416
  except Exception as e:
368
417
  end_time = time.monotonic()
369
- logger.error(f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
370
- return pdf_path, e # Return path and error
418
+ logger.error(
419
+ f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
420
+ exc_info=False,
421
+ )
422
+ return pdf_path, e # Return path and error
371
423
 
372
424
  # Use ThreadPoolExecutor for parallel processing if max_workers > 1
373
425
  if max_workers is not None and max_workers > 1:
374
426
  futures = []
375
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="OCRWorker") as executor:
427
+ with concurrent.futures.ThreadPoolExecutor(
428
+ max_workers=max_workers, thread_name_prefix="OCRWorker"
429
+ ) as executor:
376
430
  for pdf in self._pdfs:
377
431
  # Submit the PDF object to the worker function
378
432
  futures.append(executor.submit(_process_pdf, pdf))
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
382
436
  concurrent.futures.as_completed(futures),
383
437
  total=len(self._pdfs),
384
438
  desc="Applying OCR (Parallel)",
385
- unit="pdf"
439
+ unit="pdf",
386
440
  )
387
-
441
+
388
442
  for future in progress_bar:
389
- pdf_path, error = future.result() # Get result (or exception)
443
+ pdf_path, error = future.result() # Get result (or exception)
390
444
  if error:
391
445
  progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
392
446
  # Progress is updated automatically by tqdm
393
447
 
394
- else: # Sequential processing (max_workers is None or 1)
448
+ else: # Sequential processing (max_workers is None or 1)
395
449
  logger.info("Applying OCR sequentially...")
396
450
  # Use the selected tqdm class for sequential too for consistency
397
451
  # Iterate over PDF objects directly for sequential
398
452
  for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
399
- _process_pdf(pdf) # Call helper directly with PDF object
400
-
453
+ _process_pdf(pdf) # Call helper directly with PDF object
454
+
401
455
  logger.info("Finished applying OCR across the collection.")
402
456
  return self
403
457
 
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
421
475
  Returns:
422
476
  Self for method chaining.
423
477
  """
424
- PDF = self._get_pdf_class() # Ensure PDF class is available
478
+ PDF = self._get_pdf_class() # Ensure PDF class is available
425
479
  if not callable(correction_callback):
426
480
  raise TypeError("`correction_callback` must be a callable function.")
427
481
 
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
436
490
  return self
437
491
 
438
492
  total_elements = len(all_ocr_elements)
439
- logger.info(f"Found {total_elements} OCR elements across the collection. Starting correction process...")
493
+ logger.info(
494
+ f"Found {total_elements} OCR elements across the collection. Starting correction process..."
495
+ )
440
496
 
441
497
  # 2. Initialize the progress bar
442
498
  progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
@@ -450,17 +506,20 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
450
506
  pdf.correct_ocr(
451
507
  correction_callback=correction_callback,
452
508
  max_workers=max_workers,
453
- progress_callback=progress_bar.update # Pass the bar's update method
509
+ progress_callback=progress_bar.update, # Pass the bar's update method
454
510
  )
455
511
  except Exception as e:
456
- logger.error(f"Error occurred during correction process for PDF {pdf.path}: {e}", exc_info=True)
457
- # Decide if we should stop or continue? For now, continue.
512
+ logger.error(
513
+ f"Error occurred during correction process for PDF {pdf.path}: {e}",
514
+ exc_info=True,
515
+ )
516
+ # Decide if we should stop or continue? For now, continue.
458
517
 
459
518
  progress_bar.close()
460
519
 
461
520
  return self
462
521
 
463
- def categorize(self, categories: List[str], **kwargs):
522
+ def categorize(self, labels: List[str], **kwargs):
464
523
  """Categorizes PDFs in the collection based on content or features."""
465
524
  # Implementation requires integrating with classification models or logic
466
525
  raise NotImplementedError("categorize requires classification implementation.")
@@ -511,107 +570,193 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
511
570
  # --- Classification Method --- #
512
571
  def classify_all(
513
572
  self,
514
- categories: List[str],
515
- model: str = "text",
573
+ labels: List[str],
574
+ using: Optional[str] = None, # Default handled by PDF.classify -> manager
575
+ model: Optional[str] = None, # Optional model ID
516
576
  max_workers: Optional[int] = None,
577
+ analysis_key: str = "classification", # Key for storing result in PDF.analyses
517
578
  **kwargs,
518
579
  ) -> "PDFCollection":
519
580
  """
520
- Classify all pages across all PDFs in the collection, potentially in parallel.
581
+ Classify each PDF document in the collection, potentially in parallel.
521
582
 
522
- This method uses the unified `classify_all` approach, delegating page
523
- classification to each PDF's `classify_pages` method.
524
- It displays a progress bar tracking individual pages.
583
+ This method delegates classification to each PDF object's `classify` method.
584
+ By default, uses the full extracted text of the PDF.
585
+ If `using='vision'`, it classifies the first page's image, but ONLY if
586
+ the PDF has a single page (raises ValueError otherwise).
525
587
 
526
588
  Args:
527
- categories: A list of string category names.
528
- model: Model identifier ('text', 'vision', or specific HF ID).
589
+ labels: A list of string category names.
590
+ using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
591
+ model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
529
592
  max_workers: Maximum number of threads to process PDFs concurrently.
530
593
  If None or 1, processing is sequential.
531
- **kwargs: Additional arguments passed down to `pdf.classify_pages` and
532
- subsequently to `page.classify` (e.g., device,
533
- confidence_threshold, resolution).
594
+ analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
595
+ **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
596
+ min_confidence, multi_label, text extraction options).
534
597
 
535
598
  Returns:
536
599
  Self for method chaining.
537
600
 
538
601
  Raises:
539
- ValueError: If categories list is empty.
540
- ClassificationError: If classification fails for any page (will stop processing).
602
+ ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
603
+ ClassificationError: If classification fails for any PDF (will stop processing).
541
604
  ImportError: If classification dependencies are missing.
542
605
  """
543
606
  PDF = self._get_pdf_class()
544
- if not categories:
545
- raise ValueError("Categories list cannot be empty.")
546
-
547
- logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
607
+ if not labels:
608
+ raise ValueError("Labels list cannot be empty.")
548
609
 
549
- # Calculate total pages for the progress bar
550
- total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
551
- if total_pages == 0:
552
- logger.warning("No pages found in the PDF collection to classify.")
610
+ if not self._pdfs:
611
+ logger.warning("PDFCollection is empty, skipping classification.")
553
612
  return self
554
613
 
614
+ mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
615
+ logger.info(
616
+ f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
617
+ )
618
+
555
619
  progress_bar = tqdm(
556
- total=total_pages,
557
- desc=f"Classifying Pages (model: {model})",
558
- unit="page"
620
+ total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
559
621
  )
560
622
 
561
623
  # Worker function
562
624
  def _process_pdf_classification(pdf: PDF):
563
625
  thread_id = threading.current_thread().name
564
626
  pdf_path = pdf.path
565
- logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
627
+ logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
566
628
  start_time = time.monotonic()
567
629
  try:
568
- # Call classify_pages on the PDF, passing the progress callback
569
- pdf.classify_pages(
570
- categories=categories,
630
+ # Call classify directly on the PDF object
631
+ pdf.classify(
632
+ labels=labels,
633
+ using=using,
571
634
  model=model,
572
- progress_callback=progress_bar.update,
573
- **kwargs
635
+ analysis_key=analysis_key,
636
+ **kwargs, # Pass other relevant args like min_confidence, multi_label
637
+ )
638
+ end_time = time.monotonic()
639
+ logger.debug(
640
+ f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
574
641
  )
642
+ progress_bar.update(1) # Update progress bar upon success
643
+ return pdf_path, None # Return path and no error
644
+ except ValueError as ve:
645
+ # Catch specific error for vision on multi-page PDF
575
646
  end_time = time.monotonic()
576
- logger.debug(f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)")
577
- return pdf_path, None # Return path and no error
647
+ logger.error(
648
+ f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
649
+ exc_info=False,
650
+ )
651
+ progress_bar.update(1) # Still update progress bar
652
+ return pdf_path, ve # Return the specific ValueError
578
653
  except Exception as e:
579
654
  end_time = time.monotonic()
580
- # Error is logged within classify_pages, but log summary here
581
- logger.error(f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}", exc_info=False)
582
- # Close progress bar immediately on error to avoid hanging
583
- progress_bar.close()
655
+ logger.error(
656
+ f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
657
+ exc_info=True, # Log full traceback for unexpected errors
658
+ )
659
+ # Close progress bar immediately on critical error to avoid hanging
660
+ if not progress_bar.disable:
661
+ progress_bar.close()
584
662
  # Re-raise the exception to stop the entire collection processing
585
- raise
663
+ raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
586
664
 
587
665
  # Use ThreadPoolExecutor for parallel processing if max_workers > 1
666
+ processed_count = 0
667
+ skipped_count = 0
588
668
  try:
589
669
  if max_workers is not None and max_workers > 1:
590
670
  logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
591
671
  futures = []
592
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix="ClassifyWorker") as executor:
672
+ with concurrent.futures.ThreadPoolExecutor(
673
+ max_workers=max_workers, thread_name_prefix="ClassifyWorker"
674
+ ) as executor:
593
675
  for pdf in self._pdfs:
594
676
  futures.append(executor.submit(_process_pdf_classification, pdf))
595
677
 
596
- # Wait for all futures to complete (progress updated by callback)
597
- # Exceptions are raised by future.result() if worker failed
678
+ # Wait for all futures to complete
679
+ # Progress updated within worker
598
680
  for future in concurrent.futures.as_completed(futures):
599
- future.result() # Raise exception if worker failed
600
-
601
- else: # Sequential processing
681
+ processed_count += 1
682
+ pdf_path, error = (
683
+ future.result()
684
+ ) # Raise ClassificationError if worker failed critically
685
+ if isinstance(error, ValueError):
686
+ # Logged in worker, just count as skipped
687
+ skipped_count += 1
688
+
689
+ else: # Sequential processing
602
690
  logger.info("Classifying PDFs sequentially.")
603
691
  for pdf in self._pdfs:
604
- _process_pdf_classification(pdf)
605
-
606
- logger.info("Finished classification across the collection.")
692
+ processed_count += 1
693
+ pdf_path, error = _process_pdf_classification(
694
+ pdf
695
+ ) # Raise ClassificationError if worker failed critically
696
+ if isinstance(error, ValueError):
697
+ skipped_count += 1
698
+
699
+ final_message = (
700
+ f"Finished classification across the collection. Processed: {processed_count}"
701
+ )
702
+ if skipped_count > 0:
703
+ final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
704
+ logger.info(final_message + ".")
607
705
 
608
706
  finally:
609
- # Ensure progress bar is closed even if errors occurred elsewhere
610
- if not progress_bar.disable and progress_bar.n < progress_bar.total:
611
- progress_bar.close()
612
- elif progress_bar.disable is False:
613
- progress_bar.close()
707
+ # Ensure progress bar is closed properly
708
+ if not progress_bar.disable and progress_bar.n < progress_bar.total:
709
+ progress_bar.n = progress_bar.total # Ensure it reaches 100%
710
+ if not progress_bar.disable:
711
+ progress_bar.close()
614
712
 
615
713
  return self
616
714
 
617
715
  # --- End Classification Method --- #
716
+
717
+ def _gather_analysis_data(
718
+ self,
719
+ analysis_keys: List[str],
720
+ include_content: bool,
721
+ include_images: bool,
722
+ image_dir: Optional[Path],
723
+ image_format: str,
724
+ image_resolution: int,
725
+ ) -> List[Dict[str, Any]]:
726
+ """
727
+ Gather analysis data from all PDFs in the collection.
728
+
729
+ Args:
730
+ analysis_keys: Keys in the analyses dictionary to export
731
+ include_content: Whether to include extracted text
732
+ include_images: Whether to export images
733
+ image_dir: Directory to save images
734
+ image_format: Format to save images
735
+ image_resolution: Resolution for exported images
736
+
737
+ Returns:
738
+ List of dictionaries containing analysis data
739
+ """
740
+ if not self._pdfs:
741
+ logger.warning("No PDFs found in collection")
742
+ return []
743
+
744
+ all_data = []
745
+
746
+ for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
747
+ # PDF level data
748
+ pdf_data = {
749
+ "pdf_path": pdf.path,
750
+ "pdf_filename": Path(pdf.path).name,
751
+ "total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
752
+ }
753
+
754
+ # Add metadata if available
755
+ if hasattr(pdf, "metadata") and pdf.metadata:
756
+ for k, v in pdf.metadata.items():
757
+ if v: # Only add non-empty metadata
758
+ pdf_data[f"metadata.{k}"] = str(v)
759
+
760
+ all_data.append(pdf_data)
761
+
762
+ return all_data
@@ -544,56 +544,56 @@ class ElementManager:
544
544
  """
545
545
  Remove all elements with source="ocr" from the elements dictionary.
546
546
  This should be called before adding new OCR elements if replacement is desired.
547
-
547
+
548
548
  Returns:
549
549
  int: Number of OCR elements removed
550
550
  """
551
551
  # Load elements if not already loaded
552
552
  self.load_elements()
553
-
553
+
554
554
  removed_count = 0
555
-
555
+
556
556
  # Filter out OCR elements from words
557
557
  if "words" in self._elements:
558
558
  original_len = len(self._elements["words"])
559
559
  self._elements["words"] = [
560
- word for word in self._elements["words"]
561
- if getattr(word, "source", None) != "ocr"
560
+ word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
562
561
  ]
563
562
  removed_count += original_len - len(self._elements["words"])
564
-
563
+
565
564
  # Filter out OCR elements from chars
566
565
  if "chars" in self._elements:
567
566
  original_len = len(self._elements["chars"])
568
567
  self._elements["chars"] = [
569
- char for char in self._elements["chars"]
570
- if (isinstance(char, dict) and char.get("source") != "ocr") or
571
- (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
568
+ char
569
+ for char in self._elements["chars"]
570
+ if (isinstance(char, dict) and char.get("source") != "ocr")
571
+ or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
572
572
  ]
573
573
  removed_count += original_len - len(self._elements["chars"])
574
-
574
+
575
575
  logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
576
576
  return removed_count
577
577
 
578
578
  def remove_element(self, element, element_type="words"):
579
579
  """
580
580
  Remove a specific element from the managed elements.
581
-
581
+
582
582
  Args:
583
583
  element: The element to remove
584
584
  element_type: The type of element ('words', 'chars', etc.)
585
-
585
+
586
586
  Returns:
587
587
  bool: True if removed successfully, False otherwise
588
588
  """
589
589
  # Load elements if not already loaded
590
590
  self.load_elements()
591
-
591
+
592
592
  # Check if the collection exists
593
593
  if element_type not in self._elements:
594
594
  logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
595
595
  return False
596
-
596
+
597
597
  # Try to remove the element
598
598
  try:
599
599
  if element in self._elements[element_type]:
@@ -606,3 +606,19 @@ class ElementManager:
606
606
  except Exception as e:
607
607
  logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
608
608
  return False
609
+
610
+ def has_elements(self) -> bool:
611
+ """
612
+ Check if any significant elements (words, rects, lines, regions)
613
+ have been loaded or added.
614
+
615
+ Returns:
616
+ True if any elements exist, False otherwise.
617
+ """
618
+ self.load_elements()
619
+
620
+ for key in ["words", "rects", "lines", "regions"]:
621
+ if self._elements.get(key):
622
+ return True
623
+
624
+ return False