natural-pdf 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

natural_pdf/classification/manager.py CHANGED
@@ -161,7 +161,7 @@ class ClassificationManager:
     def classify_item(
         self,
         item_content: Union[str, Image.Image],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -193,13 +193,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )

-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")

         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()
         parameters = {  # Store parameters used for this run
-            "categories": categories,
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -214,7 +214,7 @@ class ClassificationManager:
             # Handle potential kwargs for specific pipelines if needed
             # The zero-shot pipelines expect `candidate_labels`
             result_raw = pipeline_instance(
-                item_content, candidate_labels=categories, multi_label=multi_label, **kwargs
+                item_content, candidate_labels=labels, multi_label=multi_label, **kwargs
             )
             logger.debug(f"Raw pipeline result: {result_raw}")

@@ -226,7 +226,7 @@ class ClassificationManager:
                 for label, score_val in zip(result_raw["labels"], result_raw["scores"]):
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label=label, confidence=score_val))
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -241,7 +241,7 @@ class ClassificationManager:
                     label = item["label"]
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label=label, confidence=score_val))
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -253,13 +253,15 @@ class ClassificationManager:
                 # Return empty result?
                 # scores_list = []

-            return ClassificationResult(
+            # ClassificationResult now calculates top score/category internally
+            result_obj = ClassificationResult(
+                scores=scores_list,  # Pass the filtered list
                 model_id=model_id,
                 using=effective_using,
-                timestamp=timestamp,
                 parameters=parameters,
-                scores=scores_list,
+                timestamp=timestamp,
             )
+            return result_obj
             # --- End Processing --- #

         except Exception as e:
@@ -273,7 +275,7 @@ class ClassificationManager:
     def classify_batch(
         self,
         item_contents: List[Union[str, Image.Image]],
-        categories: List[str],
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -307,13 +309,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )

-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")

         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()  # Single timestamp for the batch run
         parameters = {  # Parameters for the whole batch
-            "categories": categories,
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -331,7 +333,7 @@ class ClassificationManager:
             # Use pipeline directly for batching
             results_iterator = pipeline_instance(
                 item_contents,
-                candidate_labels=categories,
+                candidate_labels=labels,
                 multi_label=multi_label,
                 batch_size=batch_size,
                 **kwargs,
@@ -362,9 +364,7 @@ class ClassificationManager:
                     for label, score_val in zip(raw_result["labels"], raw_result["scores"]):
                         if score_val >= min_confidence:
                             try:
-                                scores_list.append(
-                                    CategoryScore(label=label, confidence=score_val)
-                                )
+                                scores_list.append(CategoryScore(label, score_val))
                             except (ValueError, TypeError) as score_err:
                                 logger.warning(
                                     f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}"
@@ -376,9 +376,7 @@ class ClassificationManager:
                         score_val = item["score"]
                         label = item["label"]
                         if score_val >= min_confidence:
-                            scores_list.append(
-                                CategoryScore(label=label, confidence=score_val)
-                            )
+                            scores_list.append(CategoryScore(label, score_val))
                     except (KeyError, ValueError, TypeError) as item_err:
                         logger.warning(
                             f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}"
@@ -394,14 +392,20 @@ class ClassificationManager:
                     )
                     # scores_list remains empty for this item

+                # --- Determine top category and score ---
+                scores_list.sort(key=lambda s: s.score, reverse=True)
+                top_category = scores_list[0].label
+                top_score = scores_list[0].score
+                # --- End Determine top category ---
+
                 # Append result object for this item
                 batch_results_list.append(
                     ClassificationResult(
+                        scores=scores_list,  # Pass the full list, init will sort/filter
                         model_id=model_id,
                         using=effective_using,
                         timestamp=timestamp,  # Use same timestamp for batch
                         parameters=parameters,  # Use same params for batch
-                        scores=scores_list,
                     )
                 )
                 # --- End Processing --- #
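
In short, callers of the manager now pass `labels` instead of `categories`, and `CategoryScore` is built positionally. A minimal sketch of the updated call shape, assuming a `manager` instance already exists (for example one obtained internally via `pdf.get_manager("classification")`) along with some already-extracted `text`; the label values are placeholders:

    result = manager.classify_item(
        item_content=text,
        labels=["invoice", "receipt", "letter"],  # keyword was `categories=` in 0.1.9
        min_confidence=0.2,
    )
    print(result.category, result.score)  # top label and its confidence
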
natural_pdf/classification/mixin.py CHANGED
@@ -44,9 +44,9 @@ class ClassificationMixin:

     def classify(
         self,
-        categories: List[str],
-        model: Optional[str] = None,  # Default handled by manager
-        using: Optional[str] = None,  # Renamed parameter
+        labels: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
         min_confidence: float = 0.0,
         analysis_key: str = "classification",  # Default key
         multi_label: bool = False,
@@ -60,7 +60,7 @@ class ClassificationMixin:
        result under that key.

        Args:
-            categories: A list of string category names.
+            labels: A list of string category names.
            model: Model identifier (e.g., 'text', 'vision', HF ID). Defaults handled by manager.
            using: Optional processing mode ('text' or 'vision'). If None, inferred by manager.
            min_confidence: Minimum confidence threshold for results (0.0-1.0).
@@ -103,9 +103,9 @@ class ClassificationMixin:
        # Manager now returns a ClassificationResult object
        result_obj: ClassificationResult = manager.classify_item(
            item_content=content,
-            categories=categories,
-            model_id=effective_model_id,  # Pass the resolved model ID
-            using=inferred_using,  # Pass renamed argument
+            labels=labels,
+            model_id=effective_model_id,
+            using=inferred_using,
            min_confidence=min_confidence,
            multi_label=multi_label,
            **kwargs,
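
For objects that expose ClassificationMixin (pages, regions, element collections), the public keyword therefore changes from `categories` to `labels`. A hedged usage sketch, assuming the usual top-level `PDF(path)` entry point and a hypothetical file name:

    from natural_pdf import PDF

    pdf = PDF("document.pdf")  # hypothetical path
    page = pdf.pages[0]
    page.classify(labels=["invoice", "receipt"], using="text", min_confidence=0.1)
    result = page.analyses["classification"]  # stored under the default analysis_key
    print(result.category, result.score)
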
natural_pdf/classification/results.py CHANGED
@@ -11,19 +11,19 @@ logger = logging.getLogger(__name__)
 class CategoryScore:
     """Represents a category and its confidence score from classification."""

-    category: str
+    label: str
     score: float

     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
-        return {"category": self.category, "score": self.score}
+        return {"category": self.label, "score": self.score}


 @dataclass
 class ClassificationResult:
     """Results from a classification operation."""

-    category: str
+    category: Optional[str]  # Can be None if scores are empty
     score: float
     scores: List[CategoryScore]
     model_id: str
@@ -33,17 +33,25 @@ class ClassificationResult:

     def __init__(
         self,
-        category: str,
-        score: float,
-        scores: List[CategoryScore],
+        scores: List[CategoryScore],  # Now the primary source
         model_id: str,
         using: str,
         parameters: Optional[Dict[str, Any]] = None,
         timestamp: Optional[datetime] = None,
     ):
-        self.category = category
-        self.score = score
-        self.scores = scores
+        # Determine top category and score from the scores list
+        if scores:
+            # Sort scores descending by score to find the top one
+            sorted_scores = sorted(scores, key=lambda s: s.score, reverse=True)
+            self.category = sorted_scores[0].label
+            self.score = sorted_scores[0].score
+            self.scores = sorted_scores  # Store the sorted list
+        else:
+            # Handle empty scores list
+            self.category = None
+            self.score = 0.0
+            self.scores = []  # Store empty list
+
         self.model_id = model_id
         self.using = using
         self.parameters = parameters or {}
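
With this change the top category and score are derived from the `scores` list instead of being passed in, and an empty list yields `category=None`. An illustrative sketch, constructing the objects by hand (in normal use the manager builds them; the model id string here is arbitrary):

    from natural_pdf.classification.results import CategoryScore, ClassificationResult

    scores = [CategoryScore("invoice", 0.91), CategoryScore("receipt", 0.07)]
    result = ClassificationResult(scores=scores, model_id="some-model", using="text")
    print(result.category, result.score)  # invoice 0.91 (highest-scoring label comes first)
    print(scores[0].to_dict())            # {'category': 'invoice', 'score': 0.91}

    empty = ClassificationResult(scores=[], model_id="some-model", using="text")
    print(empty.category, empty.score)    # None 0.0
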
natural_pdf/collections/mixins.py CHANGED
@@ -109,3 +109,20 @@ class ApplyMixin:
             return PageCollection(results)

         return results
+
+    def filter(self: Any, predicate: Callable[[Any], bool]) -> Any:
+        """
+        Filters the collection based on a predicate function.
+
+        Args:
+            predicate: A function that takes an item and returns True if the item
+                should be included in the result, False otherwise.
+
+        Returns:
+            A new collection of the same type containing only the items
+            for which the predicate returned True.
+        """
+        items_iterable = self._get_items_for_apply()
+        filtered_items = [item for item in items_iterable if predicate(item)]
+
+        return type(self)(filtered_items)
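
The new `filter` complements `apply`: it pulls items via `_get_items_for_apply()` and rebuilds a collection of the same type from the matches. A hedged sketch, assuming `pages` is a PageCollection whose pages have already been classified (so each page carries a `category` attribute):

    # Keep only the pages classified as invoices.
    invoices = pages.filter(lambda p: getattr(p, "category", None) == "invoice")
    # `invoices` is a new PageCollection containing just the matching pages.
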
natural_pdf/collections/pdf_collection.py CHANGED
@@ -519,7 +519,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi

         return self

-    def categorize(self, categories: List[str], **kwargs):
+    def categorize(self, labels: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
         raise NotImplementedError("categorize requires classification implementation.")
@@ -570,85 +570,101 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
     # --- Classification Method --- #
     def classify_all(
         self,
-        categories: List[str],
-        model: str = "text",
+        labels: List[str],
+        using: Optional[str] = None,  # Default handled by PDF.classify -> manager
+        model: Optional[str] = None,  # Optional model ID
         max_workers: Optional[int] = None,
+        analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify all pages across all PDFs in the collection, potentially in parallel.
+        Classify each PDF document in the collection, potentially in parallel.

-        This method uses the unified `classify_all` approach, delegating page
-        classification to each PDF's `classify_pages` method.
-        It displays a progress bar tracking individual pages.
+        This method delegates classification to each PDF object's `classify` method.
+        By default, uses the full extracted text of the PDF.
+        If `using='vision'`, it classifies the first page's image, but ONLY if
+        the PDF has a single page (raises ValueError otherwise).

         Args:
-            categories: A list of string category names.
-            model: Model identifier ('text', 'vision', or specific HF ID).
+            labels: A list of string category names.
+            using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
+            model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
             max_workers: Maximum number of threads to process PDFs concurrently.
                 If None or 1, processing is sequential.
-            **kwargs: Additional arguments passed down to `pdf.classify_pages` and
-                subsequently to `page.classify` (e.g., device,
-                confidence_threshold, resolution).
+            analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
+            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
                min_confidence, multi_label, text extraction options).

         Returns:
             Self for method chaining.

         Raises:
-            ValueError: If categories list is empty.
-            ClassificationError: If classification fails for any page (will stop processing).
+            ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
+            ClassificationError: If classification fails for any PDF (will stop processing).
             ImportError: If classification dependencies are missing.
         """
         PDF = self._get_pdf_class()
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")

+        if not self._pdfs:
+            logger.warning("PDFCollection is empty, skipping classification.")
+            return self
+
+        mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
         logger.info(
-            f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
+            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
         )

-        # Calculate total pages for the progress bar
-        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
-        if total_pages == 0:
-            logger.warning("No pages found in the PDF collection to classify.")
-            return self
-
         progress_bar = tqdm(
-            total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
+            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
         )

         # Worker function
         def _process_pdf_classification(pdf: PDF):
             thread_id = threading.current_thread().name
             pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
             start_time = time.monotonic()
             try:
-                # Call classify_pages on the PDF, passing the progress callback
-                pdf.classify_pages(
-                    categories=categories,
+                # Call classify directly on the PDF object
+                pdf.classify(
+                    labels=labels,
+                    using=using,
                     model=model,
-                    progress_callback=progress_bar.update,
-                    **kwargs,
+                    analysis_key=analysis_key,
+                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
                 )
                 end_time = time.monotonic()
                 logger.debug(
-                    f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
                 )
+                progress_bar.update(1)  # Update progress bar upon success
                 return pdf_path, None  # Return path and no error
-            except Exception as e:
+            except ValueError as ve:
+                # Catch specific error for vision on multi-page PDF
                 end_time = time.monotonic()
-                # Error is logged within classify_pages, but log summary here
                 logger.error(
-                    f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
                     exc_info=False,
                 )
-                # Close progress bar immediately on error to avoid hanging
-                progress_bar.close()
+                progress_bar.update(1)  # Still update progress bar
+                return pdf_path, ve  # Return the specific ValueError
+            except Exception as e:
+                end_time = time.monotonic()
+                logger.error(
+                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=True,  # Log full traceback for unexpected errors
+                )
+                # Close progress bar immediately on critical error to avoid hanging
+                if not progress_bar.disable:
+                    progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
-                raise
+                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e

         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        processed_count = 0
+        skipped_count = 0
         try:
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
@@ -659,23 +675,39 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixi
                 for pdf in self._pdfs:
                     futures.append(executor.submit(_process_pdf_classification, pdf))

-                # Wait for all futures to complete (progress updated by callback)
-                # Exceptions are raised by future.result() if worker failed
+                # Wait for all futures to complete
+                # Progress updated within worker
                 for future in concurrent.futures.as_completed(futures):
-                    future.result()  # Raise exception if worker failed
+                    processed_count += 1
+                    pdf_path, error = (
+                        future.result()
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        # Logged in worker, just count as skipped
+                        skipped_count += 1

             else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
-                    _process_pdf_classification(pdf)
-
-            logger.info("Finished classification across the collection.")
+                    processed_count += 1
+                    pdf_path, error = _process_pdf_classification(
+                        pdf
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        skipped_count += 1
+
+            final_message = (
+                f"Finished classification across the collection. Processed: {processed_count}"
+            )
+            if skipped_count > 0:
+                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
+            logger.info(final_message + ".")

         finally:
-            # Ensure progress bar is closed even if errors occurred elsewhere
+            # Ensure progress bar is closed properly
             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                progress_bar.close()
-            elif progress_bar.disable is False:
+                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
+            if not progress_bar.disable:
                 progress_bar.close()

         return self
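
The collection-level entry point now classifies whole documents (one progress-bar tick per PDF) instead of iterating over every page, and gains `using` and `analysis_key` parameters. A hedged sketch, assuming `PDFCollection` can be built from a list of paths and iterated; the file names are hypothetical:

    from natural_pdf import PDFCollection

    pdfs = PDFCollection(["a.pdf", "b.pdf"])  # hypothetical files
    pdfs.classify_all(
        labels=["contract", "invoice", "report"],
        using="text",   # "vision" is only accepted for single-page PDFs
        max_workers=4,  # classify PDFs in parallel threads
    )
    for pdf in pdfs:
        result = pdf.analyses["classification"]
        print(pdf.path, result.category, result.score)
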
natural_pdf/core/pdf.py CHANGED
@@ -84,7 +84,7 @@ except ImportError:
 # End Deskew Imports


-class PDF(ExtractionMixin, ExportMixin):
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.

@@ -194,6 +194,7 @@ class PDF(ExtractionMixin, ExportMixin):

         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}

     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -1243,7 +1244,7 @@ class PDF(ExtractionMixin, ExportMixin):

     def classify_pages(
         self,
-        categories: List[str],
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
@@ -1254,7 +1255,7 @@ class PDF(ExtractionMixin, ExportMixin):
         Classifies specified pages of the PDF.

         Args:
-            categories: List of category names
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -1264,8 +1265,8 @@ class PDF(ExtractionMixin, ExportMixin):
         Returns:
             Self for method chaining
         """
-        if not categories:
-            raise ValueError("Categories list cannot be empty.")
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")

         try:
             manager = self.get_manager("classification")
@@ -1332,7 +1333,7 @@ class PDF(ExtractionMixin, ExportMixin):
         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-                categories=categories,
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1537,3 +1538,58 @@
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+
+    # --- Classification Mixin Implementation --- #
+
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+
+    # --- End Classification Mixin Implementation ---
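
Because `PDF` now mixes in `ClassificationMixin` and carries an `analyses` dict, a whole document can be classified directly. A minimal sketch (file name hypothetical; text mode shown because vision mode is restricted to single-page PDFs):

    from natural_pdf import PDF

    pdf = PDF("report.pdf")  # hypothetical path
    pdf.classify(labels=["contract", "invoice", "report"], using="text")
    result = pdf.analyses["classification"]
    print(result.category, result.score)

    # Per-page classification is still available and now also takes `labels`:
    pdf.classify_pages(labels=["title page", "table", "appendix"], model="text")
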
natural_pdf/elements/collections.py CHANGED
@@ -20,6 +20,7 @@ from typing import (
 )

 from pdfplumber.utils.geometry import objects_to_bbox
+from PIL import Image, ImageDraw, ImageFont

 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -1239,7 +1240,7 @@ class ElementCollection(
     # --- Classification Method --- #
     def classify_all(
         self,
-        categories: List[str],
+        labels: List[str],
         model: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -1253,7 +1254,7 @@ class ElementCollection(
         """Classifies all elements in the collection in batch.

         Args:
-            categories: List of category labels.
+            labels: List of category labels.
             model: Model ID (or alias 'text', 'vision').
             using: Optional processing mode ('text' or 'vision'). Inferred if None.
             min_confidence: Minimum confidence threshold.
@@ -1326,7 +1327,7 @@ class ElementCollection(
         # Call manager's batch classify
         batch_results: List[ClassificationResult] = manager.classify_batch(
             item_contents=items_to_classify,
-            categories=categories,
+            labels=labels,
             model_id=model,
             using=inferred_using,
             min_confidence=min_confidence,
@@ -2263,3 +2264,106 @@ class PageCollection(Generic[P], ApplyMixin):
         )

     # --- End Deskew Method --- #
+
+    def to_image(
+        self,
+        page_width: int = 300,
+        cols: Optional[int] = 4,
+        rows: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        spacing: int = 10,
+        add_labels: bool = True,
+        show_category: bool = False,  # Add new flag
+    ) -> Optional["Image.Image"]:
+        """
+        Generate a grid of page images for this collection.
+
+        Args:
+            page_width: Width in pixels for rendering individual pages
+            cols: Number of columns in grid (default: 4)
+            rows: Number of rows in grid (calculated automatically if None)
+            max_pages: Maximum number of pages to include (default: all)
+            spacing: Spacing between page thumbnails in pixels
+            add_labels: Whether to add page number labels
+            show_category: Whether to add category and confidence labels (if available)
+
+        Returns:
+            PIL Image of the page grid or None if no pages
+        """
+        if not self.pages:
+            logger.warning("Cannot generate image for empty PageCollection")
+            return None
+
+        # Limit pages if max_pages is specified
+        pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+        # Load font once outside the loop
+        font = ImageFont.load_default(16) if add_labels else None
+
+        # Render individual page images
+        page_images = []
+        for page in pages_to_render:
+            img = page.to_image(width=page_width)
+
+            # Add page number label
+            if add_labels and font:  # Check if font was loaded
+                draw = ImageDraw.Draw(img)
+                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                label_text = f"p{page.number} - {pdf_name}"
+
+                # Add category if requested and available
+                if show_category:
+                    category = getattr(page, "category", None)
+                    confidence = getattr(page, "category_confidence", None)
+                    if category is not None and confidence is not None:
+                        category_str = f"{category} {confidence:.3f}"
+                        label_text += f"\n{category_str}"
+
+                # Calculate bounding box for multi-line text
+                # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                # Use multiline_textbbox for accurate bounds with newlines
+                bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                # Add padding to the calculated bbox for the white background
+                bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                # Draw white background rectangle
+                draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                # Draw the potentially multi-line text using multiline_text
+                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+            page_images.append(img)
+
+        # Calculate grid dimensions if not provided
+        if not rows and not cols:
+            # Default to a square-ish grid
+            cols = min(4, int(len(page_images) ** 0.5) + 1)
+            rows = (len(page_images) + cols - 1) // cols
+        elif rows and not cols:
+            cols = (len(page_images) + rows - 1) // rows
+        elif cols and not rows:
+            rows = (len(page_images) + cols - 1) // cols
+
+        # Get maximum dimensions for consistent grid cells
+        max_width = max(img.width for img in page_images)
+        max_height = max(img.height for img in page_images)
+
+        # Create grid image
+        grid_width = cols * max_width + (cols + 1) * spacing
+        grid_height = rows * max_height + (rows + 1) * spacing
+        grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+        # Place images in grid
+        for i, img in enumerate(page_images):
+            if i >= rows * cols:
+                break
+
+            row = i // cols
+            col = i % cols
+
+            x = col * max_width + (col + 1) * spacing
+            y = row * max_height + (row + 1) * spacing
+
+            grid_img.paste(img, (x, y))
+
+        return grid_img
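
The new `PageCollection.to_image` helper assembles page thumbnails into one grid image and can overlay each page's classification. A hedged usage sketch, assuming `pdf.pages` is a PageCollection whose pages were classified earlier (so `category` and `category_confidence` are set):

    grid = pdf.pages.to_image(page_width=300, cols=4, show_category=True)
    if grid is not None:  # None is returned for an empty collection
        grid.save("pages_grid.png")  # a regular PIL Image, so save()/show() work as usual
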
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.9
+Version: 0.1.10
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -15,19 +15,19 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
 natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
 natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
 natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
-natural_pdf/classification/manager.py,sha256=CvZd3-lN3fEhcaLXr8gYfrdBGoBgzkIeE14EqjrOAzU,17730
-natural_pdf/classification/mixin.py,sha256=llari9AIMNGy9sTaR7y1g5vtVNUwuCutbKnjbJRMYx4,6903
-natural_pdf/classification/results.py,sha256=Ia26BQxObL5sURpFmg66bfjFPCxjcO_jeP2G-S9wRgo,2289
-natural_pdf/collections/mixins.py,sha256=ufetdzHmd2_WLGBPW4eBQrzZTFpjXyVsVwBquIE47zw,4476
-natural_pdf/collections/pdf_collection.py,sha256=JnsJugE-vxYsW1ZJWmMlVv_jbyG37X-9rZK1RQyKWAY,30020
+natural_pdf/classification/manager.py,sha256=RxJch8xVu8Me6_T2Kh7ZqUNaAKlXvfyCZD0hRc4Hk6w,17929
+natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
+natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
+natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
+natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
 natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
 natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
-natural_pdf/core/pdf.py,sha256=Vw-L5149wO6RSfvb9sAfPDLqd9M1TdYoPHNEePh65y8,61201
+natural_pdf/core/pdf.py,sha256=gOvLumJZaHXdDwpxbX9HcC_Rea4HaYMemBdYg5GX7gQ,63837
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
-natural_pdf/elements/collections.py,sha256=YRaJxNbJrBjgwzwuSoOtEotOKh6RaTi7NRCqKiGl514,92955
+natural_pdf/elements/collections.py,sha256=AN0WrrQYfCmcRS0-PHP4RQHxxdpcWnDuH2cWnmqtDE0,97184
 natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
 natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
@@ -73,8 +73,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
 natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
-natural_pdf-0.1.9.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
-natural_pdf-0.1.9.dist-info/METADATA,sha256=10GX2Qesem-n8sPem4lls2EEQen4KyJVdcmQf1mt9mI,7400
-natural_pdf-0.1.9.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
-natural_pdf-0.1.9.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
-natural_pdf-0.1.9.dist-info/RECORD,,
+natural_pdf-0.1.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.10.dist-info/METADATA,sha256=gjUsfmnbqrdiHcaH6L1qiw6VX4MBlWjVj5HqlDnhuQY,7401
+natural_pdf-0.1.10.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
+natural_pdf-0.1.10.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.10.dist-info/RECORD,,