natural-pdf 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- natural_pdf/classification/manager.py +26 -22
- natural_pdf/classification/mixin.py +7 -7
- natural_pdf/classification/results.py +17 -9
- natural_pdf/collections/mixins.py +17 -0
- natural_pdf/collections/pdf_collection.py +78 -46
- natural_pdf/core/pdf.py +62 -6
- natural_pdf/elements/collections.py +107 -3
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/RECORD +12 -12
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -0
natural_pdf/classification/manager.py CHANGED

```diff
@@ -161,7 +161,7 @@ class ClassificationManager:
     def classify_item(
         self,
         item_content: Union[str, Image.Image],
-
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -193,13 +193,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )
 
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()
         parameters = {  # Store parameters used for this run
-            "
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -214,7 +214,7 @@ class ClassificationManager:
             # Handle potential kwargs for specific pipelines if needed
             # The zero-shot pipelines expect `candidate_labels`
             result_raw = pipeline_instance(
-                item_content, candidate_labels=
+                item_content, candidate_labels=labels, multi_label=multi_label, **kwargs
             )
             logger.debug(f"Raw pipeline result: {result_raw}")
 
@@ -226,7 +226,7 @@ class ClassificationManager:
                 for label, score_val in zip(result_raw["labels"], result_raw["scores"]):
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from text pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -241,7 +241,7 @@ class ClassificationManager:
                     label = item["label"]
                     if score_val >= min_confidence:
                         try:
-                            scores_list.append(CategoryScore(label
+                            scores_list.append(CategoryScore(label, score_val))
                         except (ValueError, TypeError) as score_err:
                             logger.warning(
                                 f"Skipping invalid score from vision pipeline: label='{label}', score={score_val}. Error: {score_err}"
@@ -253,13 +253,15 @@ class ClassificationManager:
                 # Return empty result?
                 # scores_list = []
 
-
+            # ClassificationResult now calculates top score/category internally
+            result_obj = ClassificationResult(
+                scores=scores_list,  # Pass the filtered list
                 model_id=model_id,
                 using=effective_using,
-                timestamp=timestamp,
                 parameters=parameters,
-
+                timestamp=timestamp,
             )
+            return result_obj
             # --- End Processing --- #
 
         except Exception as e:
@@ -273,7 +275,7 @@ class ClassificationManager:
     def classify_batch(
         self,
         item_contents: List[Union[str, Image.Image]],
-
+        labels: List[str],
         model_id: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -307,13 +309,13 @@ class ClassificationManager:
             else self.DEFAULT_VISION_MODEL
         )
 
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         pipeline_instance = self._get_pipeline(model_id, effective_using)
         timestamp = datetime.now()  # Single timestamp for the batch run
         parameters = {  # Parameters for the whole batch
-            "
+            "labels": labels,
             "model_id": model_id,
             "using": effective_using,
             "min_confidence": min_confidence,
@@ -331,7 +333,7 @@ class ClassificationManager:
             # Use pipeline directly for batching
             results_iterator = pipeline_instance(
                 item_contents,
-                candidate_labels=
+                candidate_labels=labels,
                 multi_label=multi_label,
                 batch_size=batch_size,
                 **kwargs,
@@ -362,9 +364,7 @@ class ClassificationManager:
                     for label, score_val in zip(raw_result["labels"], raw_result["scores"]):
                         if score_val >= min_confidence:
                             try:
-                                scores_list.append(
-                                    CategoryScore(label=label, confidence=score_val)
-                                )
+                                scores_list.append(CategoryScore(label, score_val))
                             except (ValueError, TypeError) as score_err:
                                 logger.warning(
                                     f"Skipping invalid score from text pipeline batch: label='{label}', score={score_val}. Error: {score_err}"
@@ -376,9 +376,7 @@ class ClassificationManager:
                             score_val = item["score"]
                             label = item["label"]
                             if score_val >= min_confidence:
-                                scores_list.append(
-                                    CategoryScore(label=label, confidence=score_val)
-                                )
+                                scores_list.append(CategoryScore(label, score_val))
                         except (KeyError, ValueError, TypeError) as item_err:
                             logger.warning(
                                 f"Skipping invalid item in vision result list from batch: {item}. Error: {item_err}"
@@ -394,14 +392,20 @@ class ClassificationManager:
                     )
                     # scores_list remains empty for this item
 
+            # --- Determine top category and score ---
+            scores_list.sort(key=lambda s: s.score, reverse=True)
+            top_category = scores_list[0].label
+            top_score = scores_list[0].score
+            # --- End Determine top category ---
+
             # Append result object for this item
             batch_results_list.append(
                 ClassificationResult(
+                    scores=scores_list,  # Pass the full list, init will sort/filter
                     model_id=model_id,
                     using=effective_using,
                     timestamp=timestamp,  # Use same timestamp for batch
                    parameters=parameters,  # Use same params for batch
-                    scores=scores_list,
                 )
             )
             # --- End Processing --- #
```
natural_pdf/classification/mixin.py CHANGED

```diff
@@ -44,9 +44,9 @@ class ClassificationMixin:
 
     def classify(
         self,
-
-        model: Optional[str] = None,
-        using: Optional[str] = None,
+        labels: List[str],
+        model: Optional[str] = None,
+        using: Optional[str] = None,
         min_confidence: float = 0.0,
         analysis_key: str = "classification",  # Default key
         multi_label: bool = False,
@@ -60,7 +60,7 @@ class ClassificationMixin:
         result under that key.
 
         Args:
-
+            labels: A list of string category names.
             model: Model identifier (e.g., 'text', 'vision', HF ID). Defaults handled by manager.
             using: Optional processing mode ('text' or 'vision'). If None, inferred by manager.
             min_confidence: Minimum confidence threshold for results (0.0-1.0).
@@ -103,9 +103,9 @@ class ClassificationMixin:
         # Manager now returns a ClassificationResult object
         result_obj: ClassificationResult = manager.classify_item(
             item_content=content,
-
-            model_id=effective_model_id,
-            using=inferred_using,
+            labels=labels,
+            model_id=effective_model_id,
+            using=inferred_using,
             min_confidence=min_confidence,
             multi_label=multi_label,
             **kwargs,
```
natural_pdf/classification/results.py CHANGED

```diff
@@ -11,19 +11,19 @@ logger = logging.getLogger(__name__)
 class CategoryScore:
     """Represents a category and its confidence score from classification."""
 
-
+    label: str
     score: float
 
     def to_dict(self) -> Dict[str, Any]:
         """Convert to dictionary for serialization."""
-        return {"category": self.
+        return {"category": self.label, "score": self.score}
 
 
 @dataclass
 class ClassificationResult:
     """Results from a classification operation."""
 
-    category: str
+    category: Optional[str]  # Can be None if scores are empty
     score: float
     scores: List[CategoryScore]
     model_id: str
@@ -33,17 +33,25 @@ class ClassificationResult:
 
     def __init__(
         self,
-
-        score: float,
-        scores: List[CategoryScore],
+        scores: List[CategoryScore],  # Now the primary source
         model_id: str,
         using: str,
         parameters: Optional[Dict[str, Any]] = None,
         timestamp: Optional[datetime] = None,
     ):
-
-
-
+        # Determine top category and score from the scores list
+        if scores:
+            # Sort scores descending by score to find the top one
+            sorted_scores = sorted(scores, key=lambda s: s.score, reverse=True)
+            self.category = sorted_scores[0].label
+            self.score = sorted_scores[0].score
+            self.scores = sorted_scores  # Store the sorted list
+        else:
+            # Handle empty scores list
+            self.category = None
+            self.score = 0.0
+            self.scores = []  # Store empty list
+
         self.model_id = model_id
         self.using = using
         self.parameters = parameters or {}
```
natural_pdf/collections/mixins.py CHANGED

```diff
@@ -109,3 +109,20 @@ class ApplyMixin:
             return PageCollection(results)
 
         return results
+
+    def filter(self: Any, predicate: Callable[[Any], bool]) -> Any:
+        """
+        Filters the collection based on a predicate function.
+
+        Args:
+            predicate: A function that takes an item and returns True if the item
+                       should be included in the result, False otherwise.
+
+        Returns:
+            A new collection of the same type containing only the items
+            for which the predicate returned True.
+        """
+        items_iterable = self._get_items_for_apply()
+        filtered_items = [item for item in items_iterable if predicate(item)]
+
+        return type(self)(filtered_items)
```
natural_pdf/collections/pdf_collection.py CHANGED

```diff
@@ -519,7 +519,7 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
 
         return self
 
-    def categorize(self,
+    def categorize(self, labels: List[str], **kwargs):
         """Categorizes PDFs in the collection based on content or features."""
         # Implementation requires integrating with classification models or logic
         raise NotImplementedError("categorize requires classification implementation.")
@@ -570,85 +570,101 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
     # --- Classification Method --- #
     def classify_all(
         self,
-
-
+        labels: List[str],
+        using: Optional[str] = None,  # Default handled by PDF.classify -> manager
+        model: Optional[str] = None,  # Optional model ID
         max_workers: Optional[int] = None,
+        analysis_key: str = "classification",  # Key for storing result in PDF.analyses
         **kwargs,
     ) -> "PDFCollection":
         """
-        Classify
+        Classify each PDF document in the collection, potentially in parallel.
 
-        This method
-
-
+        This method delegates classification to each PDF object's `classify` method.
+        By default, uses the full extracted text of the PDF.
+        If `using='vision'`, it classifies the first page's image, but ONLY if
+        the PDF has a single page (raises ValueError otherwise).
 
         Args:
-
-
+            labels: A list of string category names.
+            using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
+            model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
             max_workers: Maximum number of threads to process PDFs concurrently.
                 If None or 1, processing is sequential.
-
-
-
+            analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
+            **kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
+                      min_confidence, multi_label, text extraction options).
 
         Returns:
             Self for method chaining.
 
         Raises:
-            ValueError: If
-            ClassificationError: If classification fails for any
+            ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
+            ClassificationError: If classification fails for any PDF (will stop processing).
             ImportError: If classification dependencies are missing.
         """
         PDF = self._get_pdf_class()
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
+        if not self._pdfs:
+            logger.warning("PDFCollection is empty, skipping classification.")
+            return self
+
+        mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
         logger.info(
-            f"Starting classification for {len(self._pdfs)} PDFs in collection (
+            f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
         )
 
-        # Calculate total pages for the progress bar
-        total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
-        if total_pages == 0:
-            logger.warning("No pages found in the PDF collection to classify.")
-            return self
-
         progress_bar = tqdm(
-            total=
+            total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
         )
 
         # Worker function
         def _process_pdf_classification(pdf: PDF):
            thread_id = threading.current_thread().name
            pdf_path = pdf.path
-            logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
+            logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
            start_time = time.monotonic()
            try:
-                # Call
-                pdf.
-
+                # Call classify directly on the PDF object
+                pdf.classify(
+                    labels=labels,
+                    using=using,
                     model=model,
-
-                    **kwargs,
+                    analysis_key=analysis_key,
+                    **kwargs,  # Pass other relevant args like min_confidence, multi_label
                 )
                 end_time = time.monotonic()
                 logger.debug(
-                    f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
+                    f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
                 )
+                progress_bar.update(1)  # Update progress bar upon success
                 return pdf_path, None  # Return path and no error
-            except
+            except ValueError as ve:
+                # Catch specific error for vision on multi-page PDF
                 end_time = time.monotonic()
-                # Error is logged within classify_pages, but log summary here
                 logger.error(
-                    f"[{thread_id}]
+                    f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
                     exc_info=False,
                 )
-                #
-
+                progress_bar.update(1)  # Still update progress bar
+                return pdf_path, ve  # Return the specific ValueError
+            except Exception as e:
+                end_time = time.monotonic()
+                logger.error(
+                    f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
+                    exc_info=True,  # Log full traceback for unexpected errors
+                )
+                # Close progress bar immediately on critical error to avoid hanging
+                if not progress_bar.disable:
+                    progress_bar.close()
                 # Re-raise the exception to stop the entire collection processing
-                raise
+                raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
 
         # Use ThreadPoolExecutor for parallel processing if max_workers > 1
+        processed_count = 0
+        skipped_count = 0
         try:
             if max_workers is not None and max_workers > 1:
                 logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
@@ -659,23 +675,39 @@ class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin):  # Add ExportMixin
                     for pdf in self._pdfs:
                         futures.append(executor.submit(_process_pdf_classification, pdf))
 
-                    # Wait for all futures to complete
-                    #
+                    # Wait for all futures to complete
+                    # Progress updated within worker
                     for future in concurrent.futures.as_completed(futures):
-
+                        processed_count += 1
+                        pdf_path, error = (
+                            future.result()
+                        )  # Raise ClassificationError if worker failed critically
+                        if isinstance(error, ValueError):
+                            # Logged in worker, just count as skipped
+                            skipped_count += 1
 
             else:  # Sequential processing
                 logger.info("Classifying PDFs sequentially.")
                 for pdf in self._pdfs:
-
-
-
+                    processed_count += 1
+                    pdf_path, error = _process_pdf_classification(
+                        pdf
+                    )  # Raise ClassificationError if worker failed critically
+                    if isinstance(error, ValueError):
+                        skipped_count += 1
+
+            final_message = (
+                f"Finished classification across the collection. Processed: {processed_count}"
+            )
+            if skipped_count > 0:
+                final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
+            logger.info(final_message + ".")
 
         finally:
-            # Ensure progress bar is closed
+            # Ensure progress bar is closed properly
             if not progress_bar.disable and progress_bar.n < progress_bar.total:
-                progress_bar.
-
+                progress_bar.n = progress_bar.total  # Ensure it reaches 100%
+            if not progress_bar.disable:
                 progress_bar.close()
 
         return self
```
natural_pdf/core/pdf.py CHANGED

```diff
@@ -84,7 +84,7 @@ except ImportError:
 # End Deskew Imports
 
 
-class PDF(ExtractionMixin, ExportMixin):
+class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
     """
     Enhanced PDF wrapper built on top of pdfplumber.
 
@@ -194,6 +194,7 @@ class PDF(ExtractionMixin, ExportMixin):
 
         self._initialize_managers()
         self._initialize_highlighter()
+        self.analyses: Dict[str, Any] = {}
 
     def _initialize_managers(self):
         """Initialize manager instances based on DEFAULT_MANAGERS."""
@@ -1243,7 +1244,7 @@ class PDF(ExtractionMixin, ExportMixin):
 
     def classify_pages(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         analysis_key: str = "classification",
@@ -1254,7 +1255,7 @@ class PDF(ExtractionMixin, ExportMixin):
         Classifies specified pages of the PDF.
 
         Args:
-
+            labels: List of category names
             model: Model identifier ('text', 'vision', or specific HF ID)
             pages: Page indices, slice, or None for all pages
             analysis_key: Key to store results in page's analyses dict
@@ -1264,8 +1265,8 @@ class PDF(ExtractionMixin, ExportMixin):
         Returns:
             Self for method chaining
         """
-        if not
-            raise ValueError("
+        if not labels:
+            raise ValueError("Labels list cannot be empty.")
 
         try:
             manager = self.get_manager("classification")
@@ -1332,7 +1333,7 @@ class PDF(ExtractionMixin, ExportMixin):
         try:
             batch_results = manager.classify_batch(
                 item_contents=page_contents,
-
+                labels=labels,
                 model_id=model,
                 using=inferred_using,
                 **kwargs,
@@ -1537,3 +1538,58 @@ class PDF(ExtractionMixin, ExportMixin):
                 raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
+
+    # --- Classification Mixin Implementation --- #
+
+    def _get_classification_manager(self) -> "ClassificationManager":
+        """Returns the ClassificationManager instance for this PDF."""
+        try:
+            return self.get_manager("classification")
+        except (KeyError, RuntimeError) as e:
+            raise AttributeError(f"Could not retrieve ClassificationManager: {e}") from e
+
+    def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, Image.Image]:
+        """
+        Provides the content for classifying the entire PDF.
+
+        Args:
+            model_type: 'text' or 'vision'.
+            **kwargs: Additional arguments (e.g., for text extraction or image rendering).
+
+        Returns:
+            Extracted text (str) or the first page's image (PIL.Image).
+
+        Raises:
+            ValueError: If model_type is 'vision' and PDF has != 1 page,
+                        or if model_type is unsupported, or if content cannot be generated.
+        """
+        if model_type == "text":
+            try:
+                # Extract text from the whole document
+                text = self.extract_text(**kwargs)  # Pass relevant kwargs
+                if not text or text.isspace():
+                    raise ValueError("PDF contains no extractable text for classification.")
+                return text
+            except Exception as e:
+                logger.error(f"Error extracting text for PDF classification: {e}")
+                raise ValueError("Failed to extract text for classification.") from e
+
+        elif model_type == "vision":
+            if len(self.pages) == 1:
+                # Use the single page's content method
+                try:
+                    return self.pages[0]._get_classification_content(model_type="vision", **kwargs)
+                except Exception as e:
+                    logger.error(f"Error getting image from single page for classification: {e}")
+                    raise ValueError("Failed to get image from single page.") from e
+            elif len(self.pages) == 0:
+                raise ValueError("Cannot classify empty PDF using vision model.")
+            else:
+                raise ValueError(
+                    f"Vision classification for a PDF object is only supported for single-page PDFs. "
+                    f"This PDF has {len(self.pages)} pages. Use pdf.pages[0].classify() or pdf.classify_pages()."
+                )
+        else:
+            raise ValueError(f"Unsupported model_type for PDF classification: {model_type}")
+
+    # --- End Classification Mixin Implementation ---
```
natural_pdf/elements/collections.py CHANGED

```diff
@@ -20,6 +20,7 @@ from typing import (
 )
 
 from pdfplumber.utils.geometry import objects_to_bbox
+from PIL import Image, ImageDraw, ImageFont
 
 # New Imports
 from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
@@ -1239,7 +1240,7 @@ class ElementCollection(
     # --- Classification Method --- #
     def classify_all(
         self,
-
+        labels: List[str],
         model: Optional[str] = None,
         using: Optional[str] = None,
         min_confidence: float = 0.0,
@@ -1253,7 +1254,7 @@ class ElementCollection(
         """Classifies all elements in the collection in batch.
 
         Args:
-
+            labels: List of category labels.
             model: Model ID (or alias 'text', 'vision').
             using: Optional processing mode ('text' or 'vision'). Inferred if None.
             min_confidence: Minimum confidence threshold.
@@ -1326,7 +1327,7 @@ class ElementCollection(
         # Call manager's batch classify
         batch_results: List[ClassificationResult] = manager.classify_batch(
             item_contents=items_to_classify,
-
+            labels=labels,
             model_id=model,
             using=inferred_using,
             min_confidence=min_confidence,
@@ -2263,3 +2264,106 @@ class PageCollection(Generic[P], ApplyMixin):
         )
 
     # --- End Deskew Method --- #
+
+    def to_image(
+        self,
+        page_width: int = 300,
+        cols: Optional[int] = 4,
+        rows: Optional[int] = None,
+        max_pages: Optional[int] = None,
+        spacing: int = 10,
+        add_labels: bool = True,
+        show_category: bool = False,  # Add new flag
+    ) -> Optional["Image.Image"]:
+        """
+        Generate a grid of page images for this collection.
+
+        Args:
+            page_width: Width in pixels for rendering individual pages
+            cols: Number of columns in grid (default: 4)
+            rows: Number of rows in grid (calculated automatically if None)
+            max_pages: Maximum number of pages to include (default: all)
+            spacing: Spacing between page thumbnails in pixels
+            add_labels: Whether to add page number labels
+            show_category: Whether to add category and confidence labels (if available)
+
+        Returns:
+            PIL Image of the page grid or None if no pages
+        """
+        if not self.pages:
+            logger.warning("Cannot generate image for empty PageCollection")
+            return None
+
+        # Limit pages if max_pages is specified
+        pages_to_render = self.pages[:max_pages] if max_pages else self.pages
+
+        # Load font once outside the loop
+        font = ImageFont.load_default(16) if add_labels else None
+
+        # Render individual page images
+        page_images = []
+        for page in pages_to_render:
+            img = page.to_image(width=page_width)
+
+            # Add page number label
+            if add_labels and font:  # Check if font was loaded
+                draw = ImageDraw.Draw(img)
+                pdf_name = Path(page.pdf.path).stem if hasattr(page, "pdf") and page.pdf else ""
+                label_text = f"p{page.number} - {pdf_name}"
+
+                # Add category if requested and available
+                if show_category:
+                    category = getattr(page, "category", None)
+                    confidence = getattr(page, "category_confidence", None)
+                    if category is not None and confidence is not None:
+                        category_str = f"{category} {confidence:.3f}"
+                        label_text += f"\n{category_str}"
+
+                # Calculate bounding box for multi-line text
+                # Use (5, 5) as top-left anchor for textbbox calculation for padding
+                # Use multiline_textbbox for accurate bounds with newlines
+                bbox = draw.multiline_textbbox((5, 5), label_text, font=font)
+                # Add padding to the calculated bbox for the white background
+                bg_rect = (bbox[0] - 2, bbox[1] - 2, bbox[2] + 2, bbox[3] + 2)
+
+                # Draw white background rectangle
+                draw.rectangle(bg_rect, fill=(255, 255, 255))
+
+                # Draw the potentially multi-line text using multiline_text
+                draw.multiline_text((5, 5), label_text, fill=(0, 0, 0), font=font)
+
+            page_images.append(img)
+
+        # Calculate grid dimensions if not provided
+        if not rows and not cols:
+            # Default to a square-ish grid
+            cols = min(4, int(len(page_images) ** 0.5) + 1)
+            rows = (len(page_images) + cols - 1) // cols
+        elif rows and not cols:
+            cols = (len(page_images) + rows - 1) // rows
+        elif cols and not rows:
+            rows = (len(page_images) + cols - 1) // cols
+
+        # Get maximum dimensions for consistent grid cells
+        max_width = max(img.width for img in page_images)
+        max_height = max(img.height for img in page_images)
+
+        # Create grid image
+        grid_width = cols * max_width + (cols + 1) * spacing
+        grid_height = rows * max_height + (rows + 1) * spacing
+        grid_img = Image.new("RGB", (grid_width, grid_height), (255, 255, 255))
+
+        # Place images in grid
+        for i, img in enumerate(page_images):
+            if i >= rows * cols:
+                break
+
+            row = i // cols
+            col = i % cols
+
+            x = col * max_width + (col + 1) * spacing
+            y = row * max_height + (row + 1) * spacing
+
+            grid_img.paste(img, (x, y))
+
+        return grid_img
```
{natural_pdf-0.1.9.dist-info → natural_pdf-0.1.10.dist-info}/RECORD CHANGED

```diff
@@ -15,19 +15,19 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
 natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
 natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
 natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
-natural_pdf/classification/manager.py,sha256=
-natural_pdf/classification/mixin.py,sha256=
-natural_pdf/classification/results.py,sha256=
-natural_pdf/collections/mixins.py,sha256=
-natural_pdf/collections/pdf_collection.py,sha256=
+natural_pdf/classification/manager.py,sha256=RxJch8xVu8Me6_T2Kh7ZqUNaAKlXvfyCZD0hRc4Hk6w,17929
+natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
+natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
+natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
+natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
 natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
 natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
 natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
 natural_pdf/core/page.py,sha256=icJLu6jRbkD3iOE8r60XPkQZ8FN3ZcKo5TT5MVGkGl0,105122
-natural_pdf/core/pdf.py,sha256=
+natural_pdf/core/pdf.py,sha256=gOvLumJZaHXdDwpxbX9HcC_Rea4HaYMemBdYg5GX7gQ,63837
 natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
 natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
-natural_pdf/elements/collections.py,sha256=
+natural_pdf/elements/collections.py,sha256=AN0WrrQYfCmcRS0-PHP4RQHxxdpcWnDuH2cWnmqtDE0,97184
 natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
 natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
 natural_pdf/elements/region.py,sha256=LfyB_9DCw5Tzn_G9xsjFz2FfKBOHRqGIND4DQWoA7KM,97324
@@ -73,8 +73,8 @@ natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7t
 natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
 natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
 natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
-natural_pdf-0.1.
+natural_pdf-0.1.10.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
+natural_pdf-0.1.10.dist-info/METADATA,sha256=gjUsfmnbqrdiHcaH6L1qiw6VX4MBlWjVj5HqlDnhuQY,7401
+natural_pdf-0.1.10.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
+natural_pdf-0.1.10.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
+natural_pdf-0.1.10.dist-info/RECORD,,
```
The WHEEL, LICENSE, and top_level.txt files are unchanged between the two versions.