valor-lite 0.33.4__py3-none-any.whl → 0.33.6__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

This version of valor-lite has been flagged as potentially problematic.

@@ -0,0 +1,191 @@
+from dataclasses import dataclass
+from enum import Enum
+
+from valor_lite.schemas import Metric
+
+
+class MetricType(Enum):
+    Counts = "Counts"
+    ROCAUC = "ROCAUC"
+    mROCAUC = "mROCAUC"
+    Precision = "Precision"
+    Recall = "Recall"
+    Accuracy = "Accuracy"
+    F1 = "F1"
+    ConfusionMatrix = "ConfusionMatrix"
+
+    @classmethod
+    def base(cls):
+        return [
+            cls.Counts,
+            cls.ROCAUC,
+            cls.mROCAUC,
+            cls.Precision,
+            cls.Recall,
+            cls.Accuracy,
+            cls.F1,
+        ]
+
+
+@dataclass
+class Counts:
+    tp: list[int]
+    fp: list[int]
+    fn: list[int]
+    tn: list[int]
+    score_thresholds: list[float]
+    hardmax: bool
+    label: tuple[str, str]
+
+    @property
+    def metric(self) -> Metric:
+        return Metric(
+            type=type(self).__name__,
+            value={
+                "tp": self.tp,
+                "fp": self.fp,
+                "fn": self.fn,
+                "tn": self.tn,
+            },
+            parameters={
+                "score_thresholds": self.score_thresholds,
+                "hardmax": self.hardmax,
+                "label": {
+                    "key": self.label[0],
+                    "value": self.label[1],
+                },
+            },
+        )
+
+    def to_dict(self) -> dict:
+        return self.metric.to_dict()
+
+
+@dataclass
+class _ThresholdValue:
+    value: list[float]
+    score_thresholds: list[float]
+    hardmax: bool
+    label: tuple[str, str]
+
+    @property
+    def metric(self) -> Metric:
+        return Metric(
+            type=type(self).__name__,
+            value=self.value,
+            parameters={
+                "score_thresholds": self.score_thresholds,
+                "hardmax": self.hardmax,
+                "label": {
+                    "key": self.label[0],
+                    "value": self.label[1],
+                },
+            },
+        )
+
+    def to_dict(self) -> dict:
+        return self.metric.to_dict()
+
+
+class Precision(_ThresholdValue):
+    pass
+
+
+class Recall(_ThresholdValue):
+    pass
+
+
+class Accuracy(_ThresholdValue):
+    pass
+
+
+class F1(_ThresholdValue):
+    pass
+
+
+@dataclass
+class ROCAUC:
+    value: float
+    label: tuple[str, str]
+
+    @property
+    def metric(self) -> Metric:
+        return Metric(
+            type=type(self).__name__,
+            value=self.value,
+            parameters={
+                "label": {
+                    "key": self.label[0],
+                    "value": self.label[1],
+                },
+            },
+        )
+
+    def to_dict(self) -> dict:
+        return self.metric.to_dict()
+
+
+@dataclass
+class mROCAUC:
+    value: float
+    label_key: str
+
+    @property
+    def metric(self) -> Metric:
+        return Metric(
+            type=type(self).__name__,
+            value=self.value,
+            parameters={
+                "label_key": self.label_key,
+            },
+        )
+
+    def to_dict(self) -> dict:
+        return self.metric.to_dict()
+
+
+@dataclass
+class ConfusionMatrix:
+    confusion_matrix: dict[
+        str,  # ground truth label value
+        dict[
+            str,  # prediction label value
+            dict[
+                str,  # either `count` or `examples`
+                int
+                | list[
+                    dict[
+                        str,  # either `datum` or `score`
+                        str | float,  # datum uid or prediction score
+                    ]
+                ],
+            ],
+        ],
+    ]
+    missing_predictions: dict[
+        str,  # ground truth label value
+        dict[
+            str,  # either `count` or `examples`
+            int | list[dict[str, str]],  # count or datum examples
+        ],
+    ]
+    score_threshold: float
+    label_key: str
+    number_of_examples: int
+
+    @property
+    def metric(self) -> Metric:
+        return Metric(
+            type=type(self).__name__,
+            value={
+                "confusion_matrix": self.confusion_matrix,
+                "missing_predictions": self.missing_predictions,
+            },
+            parameters={
+                "score_threshold": self.score_threshold,
+                "label_key": self.label_key,
+            },
+        )
+
+    def to_dict(self) -> dict:
+        return self.metric.to_dict()
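All of the new metric classes funnel through the shared `valor_lite.schemas.Metric` container via their `metric` property, so `to_dict()` output is uniform across types. A minimal serialization sketch for `Counts`, assuming `Metric.to_dict()` simply emits its `type`, `value`, and `parameters` fields (the `Metric` schema itself is not shown in this diff):

```python
# Hedged sketch; assumes `Counts` from the new module above is importable
# and that Metric.to_dict() emits `type`, `value`, and `parameters` keys.
counts = Counts(
    tp=[4], fp=[1], fn=[0], tn=[5],
    score_thresholds=[0.5],
    hardmax=True,
    label=("class", "dog"),
)
print(counts.to_dict())
# Expected shape of the output under that assumption:
# {
#     "type": "Counts",
#     "value": {"tp": [4], "fp": [1], "fn": [0], "tn": [5]},
#     "parameters": {
#         "score_thresholds": [0.5],
#         "hardmax": True,
#         "label": {"key": "class", "value": "dog"},
#     },
# }
```

Note how the `label` tuple is split into a `{"key": ..., "value": ...}` mapping during serialization, keeping the JSON shape consistent across the label-parameterized metrics.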
@@ -1,10 +1,13 @@
 from .annotation import Bitmask, BoundingBox, Detection, Polygon
 from .computation import (
-    compute_detailed_counts,
+    compute_bbox_iou,
+    compute_bitmask_iou,
+    compute_confusion_matrix,
     compute_metrics,
+    compute_polygon_iou,
     compute_ranked_pairs,
 )
-from .manager import DataLoader, Evaluator, compute_iou
+from .manager import DataLoader, Evaluator
 from .metric import (
     AP,
     AR,
@@ -12,8 +15,8 @@ from .metric import (
     Accuracy,
     APAveragedOverIOUs,
     ARAveragedOverScores,
+    ConfusionMatrix,
     Counts,
-    DetailedCounts,
     MetricType,
     Precision,
     PrecisionRecallCurve,
@@ -44,11 +47,13 @@ __all__ = [
     "ARAveragedOverScores",
     "mARAveragedOverScores",
     "PrecisionRecallCurve",
-    "DetailedCounts",
-    "compute_iou",
+    "ConfusionMatrix",
+    "compute_bbox_iou",
+    "compute_bitmask_iou",
+    "compute_polygon_iou",
    "compute_ranked_pairs",
    "compute_metrics",
-    "compute_detailed_counts",
+    "compute_confusion_matrix",
    "DataLoader",
    "Evaluator",
]
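For consumers of the package, the visible API breaks here are: `DetailedCounts` and `compute_detailed_counts` are gone in favor of `ConfusionMatrix` and `compute_confusion_matrix`, and the single `compute_iou` export is replaced by per-geometry variants. A hedged migration sketch (the `valor_lite.detection` package path is an assumption based on the relative imports above, not something this diff states):

```python
# 0.33.4 (old imports, now broken):
# from valor_lite.detection import compute_iou, compute_detailed_counts, DetailedCounts

# 0.33.6: IoU is computed per annotation type, and the detailed-counts
# API is replaced by a confusion matrix.
from valor_lite.detection import (
    ConfusionMatrix,
    compute_bbox_iou,
    compute_bitmask_iou,
    compute_confusion_matrix,
    compute_polygon_iou,
)
```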
@@ -492,13 +492,52 @@ def compute_metrics(
     )


-def compute_detailed_counts(
+def _count_with_examples(
+    data: NDArray[np.floating],
+    unique_idx: int | list[int],
+    label_idx: int | list[int],
+) -> tuple[NDArray[np.floating], NDArray[np.int32], NDArray[np.int32]]:
+    """
+    Helper function for counting occurrences of unique detailed pairs.
+
+    Parameters
+    ----------
+    data : NDArray[np.floating]
+        A masked portion of a detailed pairs array.
+    unique_idx : int | list[int]
+        The index or indices upon which uniqueness is constrained.
+    label_idx : int | list[int]
+        The index or indices within the unique index or indices that encode labels.
+
+    Returns
+    -------
+    NDArray[np.floating]
+        Examples drawn from the data input.
+    NDArray[np.int32]
+        Unique label indices.
+    NDArray[np.int32]
+        Counts for each unique label index.
+    """
+    unique_rows, indices = np.unique(
+        data.astype(int)[:, unique_idx],
+        return_index=True,
+        axis=0,
+    )
+    examples = data[indices]
+    labels, counts = np.unique(
+        unique_rows[:, label_idx], return_counts=True, axis=0
+    )
+    return examples, labels, counts
+
+
+def compute_confusion_matrix(
     data: NDArray[np.floating],
     label_metadata: NDArray[np.int32],
     iou_thresholds: NDArray[np.floating],
     score_thresholds: NDArray[np.floating],
-    n_samples: int,
-) -> NDArray[np.int32]:
+    n_examples: int,
+) -> tuple[NDArray[np.floating], NDArray[np.floating], NDArray[np.int32]]:
+
     """
     Compute detailed counts.

@@ -512,19 +551,6 @@ def compute_detailed_counts(
     Index 5 - Prediction Label Index
     Index 6 - Score

-    Outputs an array with shape (N_IoUs, N_Score, N_Labels, 5 * n_samples + 5):
-
-    Index 0 - True Positive Count
-    ... Datum ID Examples
-    Index 2 * n_samples + 1 - False Positive Misclassification Count
-    ... Datum ID Examples
-    Index 4 * n_samples + 2 - False Positive Hallucination Count
-    ... Datum ID Examples
-    Index 6 * n_samples + 3 - False Negative Misclassification Count
-    ... Datum ID Examples
-    Index 8 * n_samples + 4 - False Negative Missing Prediction Count
-    ... Datum ID Examples
-
     Parameters
     ----------
     data : NDArray[np.floating]
@@ -535,28 +561,37 @@
         A 1-D array containing IoU thresholds.
     score_thresholds : NDArray[np.floating]
         A 1-D array containing score thresholds.
-    n_samples : int
-        The number of examples to return per count.
+    n_examples : int
+        The maximum number of examples to return per count.

     Returns
     -------
+    NDArray[np.floating]
+        Confusion matrix.
+    NDArray[np.floating]
+        Hallucinations.
     NDArray[np.int32]
-        The detailed counts with optional examples.
+        Missing predictions.
     """

     n_labels = label_metadata.shape[0]
     n_ious = iou_thresholds.shape[0]
     n_scores = score_thresholds.shape[0]
-    n_metrics = 5 * (2 * n_samples + 1)
-
-    tp_idx = 0
-    fp_misclf_idx = 2 * n_samples + 1
-    fp_halluc_idx = 4 * n_samples + 2
-    fn_misclf_idx = 6 * n_samples + 3
-    fn_misprd_idx = 8 * n_samples + 4

-    detailed_pr_curve = -1 * np.ones(
-        (n_ious, n_scores, n_labels, n_metrics), dtype=np.int32
+    confusion_matrix = -1 * np.ones(
+        # (datum idx, gt idx, pd idx, pd score) * n_examples + count
+        (n_ious, n_scores, n_labels, n_labels, 4 * n_examples + 1),
+        dtype=np.float32,
+    )
+    hallucinations = -1 * np.ones(
+        # (datum idx, pd idx, pd score) * n_examples + count
+        (n_ious, n_scores, n_labels, 3 * n_examples + 1),
+        dtype=np.float32,
+    )
+    missing_predictions = -1 * np.ones(
+        # (datum idx, gt idx) * n_examples + count
+        (n_ious, n_scores, n_labels, 2 * n_examples + 1),
+        dtype=np.int32,
     )

     mask_gt_exists = data[:, 1] > -0.5
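The three accumulators above share a packed layout along the last axis: slot 0 holds the count for that cell, followed by up to `n_examples` flattened example tuples (four floats per confusion-matrix example, three per hallucination, two per missing prediction), with `-1` marking empty slots. A hedged decoding helper, not part of valor-lite, that unpacks one confusion-matrix cell under that layout:

```python
import numpy as np

def unpack_cell(cell: np.ndarray, n_examples: int) -> tuple[int, list[tuple]]:
    """Unpack one confusion-matrix cell: slot 0 is the count, then up to
    n_examples groups of (datum idx, gt idx, pd idx, pd score)."""
    count = int(cell[0])
    examples = []
    for i in range(n_examples):
        group = cell[1 + 4 * i : 5 + 4 * i]
        if group[0] < -0.5:  # -1 sentinel: no example stored in this slot
            break
        datum_idx, gt_idx, pd_idx, score = group
        examples.append((int(datum_idx), int(gt_idx), int(pd_idx), float(score)))
    return count, examples
```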
@@ -622,9 +657,9 @@
                 ~mask_groundtruths_with_passing_score & mask_gt_exists
             )

+            # create category masks
             mask_tp = mask_score & mask_iou & mask_gt_pd_match
-            mask_fp_misclf = mask_score & mask_iou & mask_gt_pd_mismatch
-            mask_fn_misclf = mask_iou & (
+            mask_misclf = mask_iou & (
                 (
                     ~mask_score
                     & mask_gt_pd_match
@@ -632,143 +667,164 @@
                 )
                 | (mask_score & mask_gt_pd_mismatch)
             )
-            mask_fp_halluc = mask_score & mask_predictions_without_passing_ious
-            mask_fn_misprd = (
+            mask_halluc = mask_score & mask_predictions_without_passing_ious
+            mask_misprd = (
                 mask_groundtruths_without_passing_ious
                 | mask_groundtruths_without_passing_score
             )

-            tp_pds = np.unique(data[mask_tp][:, [0, 2, 5]], axis=0)
-            tp_gts = np.unique(data[mask_tp][:, [0, 1, 4]], axis=0)
-            fp_misclf = np.unique(data[mask_fp_misclf][:, [0, 2, 5]], axis=0)
-            fp_halluc = np.unique(data[mask_fp_halluc][:, [0, 2, 5]], axis=0)
-            fn_misclf = np.unique(data[mask_fn_misclf][:, [0, 1, 4]], axis=0)
-            fn_misprd = np.unique(data[mask_fn_misprd][:, [0, 1, 4]], axis=0)
-
-            mask_fp_misclf_is_tp = (
-                (fp_misclf.reshape(-1, 1, 3) == tp_pds.reshape(1, -1, 3))
+            # filter out true-positives from misclf and misprd
+            mask_gts_with_tp_override = (
+                (
+                    data[mask_misclf][:, [0, 1]].reshape(-1, 1, 2)
+                    == data[mask_tp][:, [0, 1]].reshape(1, -1, 2)
+                )
                 .all(axis=2)
                 .any(axis=1)
             )
-            mask_fn_misclf_is_tp = (
-                (fn_misclf.reshape(-1, 1, 3) == tp_gts.reshape(1, -1, 3))
+            mask_pds_with_tp_override = (
+                (
+                    data[mask_misclf][:, [0, 2]].reshape(-1, 1, 2)
+                    == data[mask_tp][:, [0, 2]].reshape(1, -1, 2)
+                )
                 .all(axis=2)
                 .any(axis=1)
             )
+            mask_misprd[mask_misclf] |= (
+                ~mask_gts_with_tp_override & mask_pds_with_tp_override
+            )
+            mask_misclf[mask_misclf] &= (
+                ~mask_gts_with_tp_override & ~mask_pds_with_tp_override
+            )

-            tp = tp_pds
-            fp_misclf = fp_misclf[~mask_fp_misclf_is_tp]
-            fp_halluc = fp_halluc
-            fn_misclf = fn_misclf[~mask_fn_misclf_is_tp]
-            fn_misprd = fn_misprd
-
-            tp_count = np.bincount(tp[:, 2].astype(int), minlength=n_labels)
-            fp_misclf_count = np.bincount(
-                fp_misclf[:, 2].astype(int), minlength=n_labels
+            # count true positives
+            tp_examples, tp_labels, tp_counts = _count_with_examples(
+                data[mask_tp],
+                unique_idx=[0, 2, 5],
+                label_idx=2,
             )
-            fp_halluc_count = np.bincount(
-                fp_halluc[:, 2].astype(int), minlength=n_labels
+
+            # count misclassifications
+            (
+                misclf_examples,
+                misclf_labels,
+                misclf_counts,
+            ) = _count_with_examples(
+                data[mask_misclf], unique_idx=[0, 1, 2, 4, 5], label_idx=[3, 4]
             )
-            fn_misclf_count = np.bincount(
-                fn_misclf[:, 2].astype(int), minlength=n_labels
+
+            # count hallucinations
+            (
+                halluc_examples,
+                halluc_labels,
+                halluc_counts,
+            ) = _count_with_examples(
+                data[mask_halluc], unique_idx=[0, 2, 5], label_idx=2
             )
-            fn_misprd_count = np.bincount(
-                fn_misprd[:, 2].astype(int), minlength=n_labels
+
+            # count missing predictions
+            (
+                misprd_examples,
+                misprd_labels,
+                misprd_counts,
+            ) = _count_with_examples(
+                data[mask_misprd], unique_idx=[0, 1, 4], label_idx=2
             )

-            detailed_pr_curve[iou_idx, score_idx, :, tp_idx] = tp_count
-            detailed_pr_curve[
-                iou_idx, score_idx, :, fp_misclf_idx
-            ] = fp_misclf_count
-            detailed_pr_curve[
-                iou_idx, score_idx, :, fp_halluc_idx
-            ] = fp_halluc_count
-            detailed_pr_curve[
-                iou_idx, score_idx, :, fn_misclf_idx
-            ] = fn_misclf_count
-            detailed_pr_curve[
-                iou_idx, score_idx, :, fn_misprd_idx
-            ] = fn_misprd_count
-
-            if n_samples > 0:
+            # store the counts
+            confusion_matrix[
+                iou_idx, score_idx, tp_labels, tp_labels, 0
+            ] = tp_counts
+            confusion_matrix[
+                iou_idx,
+                score_idx,
+                misclf_labels[:, 0],
+                misclf_labels[:, 1],
+                0,
+            ] = misclf_counts
+            hallucinations[
+                iou_idx,
+                score_idx,
+                halluc_labels,
+                0,
+            ] = halluc_counts
+            missing_predictions[
+                iou_idx,
+                score_idx,
+                misprd_labels,
+                0,
+            ] = misprd_counts
+
+            # store examples
+            if n_examples > 0:
                 for label_idx in range(n_labels):
-                    tp_examples = (
-                        tp[tp[:, 2].astype(int) == label_idx][
-                            :n_samples, [0, 1]
-                        ]
-                        .astype(int)
-                        .flatten()
-                    )
-                    fp_misclf_examples = (
-                        fp_misclf[fp_misclf[:, 2].astype(int) == label_idx][
-                            :n_samples, [0, 1]
-                        ]
-                        .astype(int)
-                        .flatten()
-                    )
-                    fp_halluc_examples = (
-                        fp_halluc[fp_halluc[:, 2].astype(int) == label_idx][
-                            :n_samples, [0, 1]
-                        ]
-                        .astype(int)
-                        .flatten()
-                    )
-                    fn_misclf_examples = (
-                        fn_misclf[fn_misclf[:, 2].astype(int) == label_idx][
-                            :n_samples, [0, 1]
-                        ]
-                        .astype(int)
-                        .flatten()
-                    )
-                    fn_misprd_examples = (
-                        fn_misprd[fn_misprd[:, 2].astype(int) == label_idx][
-                            :n_samples, [0, 1]
+
+                    # true-positive examples
+                    mask_tp_label = tp_examples[:, 5] == label_idx
+                    if mask_tp_label.sum() > 0:
+                        tp_label_examples = tp_examples[mask_tp_label][
+                            :n_examples
                         ]
-                        .astype(int)
-                        .flatten()
-                    )
-
-                    detailed_pr_curve[
-                        iou_idx,
-                        score_idx,
-                        label_idx,
-                        tp_idx + 1 : tp_idx + 1 + tp_examples.shape[0],
-                    ] = tp_examples
-                    detailed_pr_curve[
-                        iou_idx,
-                        score_idx,
-                        label_idx,
-                        fp_misclf_idx
-                        + 1 : fp_misclf_idx
-                        + 1
-                        + fp_misclf_examples.shape[0],
-                    ] = fp_misclf_examples
-                    detailed_pr_curve[
-                        iou_idx,
-                        score_idx,
-                        label_idx,
-                        fp_halluc_idx
-                        + 1 : fp_halluc_idx
-                        + 1
-                        + fp_halluc_examples.shape[0],
-                    ] = fp_halluc_examples
-                    detailed_pr_curve[
-                        iou_idx,
-                        score_idx,
-                        label_idx,
-                        fn_misclf_idx
-                        + 1 : fn_misclf_idx
-                        + 1
-                        + fn_misclf_examples.shape[0],
-                    ] = fn_misclf_examples
-                    detailed_pr_curve[
-                        iou_idx,
-                        score_idx,
-                        label_idx,
-                        fn_misprd_idx
-                        + 1 : fn_misprd_idx
-                        + 1
-                        + fn_misprd_examples.shape[0],
-                    ] = fn_misprd_examples
-
-    return detailed_pr_curve
+                        confusion_matrix[
+                            iou_idx,
+                            score_idx,
+                            label_idx,
+                            label_idx,
+                            1 : 4 * tp_label_examples.shape[0] + 1,
+                        ] = tp_label_examples[:, [0, 1, 2, 6]].flatten()
+
+                    # misclassification examples
+                    mask_misclf_gt_label = misclf_examples[:, 4] == label_idx
+                    if mask_misclf_gt_label.sum() > 0:
+                        for pd_label_idx in range(n_labels):
+                            mask_misclf_pd_label = (
+                                misclf_examples[:, 5] == pd_label_idx
+                            )
+                            mask_misclf_label_combo = (
+                                mask_misclf_gt_label & mask_misclf_pd_label
+                            )
+                            if mask_misclf_label_combo.sum() > 0:
+                                misclf_label_examples = misclf_examples[
+                                    mask_misclf_label_combo
+                                ][:n_examples]
+                                confusion_matrix[
+                                    iou_idx,
+                                    score_idx,
+                                    label_idx,
+                                    pd_label_idx,
+                                    1 : 4 * misclf_label_examples.shape[0] + 1,
+                                ] = misclf_label_examples[
+                                    :, [0, 1, 2, 6]
+                                ].flatten()
+
+                    # hallucination examples
+                    mask_halluc_label = halluc_examples[:, 5] == label_idx
+                    if mask_halluc_label.sum() > 0:
+                        halluc_label_examples = halluc_examples[
+                            mask_halluc_label
+                        ][:n_examples]
+                        hallucinations[
+                            iou_idx,
+                            score_idx,
+                            label_idx,
+                            1 : 3 * halluc_label_examples.shape[0] + 1,
+                        ] = halluc_label_examples[:, [0, 2, 6]].flatten()
+
+                    # missing prediction examples
+                    mask_misprd_label = misprd_examples[:, 4] == label_idx
+                    if misprd_examples.size > 0:
+                        misprd_label_examples = misprd_examples[
+                            mask_misprd_label
+                        ][:n_examples]
+                        missing_predictions[
+                            iou_idx,
+                            score_idx,
+                            label_idx,
+                            1 : 2 * misprd_label_examples.shape[0] + 1,
+                        ] = misprd_label_examples[:, [0, 1]].flatten()
+
+    return (
+        confusion_matrix,
+        hallucinations,
+        missing_predictions,
+    )
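Taken together, `compute_confusion_matrix` now returns three arrays instead of the single packed `detailed_pr_curve`. A hedged invocation sketch follows; in practice `data` and `label_metadata` come from the `DataLoader`/`Evaluator` pipeline, the single row below follows the detailed-pairs column layout from the docstring, and `label_metadata`'s column semantics are assumed, so treat this purely as an illustration of the output shapes:

```python
import numpy as np
# from valor_lite.detection.computation import compute_confusion_matrix  # path assumed

# One illustrative detailed pair:
# (datum idx, gt idx, pd idx, IoU, gt label idx, pd label idx, score)
data = np.array([[0.0, 0.0, 0.0, 0.9, 0.0, 0.0, 0.8]])
label_metadata = np.array([[1, 1, 0]], dtype=np.int32)  # columns assumed

cm, halluc, misprd = compute_confusion_matrix(
    data=data,
    label_metadata=label_metadata,
    iou_thresholds=np.array([0.5]),
    score_thresholds=np.array([0.5]),
    n_examples=1,
)
# Per the allocations above, with n_labels=1 and n_examples=1:
#   cm.shape     == (1, 1, 1, 1, 5)  # 4 * n_examples + 1
#   halluc.shape == (1, 1, 1, 4)     # 3 * n_examples + 1
#   misprd.shape == (1, 1, 1, 3)     # 2 * n_examples + 1
```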