valor-lite 0.33.13__py3-none-any.whl → 0.33.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release

This version of valor-lite might be problematic.

@@ -1,7 +1,6 @@
-from dataclasses import dataclass
 from enum import Enum
 
-from valor_lite.schemas import Metric
+from valor_lite.schemas import BaseMetric
 
 
 class MetricType(Enum):
@@ -15,412 +14,372 @@ class MetricType(Enum):
     ConfusionMatrix = "ConfusionMatrix"
 
 
-@dataclass
-class Counts:
+class Metric(BaseMetric):
     """
-    Confusion matrix counts at specified score thresholds for binary classification.
-
-    This class stores the true positive (`tp`), false positive (`fp`), false negative (`fn`), and true
-    negative (`tn`) counts computed at various score thresholds for a binary classification task.
+    Classification Metric.
 
     Attributes
     ----------
-    tp : list[int]
-        True positive counts at each score threshold.
-    fp : list[int]
-        False positive counts at each score threshold.
-    fn : list[int]
-        False negative counts at each score threshold.
-    tn : list[int]
-        True negative counts at each score threshold.
-    score_thresholds : list[float]
-        Score thresholds at which the counts are computed.
-    hardmax : bool
-        Indicates whether hardmax thresholding was used.
-    label : str
-        The class label for which the counts are computed.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
+    type : str
+        The metric type.
+    value : int | float | dict
+        The metric value.
+    parameters : dict[str, Any]
+        A dictionary containing metric parameters.
     """
 
-    tp: list[int]
-    fp: list[int]
-    fn: list[int]
-    tn: list[int]
-    score_thresholds: list[float]
-    hardmax: bool
-    label: str
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
-            value={
-                "tp": self.tp,
-                "fp": self.fp,
-                "fn": self.fn,
-                "tn": self.tn,
-            },
+    @classmethod
+    def precision(
+        cls,
+        value: float,
+        score_threshold: float,
+        hardmax: bool,
+        label: str,
+    ):
+        """
+        Precision metric for a specific class label.
+
+        This class calculates the precision at a specific score threshold.
+        Precision is defined as the ratio of true positives to the sum of
+        true positives and false positives.
+
+        Parameters
+        ----------
+        value : float
+            Precision value computed at a specific score threshold.
+        score_threshold : float
+            Score threshold at which the precision value is computed.
+        hardmax : bool
+            Indicates whether hardmax thresholding was used.
+        label : str
+            The class label for which the precision is computed.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.Precision.value,
+            value=value,
             parameters={
-                "score_thresholds": self.score_thresholds,
-                "hardmax": self.hardmax,
-                "label": self.label,
+                "score_threshold": score_threshold,
+                "hardmax": hardmax,
+                "label": label,
             },
         )
 
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
-
-
-@dataclass
-class _ThresholdValue:
-    value: list[float]
-    score_thresholds: list[float]
-    hardmax: bool
-    label: str
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
-            value=self.value,
+    @classmethod
+    def recall(
+        cls,
+        value: float,
+        score_threshold: float,
+        hardmax: bool,
+        label: str,
+    ):
+        """
+        Recall metric for a specific class label.
+
+        This class calculates the recall at a specific score threshold.
+        Recall is defined as the ratio of true positives to the sum of
+        true positives and false negatives.
+
+        Parameters
+        ----------
+        value : float
+            Recall value computed at a specific score threshold.
+        score_threshold : float
+            Score threshold at which the recall value is computed.
+        hardmax : bool
+            Indicates whether hardmax thresholding was used.
+        label : str
+            The class label for which the recall is computed.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.Recall.value,
+            value=value,
             parameters={
-                "score_thresholds": self.score_thresholds,
-                "hardmax": self.hardmax,
-                "label": self.label,
+                "score_threshold": score_threshold,
+                "hardmax": hardmax,
+                "label": label,
             },
         )
 
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
-
-
-class Precision(_ThresholdValue):
-    """
-    Precision metric for a specific class label.
-
-    This class calculates the precision at various score thresholds for a binary
-    classification task. Precision is defined as the ratio of true positives to the
-    sum of true positives and false positives.
-
-    Attributes
-    ----------
-    value : list[float]
-        Precision values computed at each score threshold.
-    score_thresholds : list[float]
-        Score thresholds at which the precision values are computed.
-    hardmax : bool
-        Indicates whether hardmax thresholding was used.
-    label : str
-        The class label for which the precision is computed.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    pass
-
-
-class Recall(_ThresholdValue):
-    """
-    Recall metric for a specific class label.
-
-    This class calculates the recall at various score thresholds for a binary
-    classification task. Recall is defined as the ratio of true positives to the
-    sum of true positives and false negatives.
-
-    Attributes
-    ----------
-    value : list[float]
-        Recall values computed at each score threshold.
-    score_thresholds : list[float]
-        Score thresholds at which the recall values are computed.
-    hardmax : bool
-        Indicates whether hardmax thresholding was used.
-    label : str
-        The class label for which the recall is computed.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    pass
-
-
-class F1(_ThresholdValue):
-    """
-    F1 score for a specific class label.
-
-    This class calculates the F1 score at various score thresholds for a binary
-    classification task.
-
-    Attributes
-    ----------
-    value : list[float]
-        F1 scores computed at each score threshold.
-    score_thresholds : list[float]
-        Score thresholds at which the F1 scores are computed.
-    hardmax : bool
-        Indicates whether hardmax thresholding was used.
-    label : str
-        The class label for which the F1 score is computed.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    pass
-
-
-@dataclass
-class Accuracy:
-    """
-    Multiclass accuracy metric.
-
-    This class calculates the accuracy at various score thresholds.
-
-    Attributes
-    ----------
-    value : list[float]
-        Accuracy values computed at each score threshold.
-    score_thresholds : list[float]
-        Score thresholds at which the accuracy values are computed.
-    hardmax : bool
-        Indicates whether hardmax thresholding was used.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    value: list[float]
-    score_thresholds: list[float]
-    hardmax: bool
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
-            value=self.value,
+    @classmethod
+    def f1_score(
+        cls,
+        value: float,
+        score_threshold: float,
+        hardmax: bool,
+        label: str,
+    ):
+        """
+        F1 score for a specific class label and confidence score threshold.
+
+        Parameters
+        ----------
+        value : float
+            F1 score computed at a specific score threshold.
+        score_threshold : float
+            Score threshold at which the F1 score is computed.
+        hardmax : bool
+            Indicates whether hardmax thresholding was used.
+        label : str
+            The class label for which the F1 score is computed.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.F1.value,
+            value=value,
             parameters={
-                "score_thresholds": self.score_thresholds,
-                "hardmax": self.hardmax,
+                "score_threshold": score_threshold,
+                "hardmax": hardmax,
+                "label": label,
             },
         )
 
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
-
-
-@dataclass
-class ROCAUC:
-    """
-    Receiver Operating Characteristic Area Under the Curve (ROC AUC).
-
-    This class calculates the ROC AUC score for a specific class label in a multiclass classification task.
-    ROC AUC is a performance measurement for classification problems at various threshold settings.
-    It reflects the ability of the classifier to distinguish between the positive and negative classes.
-
-    Parameters
-    ----------
-    value : float
-        The computed ROC AUC score.
-    label : str
-        The class label for which the ROC AUC is computed.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    value: float
-    label: str
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
-            value=self.value,
-            parameters={"label": self.label},
+    @classmethod
+    def accuracy(
+        cls,
+        value: float,
+        score_threshold: float,
+        hardmax: bool,
+    ):
+        """
+        Multiclass accuracy metric.
+
+        This class calculates the accuracy at various score thresholds.
+
+        Parameters
+        ----------
+        value : float
+            Accuracy value computed at a specific score threshold.
+        score_threshold : float
+            Score threshold at which the accuracy value is computed.
+        hardmax : bool
+            Indicates whether hardmax thresholding was used.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.Accuracy.value,
+            value=value,
+            parameters={
+                "score_threshold": score_threshold,
+                "hardmax": hardmax,
+            },
         )
 
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
-
-
-@dataclass
-class mROCAUC:
-    """
-    Mean Receiver Operating Characteristic Area Under the Curve (mROC AUC).
-
-    This class calculates the mean ROC AUC score over all classes in a multiclass classification task.
-    It provides an aggregate measure of the model's ability to distinguish between classes.
-
-    Parameters
-    ----------
-    value : float
-        The computed mean ROC AUC score.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    value: float
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
-            value=self.value,
-            parameters={},
+    @classmethod
+    def roc_auc(
+        cls,
+        value: float,
+        label: str,
+    ):
+        """
+        Receiver Operating Characteristic Area Under the Curve (ROC AUC).
+
+        This class calculates the ROC AUC score for a specific class label in a multiclass classification task.
+        ROC AUC is a performance measurement for classification problems at various threshold settings.
+        It reflects the ability of the classifier to distinguish between the positive and negative classes.
+
+        Parameters
+        ----------
+        value : float
+            The computed ROC AUC score.
+        label : str
+            The class label for which the ROC AUC is computed.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.ROCAUC.value,
+            value=value,
+            parameters={
+                "label": label,
+            },
         )
 
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
-
-
-@dataclass
-class ConfusionMatrix:
-    """
-    The confusion matrix and related metrics for the classification task.
+    @classmethod
+    def mean_roc_auc(cls, value: float):
+        """
+        Mean Receiver Operating Characteristic Area Under the Curve (mROC AUC).
+
+        This class calculates the mean ROC AUC score over all classes in a multiclass classification task.
+        It provides an aggregate measure of the model's ability to distinguish between classes.
+
+        Parameters
+        ----------
+        value : float
+            The computed mean ROC AUC score.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(type=MetricType.mROCAUC.value, value=value, parameters={})
+
+    @classmethod
+    def counts(
+        cls,
+        tp: int,
+        fp: int,
+        fn: int,
+        tn: int,
+        score_threshold: float,
+        hardmax: bool,
+        label: str,
+    ):
+        """
+        Confusion matrix counts at specified score thresholds for binary classification.
+
+        This class stores the true positive (`tp`), false positive (`fp`), false negative (`fn`), and true
+        negative (`tn`) counts computed at various score thresholds for a binary classification task.
+
+        Parameters
+        ----------
+        tp : int
+            True positive counts at each score threshold.
+        fp : int
+            False positive counts at each score threshold.
+        fn : int
+            False negative counts at each score threshold.
+        tn : int
+            True negative counts at each score threshold.
+        score_threshold : float
+            Score thresholds at which the counts are computed.
+        hardmax : bool
+            Indicates whether hardmax thresholding was used.
+        label : str
+            The class label for which the counts are computed.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.Counts.value,
+            value={
+                "tp": tp,
+                "fp": fp,
+                "fn": fn,
+                "tn": tn,
+            },
+            parameters={
+                "score_threshold": score_threshold,
+                "hardmax": hardmax,
+                "label": label,
+            },
+        )
 
-    This class encapsulates detailed information about the model's performance, including correct
-    predictions, misclassifications, hallucinations (false positives), and missing predictions
-    (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+    @classmethod
+    def confusion_matrix(
+        cls,
+        confusion_matrix: dict[
+            str,  # ground truth label value
+            dict[
+                str,  # prediction label value
+                dict[
+                    str,  # either `count` or `examples`
+                    int
+                    | list[
+                        dict[
+                            str,  # either `datum` or `score`
+                            str | float,  # datum uid  # prediction score
+                        ]
+                    ],
+                ],
+            ],
+        ],
+        missing_predictions: dict[
+            str,  # ground truth label value
+            dict[
+                str,  # either `count` or `examples`
+                int | list[dict[str, str]],  # count or datum examples
+            ],
+        ],
+        score_threshold: float,
+        maximum_number_of_examples: int,
+    ):
+        """
+        The confusion matrix and related metrics for the classification task.
+
+        This class encapsulates detailed information about the model's performance, including correct
+        predictions, misclassifications, hallucinations (false positives), and missing predictions
+        (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+
+        Confusion Matrix Structure:
+        {
+            ground_truth_label: {
+                predicted_label: {
+                    'count': int,
+                    'examples': [
+                        {
+                            'datum': str,
+                            'groundtruth': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
+                            'prediction': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
+                            'score': float,
+                        },
+                        ...
+                    ],
+                },
+                ...
+            },
+            ...
+        }
 
-    Confusion Matrix Structure:
-    {
-        ground_truth_label: {
-            predicted_label: {
+        Missing Prediction Structure:
+        {
+            ground_truth_label: {
                 'count': int,
                 'examples': [
                     {
                         'datum': str,
                         'groundtruth': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
-                        'prediction': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
-                        'score': float,
                     },
                     ...
                 ],
             },
             ...
-        },
-        ...
-    }
-
-    Hallucinations Structure:
-    {
-        prediction_label: {
-            'count': int,
-            'examples': [
-                {
-                    'datum': str,
-                    'prediction': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
-                    'score': float,
-                },
-                ...
-            ],
-        },
-        ...
-    }
-
-    Missing Prediction Structure:
-    {
-        ground_truth_label: {
-            'count': int,
-            'examples': [
-                {
-                    'datum': str,
-                    'groundtruth': dict,  # {'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float}
-                },
-                ...
-            ],
-        },
-        ...
-    }
-
-    Attributes
-    ----------
-    confusion_matrix : dict
-        A nested dictionary where the first key is the ground truth label value, the second key
-        is the prediction label value, and the innermost dictionary contains either a `count`
-        or a list of `examples`. Each example includes the datum UID and prediction score.
-    missing_predictions : dict
-        A dictionary where each key is a ground truth label value for which the model failed to predict
-        (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
-        Each example includes the datum UID.
-    score_threshold : float
-        The confidence score threshold used to filter predictions.
-    number_of_examples : int
-        The maximum number of examples per element.
-
-    Methods
-    -------
-    to_metric()
-        Converts the instance to a generic `Metric` object.
-    to_dict()
-        Converts the instance to a dictionary representation.
-    """
-
-    confusion_matrix: dict[
-        str,  # ground truth label value
-        dict[
-            str,  # prediction label value
-            dict[
-                str,  # either `count` or `examples`
-                int
-                | list[
-                    dict[
-                        str,  # either `datum` or `score`
-                        str | float,  # datum uid  # prediction score
-                    ]
-                ],
-            ],
-        ],
-    ]
-    missing_predictions: dict[
-        str,  # ground truth label value
-        dict[
-            str,  # either `count` or `examples`
-            int | list[dict[str, str]],  # count or datum examples
-        ],
-    ]
-    score_threshold: float
-    number_of_examples: int
-
-    def to_metric(self) -> Metric:
-        return Metric(
-            type=type(self).__name__,
+        }
+
+        Parameters
+        ----------
+        confusion_matrix : dict
+            A nested dictionary where the first key is the ground truth label value, the second key
+            is the prediction label value, and the innermost dictionary contains either a `count`
+            or a list of `examples`. Each example includes the datum UID and prediction score.
+        missing_predictions : dict
+            A dictionary where each key is a ground truth label value for which the model failed to predict
+            (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
+            Each example includes the datum UID.
+        score_threshold : float
+            The confidence score threshold used to filter predictions.
+        maximum_number_of_examples : int
+            The maximum number of examples per element.
+
+        Returns
+        -------
+        Metric
+        """
+        return cls(
+            type=MetricType.ConfusionMatrix.value,
             value={
-                "confusion_matrix": self.confusion_matrix,
-                "missing_predictions": self.missing_predictions,
+                "confusion_matrix": confusion_matrix,
+                "missing_predictions": missing_predictions,
             },
             parameters={
-                "score_threshold": self.score_threshold,
+                "score_threshold": score_threshold,
+                "maximum_number_of_examples": maximum_number_of_examples,
             },
         )
-
-    def to_dict(self) -> dict:
-        return self.to_metric().to_dict()
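For downstream code, the practical effect of this release is that the per-metric dataclasses (Counts, Precision, Recall, F1, Accuracy, ROCAUC, mROCAUC, ConfusionMatrix) and their to_metric()/to_dict() helpers are replaced by factory classmethods on a single Metric(BaseMetric) class, with one Metric instance per score threshold instead of lists indexed by threshold. The sketch below shows how calling code might adapt; the import path and the availability of a to_dict() serializer on BaseMetric are assumptions for illustration, not details confirmed by this diff.

    # Hypothetical migration sketch -- import path and to_dict() are assumed.
    from valor_lite.classification import Metric, MetricType

    # 0.33.13 style (removed): one dataclass per metric, values listed per threshold, e.g.
    #   Precision(value=[0.9, 0.8], score_thresholds=[0.25, 0.75], hardmax=True, label="dog")
    # 0.33.14 style (added): one Metric per metric type and score threshold.
    metric = Metric.precision(
        value=0.9,
        score_threshold=0.25,
        hardmax=True,
        label="dog",
    )

    # The factory records the metric type and moves the inputs into `parameters`.
    assert metric.type == MetricType.Precision.value
    print(metric.to_dict())  # assumes BaseMetric keeps a to_dict() helper like the old Metric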