valor-lite 0.33.7__py3-none-any.whl → 0.33.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- valor_lite/LICENSE +21 -0
- valor_lite/classification/annotation.py +30 -2
- valor_lite/classification/computation.py +31 -52
- valor_lite/classification/manager.py +230 -323
- valor_lite/classification/metric.py +273 -50
- valor_lite/object_detection/annotation.py +274 -0
- valor_lite/{detection → object_detection}/computation.py +130 -92
- valor_lite/{detection → object_detection}/manager.py +425 -769
- valor_lite/object_detection/metric.py +789 -0
- valor_lite/semantic_segmentation/__init__.py +27 -0
- valor_lite/semantic_segmentation/annotation.py +96 -0
- valor_lite/semantic_segmentation/computation.py +186 -0
- valor_lite/semantic_segmentation/manager.py +549 -0
- valor_lite/semantic_segmentation/metric.py +278 -0
- valor_lite/text_generation/__init__.py +0 -0
- valor_lite-0.33.9.dist-info/METADATA +179 -0
- valor_lite-0.33.9.dist-info/RECORD +24 -0
- valor_lite/detection/annotation.py +0 -98
- valor_lite/detection/metric.py +0 -408
- valor_lite-0.33.7.dist-info/METADATA +0 -41
- valor_lite-0.33.7.dist-info/RECORD +0 -17
- /valor_lite/{detection → object_detection}/__init__.py +0 -0
- {valor_lite-0.33.7.dist-info → valor_lite-0.33.9.dist-info}/LICENSE +0 -0
- {valor_lite-0.33.7.dist-info → valor_lite-0.33.9.dist-info}/WHEEL +0 -0
- {valor_lite-0.33.7.dist-info → valor_lite-0.33.9.dist-info}/top_level.txt +0 -0
valor_lite/classification/manager.py (0.33.7 → 0.33.9)

```diff
@@ -67,13 +67,8 @@ class Evaluator:
         self.index_to_uid: dict[int, str] = dict()

         # label reference
-        self.label_to_index: dict[tuple[str, str], int] = dict()
-        self.index_to_label: dict[int, tuple[str, str]] = dict()
-
-        # label key reference
-        self.index_to_label_key: dict[int, str] = dict()
-        self.label_key_to_index: dict[str, int] = dict()
-        self.label_index_to_label_key_index: dict[int, int] = dict()
+        self.label_to_index: dict[str, int] = dict()
+        self.index_to_label: dict[int, str] = dict()

         # computation caches
         self._detailed_pairs = np.array([])
```
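The theme of this hunk and the ones that follow: classification labels drop the `(key, value)` tuple model, so the parallel label-key lookup tables disappear from `Evaluator`. A minimal sketch of the difference; the label names here are invented for illustration:

```python
# 0.33.7: labels were (key, value) tuples tracked alongside a label-key index.
label_to_index_old: dict[tuple[str, str], int] = {("class", "dog"): 0}
label_key_to_index_old: dict[str, int] = {"class": 0}

# 0.33.9: a label is just a string, and the key tables are gone.
label_to_index_new: dict[str, int] = {"dog": 0}
```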
```diff
@@ -81,7 +76,7 @@ class Evaluator:
         self._label_metadata_per_datum = np.array([], dtype=np.int32)

     @property
-    def ignored_prediction_labels(self) -> list[tuple[str, str]]:
+    def ignored_prediction_labels(self) -> list[str]:
         """
         Prediction labels that are not present in the ground truth set.
         """
```
```diff
@@ -92,7 +87,7 @@ class Evaluator:
         ]

     @property
-    def missing_prediction_labels(self) -> list[tuple[str, str]]:
+    def missing_prediction_labels(self) -> list[str]:
         """
         Ground truth labels that are not present in the prediction set.
         """
```
```diff
@@ -119,8 +114,7 @@ class Evaluator:
     def create_filter(
         self,
         datum_uids: list[str] | NDArray[np.int32] | None = None,
-        labels: list[tuple[str, str]] | NDArray[np.int32] | None = None,
-        label_keys: list[str] | NDArray[np.int32] | None = None,
+        labels: list[str] | NDArray[np.int32] | None = None,
     ) -> Filter:
         """
         Creates a boolean mask that can be passed to an evaluation.
```
```diff
@@ -129,10 +123,8 @@ class Evaluator:
         ----------
         datum_uids : list[str] | NDArray[np.int32], optional
             An optional list of string uids or a numpy array of uid indices.
-        labels : list[tuple[str, str]] | NDArray[np.int32], optional
+        labels : list[str] | NDArray[np.int32], optional
             An optional list of labels or a numpy array of label indices.
-        label_keys : list[str] | NDArray[np.int32], optional
-            An optional list of label keys or a numpy array of label key indices.

         Returns
         -------
```
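With label keys gone, `create_filter` narrows to two axes: datum uids and labels. A usage sketch under the new signature, assuming `evaluator` is a finalized `Evaluator`; the uid and label values are invented for illustration:

```python
# Filter down to two datums and two labels, then evaluate on the subset.
# Both arguments also accept numpy index arrays (NDArray[np.int32]).
filter_ = evaluator.create_filter(
    datum_uids=["uid0", "uid1"],
    labels=["dog", "cat"],
)
metrics = evaluator.compute_precision_recall(filter_=filter_)
```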
```diff
@@ -179,36 +171,18 @@ class Evaluator:
             mask[labels] = True
             mask_labels &= mask

-        if label_keys is not None:
-            if isinstance(label_keys, list):
-                label_keys = np.array(
-                    [self.label_key_to_index[key] for key in label_keys]
-                )
-            label_indices = np.where(
-                np.isclose(self._label_metadata[:, 2], label_keys)
-            )[0]
-            mask = np.zeros_like(mask_pairs, dtype=np.bool_)
-            mask[
-                np.isin(self._detailed_pairs[:, 1].astype(int), label_indices)
-            ] = True
-            mask_pairs &= mask
-
-            mask = np.zeros_like(mask_labels, dtype=np.bool_)
-            mask[label_indices] = True
-            mask_labels &= mask
-
         mask = mask_datums[:, np.newaxis] & mask_labels[np.newaxis, :]
         label_metadata_per_datum = self._label_metadata_per_datum.copy()
         label_metadata_per_datum[:, ~mask] = 0

         label_metadata = np.zeros_like(self._label_metadata, dtype=np.int32)
-        label_metadata[:, :2] = np.transpose(
+        label_metadata = np.transpose(
             np.sum(
                 label_metadata_per_datum,
                 axis=1,
             )
         )
-
+
         n_datums = int(np.sum(label_metadata[:, 0]))

         return Filter(
```
```diff
@@ -217,12 +191,117 @@ class Evaluator:
             n_datums=n_datums,
         )

-    def evaluate(
+    def _unpack_confusion_matrix(
+        self,
+        confusion_matrix: NDArray[np.float64],
+        number_of_labels: int,
+        number_of_examples: int,
+    ) -> dict[
+        str,
+        dict[
+            str,
+            dict[
+                str,
+                int
+                | list[
+                    dict[
+                        str,
+                        str | float,
+                    ]
+                ],
+            ],
+        ],
+    ]:
+        """
+        Unpacks a numpy array of confusion matrix counts and examples.
+        """
+
+        datum_idx = lambda gt_label_idx, pd_label_idx, example_idx: int(  # noqa: E731 - lambda fn
+            confusion_matrix[
+                gt_label_idx,
+                pd_label_idx,
+                example_idx * 2 + 1,
+            ]
+        )
+
+        score_idx = lambda gt_label_idx, pd_label_idx, example_idx: float(  # noqa: E731 - lambda fn
+            confusion_matrix[
+                gt_label_idx,
+                pd_label_idx,
+                example_idx * 2 + 2,
+            ]
+        )
+
+        return {
+            self.index_to_label[gt_label_idx]: {
+                self.index_to_label[pd_label_idx]: {
+                    "count": max(
+                        int(confusion_matrix[gt_label_idx, pd_label_idx, 0]),
+                        0,
+                    ),
+                    "examples": [
+                        {
+                            "datum": self.index_to_uid[
+                                datum_idx(
+                                    gt_label_idx, pd_label_idx, example_idx
+                                )
+                            ],
+                            "score": score_idx(
+                                gt_label_idx, pd_label_idx, example_idx
+                            ),
+                        }
+                        for example_idx in range(number_of_examples)
+                        if datum_idx(gt_label_idx, pd_label_idx, example_idx)
+                        >= 0
+                    ],
+                }
+                for pd_label_idx in range(number_of_labels)
+            }
+            for gt_label_idx in range(number_of_labels)
+        }
+
+    def _unpack_missing_predictions(
+        self,
+        missing_predictions: NDArray[np.int32],
+        number_of_labels: int,
+        number_of_examples: int,
+    ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
+        """
+        Unpacks a numpy array of missing prediction counts and examples.
+        """
+
+        datum_idx = (
+            lambda gt_label_idx, example_idx: int(  # noqa: E731 - lambda fn
+                missing_predictions[
+                    gt_label_idx,
+                    example_idx + 1,
+                ]
+            )
+        )
+
+        return {
+            self.index_to_label[gt_label_idx]: {
+                "count": max(
+                    int(missing_predictions[gt_label_idx, 0]),
+                    0,
+                ),
+                "examples": [
+                    {
+                        "datum": self.index_to_uid[
+                            datum_idx(gt_label_idx, example_idx)
+                        ]
+                    }
+                    for example_idx in range(number_of_examples)
+                    if datum_idx(gt_label_idx, example_idx) >= 0
+                ],
+            }
+            for gt_label_idx in range(number_of_labels)
+        }
+
+    def compute_precision_recall(
         self,
-        metrics_to_return: list[MetricType] = MetricType.base(),
         score_thresholds: list[float] = [0.0],
         hardmax: bool = True,
-        number_of_examples: int = 0,
         filter_: Filter | None = None,
         as_dict: bool = False,
     ) -> dict[MetricType, list]:
```
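The two unpacking helpers added above decode a packed layout: for each (ground truth, prediction) cell, column 0 holds the count and the remaining columns hold interleaved (datum index, score) example slots, with negative datum indices marking unused slots. Read back out, the nested dictionaries look roughly like this; labels, uids, and numbers are invented for illustration:

```python
confusion_matrix = {
    "dog": {                    # ground truth label
        "cat": {                # predicted label
            "count": 2,         # cell count, clamped at >= 0
            "examples": [
                # up to number_of_examples entries per cell
                {"datum": "uid42", "score": 0.71},
            ],
        },
    },
}
missing_predictions = {
    "dog": {"count": 1, "examples": [{"datum": "uid7"}]},
}
```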
```diff
@@ -231,14 +310,10 @@ class Evaluator:

         Parameters
         ----------
-        metrics_to_return : list[MetricType]
-            A list of metrics to return in the results.
         score_thresholds : list[float]
             A list of score thresholds to compute metrics over.
         hardmax : bool
             Toggles whether a hardmax is applied to predictions.
-        number_of_examples : int, default=0
-            Maximum number of annotation examples to return in ConfusionMatrix.
         filter_ : Filter, optional
             An optional filter object.
         as_dict : bool, default=False
```
```diff
@@ -279,7 +354,7 @@ class Evaluator:

         metrics[MetricType.ROCAUC] = [
             ROCAUC(
-                value=rocauc[label_idx],
+                value=float(rocauc[label_idx]),
                 label=self.index_to_label[label_idx],
             )
             for label_idx in range(label_metadata.shape[0])
```
```diff
@@ -288,10 +363,8 @@ class Evaluator:

         metrics[MetricType.mROCAUC] = [
             mROCAUC(
-                value=mean_rocauc[label_key_idx],
-                label_key=self.index_to_label_key[label_key_idx],
+                value=float(mean_rocauc),
             )
-            for label_key_idx in range(len(self.label_key_to_index))
         ]

         for label_idx, label in self.index_to_label.items():
```
```diff
@@ -304,10 +377,10 @@ class Evaluator:
             row = counts[:, label_idx]
             metrics[MetricType.Counts].append(
                 Counts(
-                    tp=row[:, 0].tolist(),
-                    fp=row[:, 1].tolist(),
-                    fn=row[:, 2].tolist(),
-                    tn=row[:, 3].tolist(),
+                    tp=row[:, 0].astype(int).tolist(),
+                    fp=row[:, 1].astype(int).tolist(),
+                    fn=row[:, 2].astype(int).tolist(),
+                    tn=row[:, 3].astype(int).tolist(),
                     **kwargs,
                 )
             )
```
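The added `.astype(int)`/`.astype(float)` casts are about serialization rather than math: the counts array is evidently float-typed (hence the cast), and `tolist()` on it would yield Python floats where integer counts are expected downstream. A quick illustration of the difference:

```python
import numpy as np

row = np.array([3.0, 1.0, 0.0, 5.0])  # counts stored in a float array
print(row.tolist())                    # [3.0, 1.0, 0.0, 5.0] -- floats
print(row.astype(int).tolist())        # [3, 1, 0, 5] -- plain ints
```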
```diff
@@ -318,44 +391,29 @@ class Evaluator:

             metrics[MetricType.Precision].append(
                 Precision(
-                    value=precision[:, label_idx].tolist(),
+                    value=precision[:, label_idx].astype(float).tolist(),
                     **kwargs,
                 )
             )
             metrics[MetricType.Recall].append(
                 Recall(
-                    value=recall[:, label_idx].tolist(),
+                    value=recall[:, label_idx].astype(float).tolist(),
                     **kwargs,
                 )
             )
             metrics[MetricType.Accuracy].append(
                 Accuracy(
-                    value=accuracy[:, label_idx].tolist(),
+                    value=accuracy[:, label_idx].astype(float).tolist(),
                     **kwargs,
                 )
             )
             metrics[MetricType.F1].append(
                 F1(
-                    value=f1_score[:, label_idx].tolist(),
+                    value=f1_score[:, label_idx].astype(float).tolist(),
                     **kwargs,
                 )
             )

-        if MetricType.ConfusionMatrix in metrics_to_return:
-            metrics[
-                MetricType.ConfusionMatrix
-            ] = self._compute_confusion_matrix(
-                data=data,
-                label_metadata=label_metadata,
-                score_thresholds=score_thresholds,
-                hardmax=hardmax,
-                number_of_examples=number_of_examples,
-            )
-
-        for metric in set(metrics.keys()):
-            if metric not in metrics_to_return:
-                del metrics[metric]
-
         if as_dict:
             return {
                 mtype: [metric.to_dict() for metric in mvalues]
```
```diff
@@ -364,157 +422,43 @@ class Evaluator:

         return metrics

-    def _unpack_confusion_matrix(
-        self,
-        confusion_matrix: NDArray[np.floating],
-        label_key_idx: int,
-        number_of_labels: int,
-        number_of_examples: int,
-    ) -> dict[
-        str,
-        dict[
-            str,
-            dict[
-                str,
-                int
-                | list[
-                    dict[
-                        str,
-                        str | float,
-                    ]
-                ],
-            ],
-        ],
-    ]:
-        """
-        Unpacks a numpy array of confusion matrix counts and examples.
-        """
-
-        datum_idx = lambda gt_label_idx, pd_label_idx, example_idx: int(  # noqa: E731 - lambda fn
-            confusion_matrix[
-                gt_label_idx,
-                pd_label_idx,
-                example_idx * 2 + 1,
-            ]
-        )
-
-        score_idx = lambda gt_label_idx, pd_label_idx, example_idx: float(  # noqa: E731 - lambda fn
-            confusion_matrix[
-                gt_label_idx,
-                pd_label_idx,
-                example_idx * 2 + 2,
-            ]
-        )
-
-        return {
-            self.index_to_label[gt_label_idx][1]: {
-                self.index_to_label[pd_label_idx][1]: {
-                    "count": max(
-                        int(confusion_matrix[gt_label_idx, pd_label_idx, 0]),
-                        0,
-                    ),
-                    "examples": [
-                        {
-                            "datum": self.index_to_uid[
-                                datum_idx(
-                                    gt_label_idx, pd_label_idx, example_idx
-                                )
-                            ],
-                            "score": score_idx(
-                                gt_label_idx, pd_label_idx, example_idx
-                            ),
-                        }
-                        for example_idx in range(number_of_examples)
-                        if datum_idx(gt_label_idx, pd_label_idx, example_idx)
-                        >= 0
-                    ],
-                }
-                for pd_label_idx in range(number_of_labels)
-                if (
-                    self.label_index_to_label_key_index[pd_label_idx]
-                    == label_key_idx
-                )
-            }
-            for gt_label_idx in range(number_of_labels)
-            if (
-                self.label_index_to_label_key_index[gt_label_idx]
-                == label_key_idx
-            )
-        }
-
-    def _unpack_missing_predictions(
+    def compute_confusion_matrix(
         self,
-        missing_predictions: NDArray[np.int32],
-        label_key_idx: int,
-        number_of_labels: int,
-        number_of_examples: int,
-    ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
-        """
-        Unpacks a numpy array of missing prediction counts and examples.
-        """
-
-        datum_idx = (
-            lambda gt_label_idx, example_idx: int(  # noqa: E731 - lambda fn
-                missing_predictions[
-                    gt_label_idx,
-                    example_idx + 1,
-                ]
-            )
-        )
-
-        return {
-            self.index_to_label[gt_label_idx][1]: {
-                "count": max(
-                    int(missing_predictions[gt_label_idx, 0]),
-                    0,
-                ),
-                "examples": [
-                    {
-                        "datum": self.index_to_uid[
-                            datum_idx(gt_label_idx, example_idx)
-                        ]
-                    }
-                    for example_idx in range(number_of_examples)
-                    if datum_idx(gt_label_idx, example_idx) >= 0
-                ],
-            }
-            for gt_label_idx in range(number_of_labels)
-            if (
-                self.label_index_to_label_key_index[gt_label_idx]
-                == label_key_idx
-            )
-        }
-
-    def _compute_confusion_matrix(
-        self,
-        data: NDArray[np.floating],
-        label_metadata: NDArray[np.int32],
-        score_thresholds: list[float],
-        hardmax: bool,
-        number_of_examples: int,
-    ) -> list[ConfusionMatrix]:
+        score_thresholds: list[float] = [0.0],
+        hardmax: bool = True,
+        number_of_examples: int = 0,
+        filter_: Filter | None = None,
+        as_dict: bool = False,
+    ) -> list:
         """
         Computes a detailed confusion matrix..

         Parameters
         ----------
-        data : NDArray[np.floating]
-            A data array containing classification pairs.
-        label_metadata : NDArray[np.int32]
-            An integer array containing label metadata.
         score_thresholds : list[float]
             A list of score thresholds to compute metrics over.
         hardmax : bool
             Toggles whether a hardmax is applied to predictions.
         number_of_examples : int, default=0
             The number of examples to return per count.
+        filter_ : Filter, optional
+            An optional filter object.
+        as_dict : bool, default=False
+            An option to return metrics as dictionaries.

         Returns
         -------
-        list[ConfusionMatrix]
-            A list of
+        list[ConfusionMatrix] | list[dict]
+            A list of confusion matrices.
         """

+        # apply filters
+        data = self._detailed_pairs
+        label_metadata = self._label_metadata
+        if filter_ is not None:
+            data = data[filter_.indices]
+            label_metadata = filter_.label_metadata
+
         if data.size == 0:
             return list()

```
```diff
@@ -527,28 +471,74 @@ class Evaluator:
         )

         n_scores, n_labels, _, _ = confusion_matrix.shape
-        return [
+        results = [
             ConfusionMatrix(
                 score_threshold=score_thresholds[score_idx],
-                label_key=label_key,
                 number_of_examples=number_of_examples,
                 confusion_matrix=self._unpack_confusion_matrix(
                     confusion_matrix=confusion_matrix[score_idx, :, :, :],
-                    label_key_idx=label_key_idx,
                     number_of_labels=n_labels,
                     number_of_examples=number_of_examples,
                 ),
                 missing_predictions=self._unpack_missing_predictions(
                     missing_predictions=missing_predictions[score_idx, :, :],
-                    label_key_idx=label_key_idx,
                     number_of_labels=n_labels,
                     number_of_examples=number_of_examples,
                 ),
             )
-            for label_key_idx, label_key in self.index_to_label_key.items()
             for score_idx in range(n_scores)
         ]

+        if as_dict:
+            return [m.to_dict() for m in results]
+
+        return results
+
+    def evaluate(
+        self,
+        score_thresholds: list[float] = [0.0],
+        hardmax: bool = True,
+        number_of_examples: int = 0,
+        filter_: Filter | None = None,
+        as_dict: bool = False,
+    ) -> dict[MetricType, list]:
+        """
+        Computes a detailed confusion matrix..
+
+        Parameters
+        ----------
+        score_thresholds : list[float]
+            A list of score thresholds to compute metrics over.
+        hardmax : bool
+            Toggles whether a hardmax is applied to predictions.
+        number_of_examples : int, default=0
+            The number of examples to return per count.
+        filter_ : Filter, optional
+            An optional filter object.
+        as_dict : bool, default=False
+            An option to return metrics as dictionaries.
+
+        Returns
+        -------
+        list[ConfusionMatrix] | list[dict]
+            A list of confusion matrices.
+        """
+
+        results = self.compute_precision_recall(
+            score_thresholds=score_thresholds,
+            hardmax=hardmax,
+            filter_=filter_,
+            as_dict=as_dict,
+        )
+        results[MetricType.ConfusionMatrix] = self.compute_confusion_matrix(
+            score_thresholds=score_thresholds,
+            hardmax=hardmax,
+            number_of_examples=number_of_examples,
+            filter_=filter_,
+            as_dict=as_dict,
+        )
+        return results
+

 class DataLoader:
     """
```
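The removed `metrics_to_return` plumbing is replaced by composition: the new `evaluate` simply merges `compute_precision_recall` with `compute_confusion_matrix`. A usage sketch, assuming `evaluator` is a finalized `Evaluator` and `MetricType` is imported from the package's classification metric module:

```python
metrics = evaluator.evaluate(
    score_thresholds=[0.25, 0.5, 0.75],
    hardmax=True,
    number_of_examples=2,
    as_dict=True,
)
precision = metrics[MetricType.Precision]
confusion = metrics[MetricType.ConfusionMatrix]
```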
```diff
@@ -580,77 +570,50 @@ class DataLoader:
             self._evaluator.index_to_uid[index] = uid
         return self._evaluator.uid_to_index[uid]

-    def _add_label(self, label: tuple[str, str]) -> tuple[int, int]:
+    def _add_label(self, label: str) -> int:
         """
         Helper function for adding a label to the cache.

         Parameters
         ----------
-        label : tuple[str, str]
-
+        label : str
+            A string representing a label.

         Returns
         -------
         int
             Label index.
-        int
-            Label key index.
         """
         label_id = len(self._evaluator.index_to_label)
-        label_key_id = len(self._evaluator.index_to_label_key)
         if label not in self._evaluator.label_to_index:
             self._evaluator.label_to_index[label] = label_id
             self._evaluator.index_to_label[label_id] = label

-            # update label key index
-            if label[0] not in self._evaluator.label_key_to_index:
-                self._evaluator.label_key_to_index[label[0]] = label_key_id
-                self._evaluator.index_to_label_key[label_key_id] = label[0]
-                label_key_id += 1
-
-            self._evaluator.label_index_to_label_key_index[
-                label_id
-            ] = self._evaluator.label_key_to_index[label[0]]
             label_id += 1

-        return (
-            self._evaluator.label_to_index[label],
-            self._evaluator.label_key_to_index[label[0]],
-        )
+        return self._evaluator.label_to_index[label]

     def _add_data(
         self,
         uid_index: int,
-        keyed_groundtruths: dict[int, int],
-        keyed_predictions: dict[int, list[tuple[int, float]]],
+        groundtruth: int,
+        predictions: list[tuple[int, float]],
     ):
-        gt_keys = set(keyed_groundtruths.keys())
-        pd_keys = set(keyed_predictions.keys())
-        joint_keys = gt_keys.intersection(pd_keys)
-
-        gt_unique_keys = gt_keys - pd_keys
-        pd_unique_keys = pd_keys - gt_keys
-        if gt_unique_keys or pd_unique_keys:
-            raise ValueError(
-                "Label keys must match between ground truths and predictions."
-            )

         pairs = list()
-        for key in joint_keys:
-
-
-
-
-
-
-            (
-
-
-
-                float(score),
-                float(max_score_idx == idx),
-            )
+        scores = np.array([score for _, score in predictions])
+        max_score_idx = np.argmax(scores)
+
+        for idx, (plabel, score) in enumerate(predictions):
+            pairs.append(
+                (
+                    float(uid_index),
+                    float(groundtruth),
+                    float(plabel),
+                    float(score),
+                    float(max_score_idx == idx),
                 )
+            )

         if self._evaluator._detailed_pairs.size == 0:
             self._evaluator._detailed_pairs = np.array(pairs)
```
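Each detailed pair is now a flat five-float row built per prediction; the last column records whether that prediction won the hardmax for its datum. Schematically, with column meanings taken from the tuple constructed above and illustrative values:

```python
# (datum index, ground truth label index, predicted label index,
#  prediction score, 1.0 if this prediction is the datum's argmax else 0.0)
pair = (0.0, 1.0, 2.0, 0.85, 1.0)
```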
```diff
@@ -682,27 +645,29 @@ class DataLoader:
         disable_tqdm = not show_progress
         for classification in tqdm(classifications, disable=disable_tqdm):

+            if len(classification.predictions) == 0:
+                raise ValueError(
+                    "Classifications must contain at least one prediction."
+                )
             # update metadata
             self._evaluator.n_datums += 1
-            self._evaluator.n_groundtruths += len(classification.groundtruths)
+            self._evaluator.n_groundtruths += 1
             self._evaluator.n_predictions += len(classification.predictions)

             # update datum uid index
             uid_index = self._add_datum(uid=classification.uid)

             # cache labels and annotations
-
-
-
-
-
-
-            for idx, (plabel, pscore) in enumerate(
-                zip(classification.predictions, classification.scores)
+            groundtruth = self._add_label(classification.groundtruth)
+            self.groundtruth_count[groundtruth][uid_index] += 1
+
+            predictions = list()
+            for plabel, pscore in zip(
+                classification.predictions, classification.scores
             ):
-                label_idx, label_key_idx = self._add_label(plabel)
+                label_idx = self._add_label(plabel)
                 self.prediction_count[label_idx][uid_index] += 1
-                keyed_predictions[label_key_idx].append(
+                predictions.append(
                     (
                         label_idx,
                         pscore,
```
```diff
@@ -711,65 +676,8 @@ class DataLoader:

         self._add_data(
             uid_index=uid_index,
-            keyed_groundtruths=keyed_groundtruths,
-            keyed_predictions=keyed_predictions,
-        )
-
-    def add_data_from_valor_dict(
-        self,
-        classifications: list[tuple[dict, dict]],
-        show_progress: bool = False,
-    ):
-        """
-        Adds Valor-format classifications to the cache.
-
-        Parameters
-        ----------
-        classifications : list[tuple[dict, dict]]
-            A list of groundtruth, prediction pairs in Valor-format dictionaries.
-        show_progress : bool, default=False
-            Toggle for tqdm progress bar.
-        """
-
-        disable_tqdm = not show_progress
-        for groundtruth, prediction in tqdm(
-            classifications, disable=disable_tqdm
-        ):
-
-            # update metadata
-            self._evaluator.n_datums += 1
-            self._evaluator.n_groundtruths += len(groundtruth["annotations"])
-            self._evaluator.n_predictions += len(prediction["annotations"])
-
-            # update datum uid index
-            uid_index = self._add_datum(uid=groundtruth["datum"]["uid"])
-
-            # cache labels and annotations
-            keyed_groundtruths = defaultdict(int)
-            keyed_predictions = defaultdict(list)
-            for gann in groundtruth["annotations"]:
-                for valor_label in gann["labels"]:
-                    glabel = (valor_label["key"], valor_label["value"])
-                    label_idx, label_key_idx = self._add_label(glabel)
-                    self.groundtruth_count[label_idx][uid_index] += 1
-                    keyed_groundtruths[label_key_idx] = label_idx
-            for pann in prediction["annotations"]:
-                for valor_label in pann["labels"]:
-                    plabel = (valor_label["key"], valor_label["value"])
-                    pscore = valor_label["score"]
-                    label_idx, label_key_idx = self._add_label(plabel)
-                    self.prediction_count[label_idx][uid_index] += 1
-                    keyed_predictions[label_key_idx].append(
-                        (
-                            label_idx,
-                            pscore,
-                        )
-                    )
-
-            self._add_data(
-                uid_index=uid_index,
-                keyed_groundtruths=keyed_groundtruths,
-                keyed_predictions=keyed_predictions,
+            groundtruth=groundtruth,
+            predictions=predictions,
         )

     def finalize(self) -> Evaluator:
```
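`add_data_from_valor_dict` is removed outright, so Valor-format dictionaries now have to be flattened into string labels before ingestion. A rough migration sketch; the `Classification` fields mirror the attribute accesses in this diff, but the import path and the enclosing method name `add_data` are assumptions, since that hunk starts mid-method:

```python
from valor_lite.classification import Classification, DataLoader  # path assumed

loader = DataLoader()
loader.add_data(
    [
        Classification(
            uid="uid0",
            groundtruth="dog",            # was ("class", "dog") in 0.33.7
            predictions=["dog", "cat"],
            scores=[0.9, 0.1],
        )
    ]
)
evaluator = loader.finalize()
```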
```diff
@@ -822,7 +730,6 @@ class DataLoader:
                         1, :, label_idx
                     ]
                 ),
-                self._evaluator.label_index_to_label_key_index[label_idx],
             ]
             for label_idx in range(n_labels)
         ],
```