valor-lite 0.33.8__py3-none-any.whl → 0.33.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of valor-lite might be problematic.

valor_lite/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Striveworks
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -3,6 +3,30 @@ from dataclasses import dataclass
 
  @dataclass
  class Classification:
+ """
+ Classification data structure containing a ground truth label and a list of predictions.
+
+ Parameters
+ ----------
+ uid : str
+ Unique identifier for the instance.
+ groundtruth : str
+ The true label for the instance.
+ predictions : list of str
+ List of predicted labels.
+ scores : list of float
+ Confidence scores corresponding to each predicted label.
+
+ Examples
+ --------
+ >>> classification = Classification(
+ ... uid='123',
+ ... groundtruth='cat',
+ ... predictions=['cat', 'dog', 'bird'],
+ ... scores=[0.9, 0.05, 0.05]
+ ... )
+ """
+
  uid: str
  groundtruth: str
  predictions: list[str]
@@ -191,12 +191,117 @@ class Evaluator:
  n_datums=n_datums,
  )
 
- def evaluate(
+ def _unpack_confusion_matrix(
+ self,
+ confusion_matrix: NDArray[np.float64],
+ number_of_labels: int,
+ number_of_examples: int,
+ ) -> dict[
+ str,
+ dict[
+ str,
+ dict[
+ str,
+ int
+ | list[
+ dict[
+ str,
+ str | float,
+ ]
+ ],
+ ],
+ ],
+ ]:
+ """
+ Unpacks a numpy array of confusion matrix counts and examples.
+ """
+
+ datum_idx = lambda gt_label_idx, pd_label_idx, example_idx: int( # noqa: E731 - lambda fn
+ confusion_matrix[
+ gt_label_idx,
+ pd_label_idx,
+ example_idx * 2 + 1,
+ ]
+ )
+
+ score_idx = lambda gt_label_idx, pd_label_idx, example_idx: float( # noqa: E731 - lambda fn
+ confusion_matrix[
+ gt_label_idx,
+ pd_label_idx,
+ example_idx * 2 + 2,
+ ]
+ )
+
+ return {
+ self.index_to_label[gt_label_idx]: {
+ self.index_to_label[pd_label_idx]: {
+ "count": max(
+ int(confusion_matrix[gt_label_idx, pd_label_idx, 0]),
+ 0,
+ ),
+ "examples": [
+ {
+ "datum": self.index_to_uid[
+ datum_idx(
+ gt_label_idx, pd_label_idx, example_idx
+ )
+ ],
+ "score": score_idx(
+ gt_label_idx, pd_label_idx, example_idx
+ ),
+ }
+ for example_idx in range(number_of_examples)
+ if datum_idx(gt_label_idx, pd_label_idx, example_idx)
+ >= 0
+ ],
+ }
+ for pd_label_idx in range(number_of_labels)
+ }
+ for gt_label_idx in range(number_of_labels)
+ }
+
+ def _unpack_missing_predictions(
+ self,
+ missing_predictions: NDArray[np.int32],
+ number_of_labels: int,
+ number_of_examples: int,
+ ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
+ """
+ Unpacks a numpy array of missing prediction counts and examples.
+ """
+
+ datum_idx = (
+ lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn
+ missing_predictions[
+ gt_label_idx,
+ example_idx + 1,
+ ]
+ )
+ )
+
+ return {
+ self.index_to_label[gt_label_idx]: {
+ "count": max(
+ int(missing_predictions[gt_label_idx, 0]),
+ 0,
+ ),
+ "examples": [
+ {
+ "datum": self.index_to_uid[
+ datum_idx(gt_label_idx, example_idx)
+ ]
+ }
+ for example_idx in range(number_of_examples)
+ if datum_idx(gt_label_idx, example_idx) >= 0
+ ],
+ }
+ for gt_label_idx in range(number_of_labels)
+ }
+
+ def compute_precision_recall(
  self,
- metrics_to_return: list[MetricType] = MetricType.base(),
  score_thresholds: list[float] = [0.0],
  hardmax: bool = True,
- number_of_examples: int = 0,
  filter_: Filter | None = None,
  as_dict: bool = False,
  ) -> dict[MetricType, list]:
@@ -205,14 +310,10 @@ class Evaluator:
 
  Parameters
  ----------
- metrics_to_return : list[MetricType]
- A list of metrics to return in the results.
  score_thresholds : list[float]
  A list of score thresholds to compute metrics over.
  hardmax : bool
  Toggles whether a hardmax is applied to predictions.
- number_of_examples : int, default=0
- Maximum number of annotation examples to return in ConfusionMatrix.
  filter_ : Filter, optional
  An optional filter object.
  as_dict : bool, default=False
@@ -253,7 +354,7 @@ class Evaluator:
 
  metrics[MetricType.ROCAUC] = [
  ROCAUC(
- value=rocauc[label_idx],
+ value=float(rocauc[label_idx]),
  label=self.index_to_label[label_idx],
  )
  for label_idx in range(label_metadata.shape[0])
@@ -262,7 +363,7 @@ class Evaluator:
 
  metrics[MetricType.mROCAUC] = [
  mROCAUC(
- value=mean_rocauc,
+ value=float(mean_rocauc),
  )
  ]
 
@@ -276,10 +377,10 @@ class Evaluator:
  row = counts[:, label_idx]
  metrics[MetricType.Counts].append(
  Counts(
- tp=row[:, 0].tolist(),
- fp=row[:, 1].tolist(),
- fn=row[:, 2].tolist(),
- tn=row[:, 3].tolist(),
+ tp=row[:, 0].astype(int).tolist(),
+ fp=row[:, 1].astype(int).tolist(),
+ fn=row[:, 2].astype(int).tolist(),
+ tn=row[:, 3].astype(int).tolist(),
  **kwargs,
  )
  )
@@ -290,44 +391,29 @@ class Evaluator:
 
  metrics[MetricType.Precision].append(
  Precision(
- value=precision[:, label_idx].tolist(),
+ value=precision[:, label_idx].astype(float).tolist(),
  **kwargs,
  )
  )
  metrics[MetricType.Recall].append(
  Recall(
- value=recall[:, label_idx].tolist(),
+ value=recall[:, label_idx].astype(float).tolist(),
  **kwargs,
  )
  )
  metrics[MetricType.Accuracy].append(
  Accuracy(
- value=accuracy[:, label_idx].tolist(),
+ value=accuracy[:, label_idx].astype(float).tolist(),
  **kwargs,
  )
  )
  metrics[MetricType.F1].append(
  F1(
- value=f1_score[:, label_idx].tolist(),
+ value=f1_score[:, label_idx].astype(float).tolist(),
  **kwargs,
  )
  )
 
- if MetricType.ConfusionMatrix in metrics_to_return:
- metrics[
- MetricType.ConfusionMatrix
- ] = self._compute_confusion_matrix(
- data=data,
- label_metadata=label_metadata,
- score_thresholds=score_thresholds,
- hardmax=hardmax,
- number_of_examples=number_of_examples,
- )
-
- for metric in set(metrics.keys()):
- if metric not in metrics_to_return:
- del metrics[metric]
-
  if as_dict:
  return {
  mtype: [metric.to_dict() for metric in mvalues]
@@ -336,143 +422,43 @@ class Evaluator:
 
  return metrics
 
- def _unpack_confusion_matrix(
+ def compute_confusion_matrix(
  self,
- confusion_matrix: NDArray[np.float64],
- number_of_labels: int,
- number_of_examples: int,
- ) -> dict[
- str,
- dict[
- str,
- dict[
- str,
- int
- | list[
- dict[
- str,
- str | float,
- ]
- ],
- ],
- ],
- ]:
- """
- Unpacks a numpy array of confusion matrix counts and examples.
- """
-
- datum_idx = lambda gt_label_idx, pd_label_idx, example_idx: int( # noqa: E731 - lambda fn
- confusion_matrix[
- gt_label_idx,
- pd_label_idx,
- example_idx * 2 + 1,
- ]
- )
-
- score_idx = lambda gt_label_idx, pd_label_idx, example_idx: float( # noqa: E731 - lambda fn
- confusion_matrix[
- gt_label_idx,
- pd_label_idx,
- example_idx * 2 + 2,
- ]
- )
-
- return {
- self.index_to_label[gt_label_idx]: {
- self.index_to_label[pd_label_idx]: {
- "count": max(
- int(confusion_matrix[gt_label_idx, pd_label_idx, 0]),
- 0,
- ),
- "examples": [
- {
- "datum": self.index_to_uid[
- datum_idx(
- gt_label_idx, pd_label_idx, example_idx
- )
- ],
- "score": score_idx(
- gt_label_idx, pd_label_idx, example_idx
- ),
- }
- for example_idx in range(number_of_examples)
- if datum_idx(gt_label_idx, pd_label_idx, example_idx)
- >= 0
- ],
- }
- for pd_label_idx in range(number_of_labels)
- }
- for gt_label_idx in range(number_of_labels)
- }
-
- def _unpack_missing_predictions(
- self,
- missing_predictions: NDArray[np.int32],
- number_of_labels: int,
- number_of_examples: int,
- ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
- """
- Unpacks a numpy array of missing prediction counts and examples.
- """
-
- datum_idx = (
- lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn
- missing_predictions[
- gt_label_idx,
- example_idx + 1,
- ]
- )
- )
-
- return {
- self.index_to_label[gt_label_idx]: {
- "count": max(
- int(missing_predictions[gt_label_idx, 0]),
- 0,
- ),
- "examples": [
- {
- "datum": self.index_to_uid[
- datum_idx(gt_label_idx, example_idx)
- ]
- }
- for example_idx in range(number_of_examples)
- if datum_idx(gt_label_idx, example_idx) >= 0
- ],
- }
- for gt_label_idx in range(number_of_labels)
- }
-
- def _compute_confusion_matrix(
- self,
- data: NDArray[np.float64],
- label_metadata: NDArray[np.int32],
- score_thresholds: list[float],
- hardmax: bool,
- number_of_examples: int,
- ) -> list[ConfusionMatrix]:
+ score_thresholds: list[float] = [0.0],
+ hardmax: bool = True,
+ number_of_examples: int = 0,
+ filter_: Filter | None = None,
+ as_dict: bool = False,
+ ) -> list:
  """
  Computes a detailed confusion matrix..
 
  Parameters
  ----------
- data : NDArray[np.float64]
- A data array containing classification pairs.
- label_metadata : NDArray[np.int32]
- An integer array containing label metadata.
  score_thresholds : list[float]
  A list of score thresholds to compute metrics over.
  hardmax : bool
  Toggles whether a hardmax is applied to predictions.
  number_of_examples : int, default=0
  The number of examples to return per count.
+ filter_ : Filter, optional
+ An optional filter object.
+ as_dict : bool, default=False
+ An option to return metrics as dictionaries.
 
  Returns
  -------
- list[ConfusionMatrix]
- A list of ConfusionMatrix objects.
+ list[ConfusionMatrix] | list[dict]
+ A list of confusion matrices.
  """
 
+ # apply filters
+ data = self._detailed_pairs
+ label_metadata = self._label_metadata
+ if filter_ is not None:
+ data = data[filter_.indices]
+ label_metadata = filter_.label_metadata
+
  if data.size == 0:
  return list()
 
@@ -485,7 +471,7 @@ class Evaluator:
  )
 
  n_scores, n_labels, _, _ = confusion_matrix.shape
- return [
+ results = [
  ConfusionMatrix(
  score_threshold=score_thresholds[score_idx],
  number_of_examples=number_of_examples,
@@ -503,6 +489,56 @@ class Evaluator:
  for score_idx in range(n_scores)
  ]
 
+ if as_dict:
+ return [m.to_dict() for m in results]
+
+ return results
+
+ def evaluate(
+ self,
+ score_thresholds: list[float] = [0.0],
+ hardmax: bool = True,
+ number_of_examples: int = 0,
+ filter_: Filter | None = None,
+ as_dict: bool = False,
+ ) -> dict[MetricType, list]:
+ """
+ Computes a detailed confusion matrix..
+
+ Parameters
+ ----------
+ score_thresholds : list[float]
+ A list of score thresholds to compute metrics over.
+ hardmax : bool
+ Toggles whether a hardmax is applied to predictions.
+ number_of_examples : int, default=0
+ The number of examples to return per count.
+ filter_ : Filter, optional
+ An optional filter object.
+ as_dict : bool, default=False
+ An option to return metrics as dictionaries.
+
+ Returns
+ -------
+ list[ConfusionMatrix] | list[dict]
+ A list of confusion matrices.
+ """
+
+ results = self.compute_precision_recall(
+ score_thresholds=score_thresholds,
+ hardmax=hardmax,
+ filter_=filter_,
+ as_dict=as_dict,
+ )
+ results[MetricType.ConfusionMatrix] = self.compute_confusion_matrix(
+ score_thresholds=score_thresholds,
+ hardmax=hardmax,
+ number_of_examples=number_of_examples,
+ filter_=filter_,
+ as_dict=as_dict,
+ )
+ return results
+
 
  class DataLoader:
  """
@@ -644,70 +680,6 @@ class DataLoader:
  predictions=predictions,
  )
 
- def add_data_from_valor_dict(
- self,
- classifications: list[tuple[dict, dict]],
- show_progress: bool = False,
- ):
- """
- Adds Valor-format classifications to the cache.
-
- Parameters
- ----------
- classifications : list[tuple[dict, dict]]
- A list of groundtruth, prediction pairs in Valor-format dictionaries.
- show_progress : bool, default=False
- Toggle for tqdm progress bar.
- """
-
- disable_tqdm = not show_progress
- for groundtruth, prediction in tqdm(
- classifications, disable=disable_tqdm
- ):
-
- # update metadata
- self._evaluator.n_datums += 1
- self._evaluator.n_groundtruths += len(groundtruth["annotations"])
- self._evaluator.n_predictions += len(prediction["annotations"])
-
- # update datum uid index
- uid_index = self._add_datum(uid=groundtruth["datum"]["uid"])
-
- # cache labels and annotations
- predictions = list()
- groundtruths = None
- for gann in groundtruth["annotations"]:
- for valor_label in gann["labels"]:
- glabel = f'{valor_label["key"]}_{valor_label["value"]}'
- label_idx = self._add_label(glabel)
- self.groundtruth_count[label_idx][uid_index] += 1
- groundtruths = label_idx
- for pann in prediction["annotations"]:
- for valor_label in pann["labels"]:
- plabel = f'{valor_label["key"]}_{valor_label["value"]}'
- pscore = valor_label["score"]
- label_idx = self._add_label(plabel)
- self.prediction_count[label_idx][uid_index] += 1
- predictions.append(
- (
- label_idx,
- pscore,
- )
- )
-
- # fix type error where groundtruths can possibly be unbound now that it's a float
- # in practice, this error should never be hit since groundtruths can't be empty without throwing a ValueError earlier in the flow
- if groundtruths is None:
- raise ValueError(
- "Expected a value for groundtruths, but got None."
- )
-
- self._add_data(
- uid_index=uid_index,
- groundtruth=groundtruths,
- predictions=predictions,
- )
-
  def finalize(self) -> Evaluator:
  """
  Performs data finalization and some preprocessing steps.
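The final hunk removes `DataLoader.add_data_from_valor_dict`, so Valor-format dictionaries are no longer ingested directly. Below is a hedged migration sketch, assuming the loader's remaining `add_data` method accepts `Classification` objects (the dataclass documented earlier in this diff); the helper `to_classification`, the `valor_pairs` variable, and the `loader.add_data(...)` call are illustrative assumptions, not shown in this diff:

    from valor_lite.classification import Classification

    def to_classification(groundtruth: dict, prediction: dict) -> Classification:
        # flatten Valor "key"/"value" labels into the "key_value" strings used by the removed method
        gt_labels = [
            f'{label["key"]}_{label["value"]}'
            for ann in groundtruth["annotations"]
            for label in ann["labels"]
        ]
        if not gt_labels:
            raise ValueError("Expected at least one groundtruth label.")
        pd_labels, scores = [], []
        for ann in prediction["annotations"]:
            for label in ann["labels"]:
                pd_labels.append(f'{label["key"]}_{label["value"]}')
                scores.append(label["score"])
        return Classification(
            uid=groundtruth["datum"]["uid"],
            groundtruth=gt_labels[-1],  # the removed method kept the last groundtruth label seen
            predictions=pd_labels,
            scores=scores,
        )

    # loader.add_data([to_classification(gt, pd) for gt, pd in valor_pairs])  # assumed loader API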