valor-lite 0.33.18__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (126)
  1. {valor_lite-0.33.18/valor_lite.egg-info → valor_lite-0.34.0}/PKG-INFO +14 -31
  2. {valor_lite-0.33.18 → valor_lite-0.34.0}/pyproject.toml +13 -7
  3. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/computation.py +11 -9
  4. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/manager.py +3 -3
  5. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/metric.py +6 -6
  6. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/utilities.py +11 -8
  7. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/computation.py +30 -26
  8. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/manager.py +5 -3
  9. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/metric.py +12 -12
  10. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/utilities.py +24 -23
  11. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/computation.py +9 -9
  12. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/manager.py +2 -1
  13. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/metric.py +10 -10
  14. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/utilities.py +7 -6
  15. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/computation.py +1 -0
  16. {valor_lite-0.33.18 → valor_lite-0.34.0/valor_lite.egg-info}/PKG-INFO +14 -31
  17. valor_lite-0.34.0/valor_lite.egg-info/SOURCES.txt +42 -0
  18. valor_lite-0.34.0/valor_lite.egg-info/requires.txt +28 -0
  19. valor_lite-0.33.18/benchmarks/.gitignore +0 -2
  20. valor_lite-0.33.18/benchmarks/benchmark_classification.py +0 -272
  21. valor_lite-0.33.18/benchmarks/benchmark_objdet.py +0 -410
  22. valor_lite-0.33.18/benchmarks/synthetic/benchmark_semantic_segmentation.py +0 -94
  23. valor_lite-0.33.18/examples/.gitignore +0 -1
  24. valor_lite-0.33.18/examples/benchmarking.ipynb +0 -279
  25. valor_lite-0.33.18/examples/object-detection.ipynb +0 -3023
  26. valor_lite-0.33.18/examples/tabular_classification.ipynb +0 -633
  27. valor_lite-0.33.18/examples/text_generation.ipynb +0 -500
  28. valor_lite-0.33.18/tests/classification/conftest.py +0 -331
  29. valor_lite-0.33.18/tests/classification/test_accuracy.py +0 -226
  30. valor_lite-0.33.18/tests/classification/test_confusion_matrix.py +0 -618
  31. valor_lite-0.33.18/tests/classification/test_counts.py +0 -1318
  32. valor_lite-0.33.18/tests/classification/test_dataloader.py +0 -18
  33. valor_lite-0.33.18/tests/classification/test_evaluator.py +0 -64
  34. valor_lite-0.33.18/tests/classification/test_f1.py +0 -394
  35. valor_lite-0.33.18/tests/classification/test_filtering.py +0 -570
  36. valor_lite-0.33.18/tests/classification/test_metric.py +0 -39
  37. valor_lite-0.33.18/tests/classification/test_precision.py +0 -392
  38. valor_lite-0.33.18/tests/classification/test_recall.py +0 -327
  39. valor_lite-0.33.18/tests/classification/test_rocauc.py +0 -396
  40. valor_lite-0.33.18/tests/classification/test_schemas.py +0 -30
  41. valor_lite-0.33.18/tests/classification/test_stability.py +0 -70
  42. valor_lite-0.33.18/tests/object_detection/__init__.py +0 -0
  43. valor_lite-0.33.18/tests/object_detection/conftest.py +0 -1123
  44. valor_lite-0.33.18/tests/object_detection/test_accuracy.py +0 -483
  45. valor_lite-0.33.18/tests/object_detection/test_average_precision.py +0 -854
  46. valor_lite-0.33.18/tests/object_detection/test_average_recall.py +0 -604
  47. valor_lite-0.33.18/tests/object_detection/test_confusion_matrix.py +0 -1789
  48. valor_lite-0.33.18/tests/object_detection/test_counts.py +0 -601
  49. valor_lite-0.33.18/tests/object_detection/test_dataloader.py +0 -144
  50. valor_lite-0.33.18/tests/object_detection/test_evaluator.py +0 -130
  51. valor_lite-0.33.18/tests/object_detection/test_f1.py +0 -470
  52. valor_lite-0.33.18/tests/object_detection/test_filtering.py +0 -586
  53. valor_lite-0.33.18/tests/object_detection/test_iou.py +0 -371
  54. valor_lite-0.33.18/tests/object_detection/test_metric.py +0 -39
  55. valor_lite-0.33.18/tests/object_detection/test_pr_curve.py +0 -234
  56. valor_lite-0.33.18/tests/object_detection/test_precision.py +0 -470
  57. valor_lite-0.33.18/tests/object_detection/test_recall.py +0 -469
  58. valor_lite-0.33.18/tests/object_detection/test_schemas.py +0 -160
  59. valor_lite-0.33.18/tests/object_detection/test_stability.py +0 -103
  60. valor_lite-0.33.18/tests/semantic_segmentation/__init__.py +0 -0
  61. valor_lite-0.33.18/tests/semantic_segmentation/conftest.py +0 -132
  62. valor_lite-0.33.18/tests/semantic_segmentation/test_accuracy.py +0 -67
  63. valor_lite-0.33.18/tests/semantic_segmentation/test_annotation.py +0 -136
  64. valor_lite-0.33.18/tests/semantic_segmentation/test_confusion_matrix.py +0 -153
  65. valor_lite-0.33.18/tests/semantic_segmentation/test_dataloader.py +0 -8
  66. valor_lite-0.33.18/tests/semantic_segmentation/test_evaluator.py +0 -66
  67. valor_lite-0.33.18/tests/semantic_segmentation/test_f1.py +0 -94
  68. valor_lite-0.33.18/tests/semantic_segmentation/test_filtering.py +0 -75
  69. valor_lite-0.33.18/tests/semantic_segmentation/test_iou.py +0 -125
  70. valor_lite-0.33.18/tests/semantic_segmentation/test_metric.py +0 -39
  71. valor_lite-0.33.18/tests/semantic_segmentation/test_precision.py +0 -96
  72. valor_lite-0.33.18/tests/semantic_segmentation/test_recall.py +0 -94
  73. valor_lite-0.33.18/tests/semantic_segmentation/test_stability.py +0 -85
  74. valor_lite-0.33.18/tests/text_generation/__init__.py +0 -0
  75. valor_lite-0.33.18/tests/text_generation/conftest.py +0 -240
  76. valor_lite-0.33.18/tests/text_generation/llm/__init__.py +0 -0
  77. valor_lite-0.33.18/tests/text_generation/llm/test_generation.py +0 -140
  78. valor_lite-0.33.18/tests/text_generation/llm/test_integrations.py +0 -364
  79. valor_lite-0.33.18/tests/text_generation/llm/test_utilities.py +0 -83
  80. valor_lite-0.33.18/tests/text_generation/llm/test_validators.py +0 -107
  81. valor_lite-0.33.18/tests/text_generation/metrics/test_answer_correctness.py +0 -87
  82. valor_lite-0.33.18/tests/text_generation/metrics/test_answer_relevance.py +0 -109
  83. valor_lite-0.33.18/tests/text_generation/metrics/test_bias.py +0 -104
  84. valor_lite-0.33.18/tests/text_generation/metrics/test_context_precision.py +0 -195
  85. valor_lite-0.33.18/tests/text_generation/metrics/test_context_recall.py +0 -194
  86. valor_lite-0.33.18/tests/text_generation/metrics/test_context_relevance.py +0 -135
  87. valor_lite-0.33.18/tests/text_generation/metrics/test_faithfulness.py +0 -155
  88. valor_lite-0.33.18/tests/text_generation/metrics/test_hallucination.py +0 -118
  89. valor_lite-0.33.18/tests/text_generation/metrics/test_metric.py +0 -39
  90. valor_lite-0.33.18/tests/text_generation/metrics/test_rouge.py +0 -242
  91. valor_lite-0.33.18/tests/text_generation/metrics/test_sentence_bleu.py +0 -224
  92. valor_lite-0.33.18/tests/text_generation/metrics/test_summary_coherence.py +0 -88
  93. valor_lite-0.33.18/tests/text_generation/metrics/test_toxicity.py +0 -105
  94. valor_lite-0.33.18/tests/text_generation/test_evaluator.py +0 -170
  95. valor_lite-0.33.18/tests/text_generation/test_manager.py +0 -110
  96. valor_lite-0.33.18/valor_lite/LICENSE +0 -21
  97. valor_lite-0.33.18/valor_lite/__init__.py +0 -0
  98. valor_lite-0.33.18/valor_lite/text_generation/llm/__init__.py +0 -0
  99. valor_lite-0.33.18/valor_lite.egg-info/SOURCES.txt +0 -122
  100. valor_lite-0.33.18/valor_lite.egg-info/requires.txt +0 -22
  101. {valor_lite-0.33.18 → valor_lite-0.34.0}/README.md +0 -0
  102. {valor_lite-0.33.18 → valor_lite-0.34.0}/setup.cfg +0 -0
  103. {valor_lite-0.33.18 → valor_lite-0.34.0/valor_lite}/LICENSE +0 -0
  104. {valor_lite-0.33.18/tests → valor_lite-0.34.0/valor_lite}/__init__.py +0 -0
  105. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/__init__.py +0 -0
  106. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/annotation.py +0 -0
  107. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/__init__.py +0 -0
  108. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/annotation.py +0 -0
  109. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/profiling.py +0 -0
  110. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/schemas.py +0 -0
  111. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/__init__.py +0 -0
  112. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/annotation.py +0 -0
  113. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/benchmark.py +0 -0
  114. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/__init__.py +0 -0
  115. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/annotation.py +0 -0
  116. {valor_lite-0.33.18/tests/classification → valor_lite-0.34.0/valor_lite/text_generation/llm}/__init__.py +0 -0
  117. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/exceptions.py +0 -0
  118. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/generation.py +0 -0
  119. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/instructions.py +0 -0
  120. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/integrations.py +0 -0
  121. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/utilities.py +0 -0
  122. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/validators.py +0 -0
  123. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/manager.py +0 -0
  124. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/metric.py +0 -0
  125. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite.egg-info/dependency_links.txt +0 -0
  126. {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,42 +1,16 @@
  Metadata-Version: 2.1
  Name: valor-lite
- Version: 0.33.18
- Summary: Compute valor metrics locally.
- License: MIT License
-
- Copyright (c) 2023 Striveworks
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
+ Version: 0.34.0
+ Summary: Evaluate machine learning models.
  Project-URL: homepage, https://www.striveworks.com
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
- License-File: LICENSE
+ Requires-Dist: numpy
+ Requires-Dist: tqdm
+ Requires-Dist: shapely
  Requires-Dist: evaluate
- Requires-Dist: importlib_metadata; python_version < "3.8"
  Requires-Dist: nltk
- Requires-Dist: numpy
- Requires-Dist: Pillow>=9.1.0
- Requires-Dist: requests
  Requires-Dist: rouge_score
- Requires-Dist: shapely
- Requires-Dist: tqdm
  Provides-Extra: mistral
  Requires-Dist: mistralai>=1.0; extra == "mistral"
  Provides-Extra: openai
@@ -45,6 +19,15 @@ Provides-Extra: test
  Requires-Dist: pytest; extra == "test"
  Requires-Dist: coverage; extra == "test"
  Requires-Dist: pre-commit; extra == "test"
+ Provides-Extra: docs
+ Requires-Dist: mkdocs; extra == "docs"
+ Requires-Dist: mkdocs-material; extra == "docs"
+ Requires-Dist: mkdocstrings; extra == "docs"
+ Requires-Dist: mkdocstrings-python; extra == "docs"
+ Requires-Dist: mkdocs-include-dir-to-nav; extra == "docs"
+ Requires-Dist: mkdocs-swagger-ui-tag; extra == "docs"
+ Provides-Extra: dev
+ Requires-Dist: valor-lite[docs,mistral,openai,test]; extra == "dev"

  # valor-lite: Fast, local machine learning evaluation.

pyproject.toml

@@ -1,20 +1,17 @@
  [project]
  name = "valor-lite"
  dynamic = ["version"]
- description = "Compute valor metrics locally."
+ description = "Evaluate machine learning models."
  readme = "README.md"
  requires-python = ">=3.10"
  license = { file = "LICENSE" }
  dependencies = [
+ "numpy",
+ "tqdm",
+ "shapely",
  "evaluate",
- "importlib_metadata; python_version < '3.8'",
  "nltk",
- "numpy",
- "Pillow >= 9.1.0",
- "requests",
  "rouge_score",
- "shapely",
- "tqdm",
  ]

  [project.urls]
@@ -28,6 +25,15 @@ build-backend = "setuptools.build_meta"
  mistral = ["mistralai >= 1.0"]
  openai = ["openai"]
  test = ["pytest", "coverage", "pre-commit"]
+ docs = [
+ "mkdocs",
+ "mkdocs-material",
+ "mkdocstrings",
+ "mkdocstrings-python",
+ "mkdocs-include-dir-to-nav",
+ "mkdocs-swagger-ui-tag",
+ ]
+ dev = ["valor-lite[openai, mistral, test, docs]"]

  [tool.black]
  line-length = 79
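
Taken together with the PKG-INFO changes above, the dependency list drops Pillow, requests, and the importlib_metadata backport, reorders the core requirements, and adds docs and dev extras. Assuming the extras resolve as declared, a development environment could presumably be set up with a single install such as `pip install "valor-lite[dev]"`, which in turn pulls in the openai, mistral, test, and docs extras.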
valor_lite/classification/computation.py

@@ -212,7 +212,7 @@ def _count_with_examples(
  data: NDArray[np.float64],
  unique_idx: int | list[int],
  label_idx: int | list[int],
- ) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.int32]]:
+ ) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.intp]]:
  """
  Helper function for counting occurences of unique detailed pairs.

@@ -231,7 +231,7 @@ def _count_with_examples(
  Examples drawn from the data input.
  NDArray[np.int32]
  Unique label indices.
- NDArray[np.int32]
+ NDArray[np.intp]
  Counts for each unique label index.
  """
  unique_rows, indices = np.unique(
@@ -282,18 +282,20 @@ def compute_confusion_matrix(
  NDArray[np.float64]
  Confusion matrix.
  NDArray[np.int32]
- Ground truths with missing predictions.
+ Unmatched Ground Truths.
  """

  n_labels = label_metadata.shape[0]
  n_scores = score_thresholds.shape[0]

- confusion_matrix = -1 * np.ones(
+ confusion_matrix = np.full(
  (n_scores, n_labels, n_labels, 2 * n_examples + 1),
+ fill_value=-1.0,
  dtype=np.float32,
  )
- missing_predictions = -1 * np.ones(
+ unmatched_ground_truths = np.full(
  (n_scores, n_labels, n_examples + 1),
+ fill_value=-1,
  dtype=np.int32,
  )

@@ -339,7 +341,7 @@ def compute_confusion_matrix(
  score_idx, misclf_labels[:, 0], misclf_labels[:, 1], 0
  ] = misclf_counts

- missing_predictions[score_idx, misprd_labels, 0] = misprd_counts
+ unmatched_ground_truths[score_idx, misprd_labels, 0] = misprd_counts

  if n_examples > 0:
  for label_idx in range(n_labels):
@@ -375,16 +377,16 @@ def compute_confusion_matrix(
  1 : 2 * misclf_label_examples.shape[0] + 1,
  ] = misclf_label_examples[:, [0, 3]].flatten()

- # missing prediction examples
+ # unmatched ground truth examples
  mask_misprd_label = misprd_examples[:, 1] == label_idx
  if misprd_examples.size > 0:
  misprd_label_examples = misprd_examples[mask_misprd_label][
  :n_examples
  ]
- missing_predictions[
+ unmatched_ground_truths[
  score_idx,
  label_idx,
  1 : misprd_label_examples.shape[0] + 1,
  ] = misprd_label_examples[:, 0].flatten()

- return confusion_matrix, missing_predictions
+ return confusion_matrix, unmatched_ground_truths # type: ignore[reportReturnType]
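
Two patterns recur throughout this file: `np.full` replaces the `-1 * np.ones(...)` idiom for building sentinel-filled buffers, and count arrays are now annotated as `NDArray[np.intp]`. A minimal standalone sketch of why those annotations line up (not taken from the package; shapes are illustrative):

    import numpy as np

    # Sentinel-filled buffer: equivalent to `-1 * np.ones(shape, dtype=...)`
    # but built in one step, without the intermediate ones array.
    confusion_matrix = np.full((2, 3, 3, 5), fill_value=-1.0, dtype=np.float32)

    # np.unique returns its counts (and indices) as np.intp, which is what
    # the updated return annotation NDArray[np.intp] reflects.
    values = np.array([0, 0, 1, 2, 2, 2])
    unique_values, counts = np.unique(values, return_counts=True)
    assert counts.dtype == np.intp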
valor_lite/classification/manager.py

@@ -4,6 +4,7 @@ from dataclasses import dataclass
  import numpy as np
  from numpy.typing import NDArray
  from tqdm import tqdm
+
  from valor_lite.classification.annotation import Classification
  from valor_lite.classification.computation import (
  compute_confusion_matrix,
@@ -38,7 +39,7 @@ filtered_metrics = evaluator.evaluate(filter_mask=filter_mask)

  @dataclass
  class Filter:
- indices: NDArray[np.int32]
+ indices: NDArray[np.intp]
  label_metadata: NDArray[np.int32]
  n_datums: int

@@ -169,8 +170,7 @@ class Evaluator:
  label_metadata_per_datum = self._label_metadata_per_datum.copy()
  label_metadata_per_datum[:, ~mask] = 0

- label_metadata = np.zeros_like(self._label_metadata, dtype=np.int32)
- label_metadata = np.transpose(
+ label_metadata: NDArray[np.int32] = np.transpose(
  np.sum(
  label_metadata_per_datum,
  axis=1,
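
For context on the two changes here (a hedged sketch, not package code): index arrays produced by NumPy's nonzero/where-style routines are `np.intp`, which matches the new `Filter.indices` annotation, and the dropped `np.zeros_like` allocation was immediately overwritten by the `np.transpose(np.sum(...))` result anyway.

    import numpy as np

    mask = np.array([True, False, True, True])
    indices = np.where(mask)[0]       # nonzero/where indices come back as np.intp
    assert indices.dtype == np.intp

    # Aggregating per-datum label metadata directly; a pre-allocated zeros
    # array would be rebound and discarded. Shapes are illustrative.
    label_metadata_per_datum = np.random.randint(0, 3, size=(2, 4, 5))
    label_metadata = np.transpose(np.sum(label_metadata_per_datum, axis=1))
    assert label_metadata.shape == (5, 2)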
valor_lite/classification/metric.py

@@ -321,7 +321,7 @@ class Metric(BaseMetric):
  ],
  ],
  ],
- missing_predictions: dict[
+ unmatched_ground_truths: dict[
  str, # ground truth label value
  dict[
  str, # either `count` or `examples`
@@ -335,8 +335,8 @@ class Metric(BaseMetric):
  The confusion matrix and related metrics for the classification task.

  This class encapsulates detailed information about the model's performance, including correct
- predictions, misclassifications, hallucinations (false positives), and missing predictions
- (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+ predictions, misclassifications, unmatched predictions (subset of false positives), and unmatched ground truths
+ (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis.

  Confusion Matrix Structure:
  {
@@ -358,7 +358,7 @@ class Metric(BaseMetric):
  ...
  }

- Missing Prediction Structure:
+ Unmatched Ground Truths Structure:
  {
  ground_truth_label: {
  'count': int,
@@ -379,7 +379,7 @@ class Metric(BaseMetric):
  A nested dictionary where the first key is the ground truth label value, the second key
  is the prediction label value, and the innermost dictionary contains either a `count`
  or a list of `examples`. Each example includes the datum UID and prediction score.
- missing_predictions : dict
+ unmatched_ground_truths : dict
  A dictionary where each key is a ground truth label value for which the model failed to predict
  (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
  Each example includes the datum UID.
@@ -396,7 +396,7 @@ class Metric(BaseMetric):
  type=MetricType.ConfusionMatrix.value,
  value={
  "confusion_matrix": confusion_matrix,
- "missing_predictions": missing_predictions,
+ "unmatched_ground_truths": unmatched_ground_truths,
  },
  parameters={
  "score_threshold": score_threshold,
valor_lite/classification/utilities.py

@@ -2,6 +2,7 @@ from collections import defaultdict

  import numpy as np
  from numpy.typing import NDArray
+
  from valor_lite.classification.metric import Metric, MetricType


@@ -153,20 +154,20 @@ def _unpack_confusion_matrix_value(
  }


- def _unpack_missing_predictions_value(
- missing_predictions: NDArray[np.int32],
+ def _unpack_unmatched_ground_truths_value(
+ unmatched_ground_truths: NDArray[np.int32],
  number_of_labels: int,
  number_of_examples: int,
  index_to_uid: dict[int, str],
  index_to_label: dict[int, str],
  ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
  """
- Unpacks a numpy array of missing prediction counts and examples.
+ Unpacks a numpy array of unmatched ground truth counts and examples.
  """

  datum_idx = (
  lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn
- missing_predictions[
+ unmatched_ground_truths[
  gt_label_idx,
  example_idx + 1,
  ]
@@ -176,7 +177,7 @@ def _unpack_missing_predictions_value(
  return {
  index_to_label[gt_label_idx]: {
  "count": max(
- int(missing_predictions[gt_label_idx, 0]),
+ int(unmatched_ground_truths[gt_label_idx, 0]),
  0,
  ),
  "examples": [
@@ -197,7 +198,7 @@ def unpack_confusion_matrix_into_metric_list(
  index_to_label: dict[int, str],
  ) -> list[Metric]:

- (confusion_matrix, missing_predictions) = results
+ (confusion_matrix, unmatched_ground_truths) = results
  n_scores, n_labels, _, _ = confusion_matrix.shape
  return [
  Metric.confusion_matrix(
@@ -210,8 +211,10 @@ def unpack_confusion_matrix_into_metric_list(
  index_to_label=index_to_label,
  index_to_uid=index_to_uid,
  ),
- missing_predictions=_unpack_missing_predictions_value(
- missing_predictions=missing_predictions[score_idx, :, :],
+ unmatched_ground_truths=_unpack_unmatched_ground_truths_value(
+ unmatched_ground_truths=unmatched_ground_truths[
+ score_idx, :, :
+ ],
  number_of_labels=n_labels,
  number_of_examples=number_of_examples,
  index_to_label=index_to_label,
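
The array this helper unpacks keeps the layout established in computation.py above: per score threshold it has shape (n_labels, n_examples + 1), with the count in column 0 and example datum indices in the remaining columns, padded with -1. A small standalone sketch of that layout (values are illustrative):

    import numpy as np

    n_labels, n_examples = 3, 2
    unmatched_ground_truths = np.full((n_labels, n_examples + 1), -1, dtype=np.int32)
    unmatched_ground_truths[1, 0] = 4          # four unmatched ground truths for label 1
    unmatched_ground_truths[1, 1:3] = [0, 7]   # datum indices of the stored examples

    count = max(int(unmatched_ground_truths[1, 0]), 0)   # 4
    examples = unmatched_ground_truths[1, 1:]
    example_datums = examples[examples >= 0]             # array([0, 7])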
valor_lite/object_detection/computation.py

@@ -381,9 +381,9 @@ def compute_precion_recall(
  _, indices_gt_unique = np.unique(
  tp_candidates[:, [0, 1, 4]], axis=0, return_index=True
  )
- mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=bool)
+ mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=np.bool_)
  mask_gt_unique[indices_gt_unique] = True
- true_positives_mask = np.zeros(n_rows, dtype=bool)
+ true_positives_mask = np.zeros(n_rows, dtype=np.bool_)
  true_positives_mask[mask_tp_inner] = mask_gt_unique

  # calculate intermediates
@@ -452,9 +452,9 @@ def compute_precion_recall(
  _, indices_gt_unique = np.unique(
  tp_candidates[:, [0, 1, 4]], axis=0, return_index=True
  )
- mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=bool)
+ mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=np.bool_)
  mask_gt_unique[indices_gt_unique] = True
- true_positives_mask = np.zeros(n_rows, dtype=bool)
+ true_positives_mask = np.zeros(n_rows, dtype=np.bool_)
  true_positives_mask[mask_tp_outer] = mask_gt_unique

  # count running tp and total for AP
@@ -501,8 +501,8 @@ def compute_precion_recall(
  )

  # calculate average precision
- running_max_precision = np.zeros((n_ious, n_labels))
- running_max_score = np.zeros((n_labels))
+ running_max_precision = np.zeros((n_ious, n_labels), dtype=np.float64)
+ running_max_score = np.zeros((n_labels), dtype=np.float64)
  for recall in range(100, -1, -1):

  # running max precision
@@ -528,8 +528,12 @@ def compute_precion_recall(

  # calculate mAP and mAR
  if unique_pd_labels.size > 0:
- mAP = average_precision[:, unique_pd_labels].mean(axis=1)
- mAR = average_recall[:, unique_pd_labels].mean(axis=1)
+ mAP: NDArray[np.float64] = average_precision[:, unique_pd_labels].mean(
+ axis=1
+ )
+ mAR: NDArray[np.float64] = average_recall[:, unique_pd_labels].mean(
+ axis=1
+ )
  else:
  mAP = np.zeros(n_ious, dtype=np.float64)
  mAR = np.zeros(n_scores, dtype=np.float64)
@@ -561,14 +565,14 @@ def compute_precion_recall(
  accuracy,
  counts,
  pr_curve,
- )
+ ) # type: ignore[reportReturnType]


  def _count_with_examples(
  data: NDArray[np.float64],
  unique_idx: int | list[int],
  label_idx: int | list[int],
- ) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.int32]]:
+ ) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.intp]]:
  """
  Helper function for counting occurences of unique detailed pairs.

@@ -587,7 +591,7 @@ def _count_with_examples(
  Examples drawn from the data input.
  NDArray[np.int32]
  Unique label indices.
- NDArray[np.int32]
+ NDArray[np.intp]
  Counts for each unique label index.
  """
  unique_rows, indices = np.unique(
@@ -669,9 +673,9 @@ def compute_confusion_matrix(
  NDArray[np.float64]
  Confusion matrix.
  NDArray[np.float64]
- Hallucinations.
+ Unmatched Predictions.
  NDArray[np.int32]
- Missing Predictions.
+ Unmatched Ground Truths.
  """

  n_labels = label_metadata.shape[0]
@@ -683,12 +687,12 @@ def compute_confusion_matrix(
  (n_ious, n_scores, n_labels, n_labels, 4 * n_examples + 1),
  dtype=np.float32,
  )
- hallucinations = -1 * np.ones(
+ unmatched_predictions = -1 * np.ones(
  # (datum idx, pd idx, pd score) * n_examples + count
  (n_ious, n_scores, n_labels, 3 * n_examples + 1),
  dtype=np.float32,
  )
- missing_predictions = -1 * np.ones(
+ unmatched_ground_truths = -1 * np.ones(
  # (datum idx, gt idx) * n_examples + count
  (n_ious, n_scores, n_labels, 2 * n_examples + 1),
  dtype=np.int32,
@@ -793,7 +797,7 @@ def compute_confusion_matrix(
  data[mask_misclf], unique_idx=[0, 1, 2, 4, 5], label_idx=[3, 4]
  )

- # count hallucinations
+ # count unmatched predictions
  (
  halluc_examples,
  halluc_labels,
@@ -802,7 +806,7 @@ def compute_confusion_matrix(
  data[mask_halluc], unique_idx=[0, 2, 5], label_idx=2
  )

- # count missing predictions
+ # count unmatched ground truths
  (
  misprd_examples,
  misprd_labels,
@@ -822,13 +826,13 @@ def compute_confusion_matrix(
  misclf_labels[:, 1],
  0,
  ] = misclf_counts
- hallucinations[
+ unmatched_predictions[
  iou_idx,
  score_idx,
  halluc_labels,
  0,
  ] = halluc_counts
- missing_predictions[
+ unmatched_ground_truths[
  iou_idx,
  score_idx,
  misprd_labels,
@@ -877,26 +881,26 @@ def compute_confusion_matrix(
  :, [0, 1, 2, 6]
  ].flatten()

- # hallucination examples
+ # unmatched prediction examples
  mask_halluc_label = halluc_examples[:, 5] == label_idx
  if mask_halluc_label.sum() > 0:
  halluc_label_examples = halluc_examples[
  mask_halluc_label
  ][:n_examples]
- hallucinations[
+ unmatched_predictions[
  iou_idx,
  score_idx,
  label_idx,
  1 : 3 * halluc_label_examples.shape[0] + 1,
  ] = halluc_label_examples[:, [0, 2, 6]].flatten()

- # missing prediction examples
+ # unmatched ground truth examples
  mask_misprd_label = misprd_examples[:, 4] == label_idx
  if misprd_examples.size > 0:
  misprd_label_examples = misprd_examples[
  mask_misprd_label
  ][:n_examples]
- missing_predictions[
+ unmatched_ground_truths[
  iou_idx,
  score_idx,
  label_idx,
@@ -905,6 +909,6 @@ def compute_confusion_matrix(

  return (
  confusion_matrix,
- hallucinations,
- missing_predictions,
- )
+ unmatched_predictions,
+ unmatched_ground_truths,
+ ) # type: ignore[reportReturnType]
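
Most of this file is the same rename (hallucinations → unmatched predictions, missing predictions → unmatched ground truths) plus dtype and annotation tightening. Two of the smaller changes, sketched standalone (shapes illustrative, not package code): `np.bool_` is simply the scalar type behind `dtype=bool`, and mAP/mAR remain means of the per-label curves restricted to the labels that actually occur in the predictions.

    import numpy as np

    # dtype=bool and dtype=np.bool_ build identical arrays; the change only
    # makes the annotation explicit.
    assert np.zeros(4, dtype=bool).dtype == np.zeros(4, dtype=np.bool_).dtype

    # mAP/mAR: mean over the unique predicted labels only.
    n_ious, n_labels = 2, 5
    average_precision = np.random.rand(n_ious, n_labels)
    unique_pd_labels = np.array([0, 2, 3])
    mAP = average_precision[:, unique_pd_labels].mean(axis=1)   # shape: (n_ious,)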
valor_lite/object_detection/manager.py

@@ -4,6 +4,7 @@ from dataclasses import dataclass
  import numpy as np
  from numpy.typing import NDArray
  from tqdm import tqdm
+
  from valor_lite.object_detection.annotation import Detection
  from valor_lite.object_detection.computation import (
  compute_bbox_iou,
@@ -42,8 +43,8 @@ filtered_metrics = evaluator.evaluate(iou_thresholds=[0.5], filter_mask=filter_m

  @dataclass
  class Filter:
- ranked_indices: NDArray[np.int32]
- detailed_indices: NDArray[np.int32]
+ ranked_indices: NDArray[np.intp]
+ detailed_indices: NDArray[np.intp]
  label_metadata: NDArray[np.int32]


@@ -569,7 +570,8 @@ class DataLoader:
  [gt.extrema, pd.extrema]
  for pd in detection.predictions
  for gt in detection.groundtruths
- ]
+ ],
+ dtype=np.float64,
  )
  ).reshape(len(detection.predictions), len(detection.groundtruths))
  for detection in detections
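
For orientation, a hedged sketch of the pairing that feeds `compute_bbox_iou` (the 4-value `extrema` layout used here is an assumption, not taken from the package): with P predictions and G ground truths the comprehension yields P·G (ground truth, prediction) pairs, the explicit `dtype=np.float64` keeps the stacked array numeric, and the IoU result is reshaped back to a (P, G) matrix.

    import numpy as np

    # Hypothetical boxes; each `extrema` is assumed to be a 4-vector here.
    gt_extrema = [np.array([0.0, 10.0, 0.0, 10.0]), np.array([5.0, 15.0, 5.0, 15.0])]
    pd_extrema = [np.array([1.0, 9.0, 1.0, 9.0])]

    pairs = np.array(
        [[gt, pd] for pd in pd_extrema for gt in gt_extrema],
        dtype=np.float64,  # mirrors the dtype added in 0.34.0
    )
    assert pairs.shape == (len(pd_extrema) * len(gt_extrema), 2, 4)
    # compute_bbox_iou(pairs) would then be reshaped to (len(pd_extrema), len(gt_extrema)).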
valor_lite/object_detection/metric.py

@@ -619,7 +619,7 @@ class Metric(BaseMetric):
  ],
  ],
  ],
- hallucinations: dict[
+ unmatched_predictions: dict[
  str, # prediction label value
  dict[
  str, # either `count` or `examples`
@@ -636,7 +636,7 @@ class Metric(BaseMetric):
  ],
  ],
  ],
- missing_predictions: dict[
+ unmatched_ground_truths: dict[
  str, # ground truth label value
  dict[
  str, # either `count` or `examples`
@@ -660,8 +660,8 @@ class Metric(BaseMetric):
  Confusion matrix for object detection tasks.

  This class encapsulates detailed information about the model's performance, including correct
- predictions, misclassifications, hallucinations (false positives), and missing predictions
- (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+ predictions, misclassifications, unmatched_predictions (subset of false positives), and unmatched ground truths
+ (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis.

  Confusion Matrix Format:
  {
@@ -683,7 +683,7 @@ class Metric(BaseMetric):
  ...
  }

- Hallucinations Format:
+ Unmatched Predictions Format:
  {
  <prediction label>: {
  'count': int,
@@ -699,7 +699,7 @@ class Metric(BaseMetric):
  ...
  }

- Missing Prediction Format:
+ Unmatched Ground Truths Format:
  {
  <ground truth label>: {
  'count': int,
@@ -721,13 +721,13 @@ class Metric(BaseMetric):
  is the prediction label value, and the innermost dictionary contains either a `count`
  or a list of `examples`. Each example includes the datum UID, ground truth bounding box,
  predicted bounding box, and prediction scores.
- hallucinations : dict
+ unmatched_predictions : dict
  A dictionary where each key is a prediction label value with no corresponding ground truth
- (false positives). The value is a dictionary containing either a `count` or a list of
+ (subset of false positives). The value is a dictionary containing either a `count` or a list of
  `examples`. Each example includes the datum UID, predicted bounding box, and prediction score.
- missing_predictions : dict
+ unmatched_ground_truths : dict
  A dictionary where each key is a ground truth label value for which the model failed to predict
- (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
+ (subset of false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
  Each example includes the datum UID and ground truth bounding box.
  score_threshold : float
  The confidence score threshold used to filter predictions.
@@ -744,8 +744,8 @@ class Metric(BaseMetric):
  type=MetricType.ConfusionMatrix.value,
  value={
  "confusion_matrix": confusion_matrix,
- "hallucinations": hallucinations,
- "missing_predictions": missing_predictions,
+ "unmatched_predictions": unmatched_predictions,
+ "unmatched_ground_truths": unmatched_ground_truths,
  },
  parameters={
  "score_threshold": score_threshold,