valor-lite 0.33.18__tar.gz → 0.34.0__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- {valor_lite-0.33.18/valor_lite.egg-info → valor_lite-0.34.0}/PKG-INFO +14 -31
- {valor_lite-0.33.18 → valor_lite-0.34.0}/pyproject.toml +13 -7
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/computation.py +11 -9
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/manager.py +3 -3
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/metric.py +6 -6
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/utilities.py +11 -8
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/computation.py +30 -26
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/manager.py +5 -3
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/metric.py +12 -12
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/utilities.py +24 -23
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/computation.py +9 -9
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/manager.py +2 -1
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/metric.py +10 -10
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/utilities.py +7 -6
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/computation.py +1 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0/valor_lite.egg-info}/PKG-INFO +14 -31
- valor_lite-0.34.0/valor_lite.egg-info/SOURCES.txt +42 -0
- valor_lite-0.34.0/valor_lite.egg-info/requires.txt +28 -0
- valor_lite-0.33.18/benchmarks/.gitignore +0 -2
- valor_lite-0.33.18/benchmarks/benchmark_classification.py +0 -272
- valor_lite-0.33.18/benchmarks/benchmark_objdet.py +0 -410
- valor_lite-0.33.18/benchmarks/synthetic/benchmark_semantic_segmentation.py +0 -94
- valor_lite-0.33.18/examples/.gitignore +0 -1
- valor_lite-0.33.18/examples/benchmarking.ipynb +0 -279
- valor_lite-0.33.18/examples/object-detection.ipynb +0 -3023
- valor_lite-0.33.18/examples/tabular_classification.ipynb +0 -633
- valor_lite-0.33.18/examples/text_generation.ipynb +0 -500
- valor_lite-0.33.18/tests/classification/conftest.py +0 -331
- valor_lite-0.33.18/tests/classification/test_accuracy.py +0 -226
- valor_lite-0.33.18/tests/classification/test_confusion_matrix.py +0 -618
- valor_lite-0.33.18/tests/classification/test_counts.py +0 -1318
- valor_lite-0.33.18/tests/classification/test_dataloader.py +0 -18
- valor_lite-0.33.18/tests/classification/test_evaluator.py +0 -64
- valor_lite-0.33.18/tests/classification/test_f1.py +0 -394
- valor_lite-0.33.18/tests/classification/test_filtering.py +0 -570
- valor_lite-0.33.18/tests/classification/test_metric.py +0 -39
- valor_lite-0.33.18/tests/classification/test_precision.py +0 -392
- valor_lite-0.33.18/tests/classification/test_recall.py +0 -327
- valor_lite-0.33.18/tests/classification/test_rocauc.py +0 -396
- valor_lite-0.33.18/tests/classification/test_schemas.py +0 -30
- valor_lite-0.33.18/tests/classification/test_stability.py +0 -70
- valor_lite-0.33.18/tests/object_detection/__init__.py +0 -0
- valor_lite-0.33.18/tests/object_detection/conftest.py +0 -1123
- valor_lite-0.33.18/tests/object_detection/test_accuracy.py +0 -483
- valor_lite-0.33.18/tests/object_detection/test_average_precision.py +0 -854
- valor_lite-0.33.18/tests/object_detection/test_average_recall.py +0 -604
- valor_lite-0.33.18/tests/object_detection/test_confusion_matrix.py +0 -1789
- valor_lite-0.33.18/tests/object_detection/test_counts.py +0 -601
- valor_lite-0.33.18/tests/object_detection/test_dataloader.py +0 -144
- valor_lite-0.33.18/tests/object_detection/test_evaluator.py +0 -130
- valor_lite-0.33.18/tests/object_detection/test_f1.py +0 -470
- valor_lite-0.33.18/tests/object_detection/test_filtering.py +0 -586
- valor_lite-0.33.18/tests/object_detection/test_iou.py +0 -371
- valor_lite-0.33.18/tests/object_detection/test_metric.py +0 -39
- valor_lite-0.33.18/tests/object_detection/test_pr_curve.py +0 -234
- valor_lite-0.33.18/tests/object_detection/test_precision.py +0 -470
- valor_lite-0.33.18/tests/object_detection/test_recall.py +0 -469
- valor_lite-0.33.18/tests/object_detection/test_schemas.py +0 -160
- valor_lite-0.33.18/tests/object_detection/test_stability.py +0 -103
- valor_lite-0.33.18/tests/semantic_segmentation/__init__.py +0 -0
- valor_lite-0.33.18/tests/semantic_segmentation/conftest.py +0 -132
- valor_lite-0.33.18/tests/semantic_segmentation/test_accuracy.py +0 -67
- valor_lite-0.33.18/tests/semantic_segmentation/test_annotation.py +0 -136
- valor_lite-0.33.18/tests/semantic_segmentation/test_confusion_matrix.py +0 -153
- valor_lite-0.33.18/tests/semantic_segmentation/test_dataloader.py +0 -8
- valor_lite-0.33.18/tests/semantic_segmentation/test_evaluator.py +0 -66
- valor_lite-0.33.18/tests/semantic_segmentation/test_f1.py +0 -94
- valor_lite-0.33.18/tests/semantic_segmentation/test_filtering.py +0 -75
- valor_lite-0.33.18/tests/semantic_segmentation/test_iou.py +0 -125
- valor_lite-0.33.18/tests/semantic_segmentation/test_metric.py +0 -39
- valor_lite-0.33.18/tests/semantic_segmentation/test_precision.py +0 -96
- valor_lite-0.33.18/tests/semantic_segmentation/test_recall.py +0 -94
- valor_lite-0.33.18/tests/semantic_segmentation/test_stability.py +0 -85
- valor_lite-0.33.18/tests/text_generation/__init__.py +0 -0
- valor_lite-0.33.18/tests/text_generation/conftest.py +0 -240
- valor_lite-0.33.18/tests/text_generation/llm/__init__.py +0 -0
- valor_lite-0.33.18/tests/text_generation/llm/test_generation.py +0 -140
- valor_lite-0.33.18/tests/text_generation/llm/test_integrations.py +0 -364
- valor_lite-0.33.18/tests/text_generation/llm/test_utilities.py +0 -83
- valor_lite-0.33.18/tests/text_generation/llm/test_validators.py +0 -107
- valor_lite-0.33.18/tests/text_generation/metrics/test_answer_correctness.py +0 -87
- valor_lite-0.33.18/tests/text_generation/metrics/test_answer_relevance.py +0 -109
- valor_lite-0.33.18/tests/text_generation/metrics/test_bias.py +0 -104
- valor_lite-0.33.18/tests/text_generation/metrics/test_context_precision.py +0 -195
- valor_lite-0.33.18/tests/text_generation/metrics/test_context_recall.py +0 -194
- valor_lite-0.33.18/tests/text_generation/metrics/test_context_relevance.py +0 -135
- valor_lite-0.33.18/tests/text_generation/metrics/test_faithfulness.py +0 -155
- valor_lite-0.33.18/tests/text_generation/metrics/test_hallucination.py +0 -118
- valor_lite-0.33.18/tests/text_generation/metrics/test_metric.py +0 -39
- valor_lite-0.33.18/tests/text_generation/metrics/test_rouge.py +0 -242
- valor_lite-0.33.18/tests/text_generation/metrics/test_sentence_bleu.py +0 -224
- valor_lite-0.33.18/tests/text_generation/metrics/test_summary_coherence.py +0 -88
- valor_lite-0.33.18/tests/text_generation/metrics/test_toxicity.py +0 -105
- valor_lite-0.33.18/tests/text_generation/test_evaluator.py +0 -170
- valor_lite-0.33.18/tests/text_generation/test_manager.py +0 -110
- valor_lite-0.33.18/valor_lite/LICENSE +0 -21
- valor_lite-0.33.18/valor_lite/__init__.py +0 -0
- valor_lite-0.33.18/valor_lite/text_generation/llm/__init__.py +0 -0
- valor_lite-0.33.18/valor_lite.egg-info/SOURCES.txt +0 -122
- valor_lite-0.33.18/valor_lite.egg-info/requires.txt +0 -22
- {valor_lite-0.33.18 → valor_lite-0.34.0}/README.md +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/setup.cfg +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0/valor_lite}/LICENSE +0 -0
- {valor_lite-0.33.18/tests → valor_lite-0.34.0/valor_lite}/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/annotation.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/annotation.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/profiling.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/schemas.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/annotation.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/semantic_segmentation/benchmark.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/annotation.py +0 -0
- {valor_lite-0.33.18/tests/classification → valor_lite-0.34.0/valor_lite/text_generation/llm}/__init__.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/exceptions.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/generation.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/instructions.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/integrations.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/utilities.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/llm/validators.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/manager.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/text_generation/metric.py +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite.egg-info/dependency_links.txt +0 -0
- {valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite.egg-info/top_level.txt +0 -0
{valor_lite-0.33.18/valor_lite.egg-info → valor_lite-0.34.0}/PKG-INFO

```diff
@@ -1,42 +1,16 @@
 Metadata-Version: 2.1
 Name: valor-lite
-Version: 0.33.18
-Summary:
-License: MIT License
-
-Copyright (c) 2023 Striveworks
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
+Version: 0.34.0
+Summary: Evaluate machine learning models.
 Project-URL: homepage, https://www.striveworks.com
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
-
+Requires-Dist: numpy
+Requires-Dist: tqdm
+Requires-Dist: shapely
 Requires-Dist: evaluate
-Requires-Dist: importlib_metadata; python_version < "3.8"
 Requires-Dist: nltk
-Requires-Dist: numpy
-Requires-Dist: Pillow>=9.1.0
-Requires-Dist: requests
 Requires-Dist: rouge_score
-Requires-Dist: shapely
-Requires-Dist: tqdm
 Provides-Extra: mistral
 Requires-Dist: mistralai>=1.0; extra == "mistral"
 Provides-Extra: openai
@@ -45,6 +19,15 @@ Provides-Extra: test
 Requires-Dist: pytest; extra == "test"
 Requires-Dist: coverage; extra == "test"
 Requires-Dist: pre-commit; extra == "test"
+Provides-Extra: docs
+Requires-Dist: mkdocs; extra == "docs"
+Requires-Dist: mkdocs-material; extra == "docs"
+Requires-Dist: mkdocstrings; extra == "docs"
+Requires-Dist: mkdocstrings-python; extra == "docs"
+Requires-Dist: mkdocs-include-dir-to-nav; extra == "docs"
+Requires-Dist: mkdocs-swagger-ui-tag; extra == "docs"
+Provides-Extra: dev
+Requires-Dist: valor-lite[docs,mistral,openai,test]; extra == "dev"
 
 # valor-lite: Fast, local machine learning evaluation.
 
```
{valor_lite-0.33.18 → valor_lite-0.34.0}/pyproject.toml

```diff
@@ -1,20 +1,17 @@
 [project]
 name = "valor-lite"
 dynamic = ["version"]
-description = "
+description = "Evaluate machine learning models."
 readme = "README.md"
 requires-python = ">=3.10"
 license = { file = "LICENSE" }
 dependencies = [
+    "numpy",
+    "tqdm",
+    "shapely",
     "evaluate",
-    "importlib_metadata; python_version < '3.8'",
     "nltk",
-    "numpy",
-    "Pillow >= 9.1.0",
-    "requests",
     "rouge_score",
-    "shapely",
-    "tqdm",
 ]
 
 [project.urls]
@@ -28,6 +25,15 @@ build-backend = "setuptools.build_meta"
 mistral = ["mistralai >= 1.0"]
 openai = ["openai"]
 test = ["pytest", "coverage", "pre-commit"]
+docs = [
+    "mkdocs",
+    "mkdocs-material",
+    "mkdocstrings",
+    "mkdocstrings-python",
+    "mkdocs-include-dir-to-nav",
+    "mkdocs-swagger-ui-tag",
+]
+dev = ["valor-lite[openai, mistral, test, docs]"]
 
 [tool.black]
 line-length = 79
```
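Packaging takeaway: the runtime dependency list shrinks (importlib_metadata, Pillow, and requests are gone), and two extras are added, `docs` plus a `dev` extra that pulls in the other groups. A small sketch of how one might verify the declared metadata against an installed copy, using only the standard library (assumes valor-lite 0.34.0 is installed, e.g. via `pip install "valor-lite[dev]"`):

```python
# Sketch: inspect the metadata of an installed valor-lite 0.34.0.
from importlib.metadata import metadata

md = metadata("valor-lite")
print(md["Version"])                 # 0.34.0
print(md.get_all("Provides-Extra"))  # ['mistral', 'openai', 'test', 'docs', 'dev']
```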
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/computation.py

```diff
@@ -212,7 +212,7 @@ def _count_with_examples(
     data: NDArray[np.float64],
     unique_idx: int | list[int],
     label_idx: int | list[int],
-) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.
+) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.intp]]:
     """
     Helper function for counting occurences of unique detailed pairs.
 
@@ -231,7 +231,7 @@ def _count_with_examples(
         Examples drawn from the data input.
     NDArray[np.int32]
         Unique label indices.
-    NDArray[np.
+    NDArray[np.intp]
         Counts for each unique label index.
     """
     unique_rows, indices = np.unique(
```
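The signature change above tracks what NumPy actually returns: `np.unique` reports indices and counts as platform-sized integers (`np.intp`), not `int32`. A minimal standalone check of that behavior:

```python
import numpy as np

# np.unique hands back counts (and indices) as np.intp, which is what the
# updated _count_with_examples return annotation now declares.
data = np.array([[0.0, 1.0], [0.0, 1.0], [2.0, 3.0]])
unique_rows, counts = np.unique(data, axis=0, return_counts=True)
print(counts, counts.dtype)  # [2 1] int64 (np.intp on 64-bit platforms)
```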
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/computation.py (continued)

```diff
@@ -282,18 +282,20 @@ def compute_confusion_matrix(
     NDArray[np.float64]
         Confusion matrix.
     NDArray[np.int32]
-        Ground
+        Unmatched Ground Truths.
     """
 
     n_labels = label_metadata.shape[0]
     n_scores = score_thresholds.shape[0]
 
-    confusion_matrix =
+    confusion_matrix = np.full(
         (n_scores, n_labels, n_labels, 2 * n_examples + 1),
+        fill_value=-1.0,
         dtype=np.float32,
     )
-
+    unmatched_ground_truths = np.full(
         (n_scores, n_labels, n_examples + 1),
+        fill_value=-1,
        dtype=np.int32,
     )
 
@@ -339,7 +341,7 @@
             score_idx, misclf_labels[:, 0], misclf_labels[:, 1], 0
         ] = misclf_counts
 
-
+        unmatched_ground_truths[score_idx, misprd_labels, 0] = misprd_counts
 
         if n_examples > 0:
             for label_idx in range(n_labels):
@@ -375,16 +377,16 @@
                     1 : 2 * misclf_label_examples.shape[0] + 1,
                 ] = misclf_label_examples[:, [0, 3]].flatten()
 
-                #
+                # unmatched ground truth examples
                 mask_misprd_label = misprd_examples[:, 1] == label_idx
                 if misprd_examples.size > 0:
                     misprd_label_examples = misprd_examples[mask_misprd_label][
                         :n_examples
                     ]
-
+                    unmatched_ground_truths[
                         score_idx,
                         label_idx,
                         1 : misprd_label_examples.shape[0] + 1,
                     ] = misprd_label_examples[:, 0].flatten()
 
-    return confusion_matrix,
+    return confusion_matrix, unmatched_ground_truths  # type: ignore[reportReturnType]
```
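The new buffer initialization spells out the `-1` sentinel with `np.full`, so any slot the counting loops never write still reads as "no data" when unpacked later. A toy-sized version of the two buffers above:

```python
import numpy as np

# Toy-sized versions of the buffers above; every untouched slot stays -1,
# which the unpacking code later treats as "no count / no example".
n_scores, n_labels, n_examples = 2, 3, 1
confusion_matrix = np.full(
    (n_scores, n_labels, n_labels, 2 * n_examples + 1),
    fill_value=-1.0,
    dtype=np.float32,
)
unmatched_ground_truths = np.full(
    (n_scores, n_labels, n_examples + 1),
    fill_value=-1,
    dtype=np.int32,
)
print(confusion_matrix.shape, unmatched_ground_truths[0, 0])  # (2, 3, 3, 3) [-1 -1]
```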
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/manager.py

```diff
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import numpy as np
 from numpy.typing import NDArray
 from tqdm import tqdm
+
 from valor_lite.classification.annotation import Classification
 from valor_lite.classification.computation import (
     compute_confusion_matrix,
@@ -38,7 +39,7 @@ filtered_metrics = evaluator.evaluate(filter_mask=filter_mask)
 
 @dataclass
 class Filter:
-    indices: NDArray[np.
+    indices: NDArray[np.intp]
     label_metadata: NDArray[np.int32]
     n_datums: int
 
@@ -169,8 +170,7 @@ class Evaluator:
         label_metadata_per_datum = self._label_metadata_per_datum.copy()
         label_metadata_per_datum[:, ~mask] = 0
 
-        label_metadata
-        label_metadata = np.transpose(
+        label_metadata: NDArray[np.int32] = np.transpose(
             np.sum(
                 label_metadata_per_datum,
                 axis=1,
```
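In the filter path above, per-label counts are rebuilt by zeroing the masked-out datums and summing over the datum axis; the added `NDArray[np.int32]` annotation only names the result type. A toy walk-through under assumed shapes (treating axis 0 as ground-truth versus prediction counts is an assumption):

```python
import numpy as np

# Assumed layout: axis 0 = [ground truths, predictions], axis 1 = datums,
# axis 2 = labels. Masked datums are zeroed, then summed away.
label_metadata_per_datum = np.array(
    [
        [[1, 0], [0, 1]],  # ground-truth counts per (datum, label)
        [[1, 1], [1, 0]],  # prediction counts per (datum, label)
    ],
    dtype=np.int32,
)
mask = np.array([True, False])  # keep only datum 0
label_metadata_per_datum[:, ~mask] = 0
label_metadata: np.ndarray = np.transpose(
    np.sum(label_metadata_per_datum, axis=1)
)
print(label_metadata)  # per-label [gt, pd] counts: [[1 1], [0 1]]
```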
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/metric.py

```diff
@@ -321,7 +321,7 @@ class Metric(BaseMetric):
                 ],
             ],
         ],
-
+        unmatched_ground_truths: dict[
             str,  # ground truth label value
             dict[
                 str,  # either `count` or `examples`
@@ -335,8 +335,8 @@ class Metric(BaseMetric):
     The confusion matrix and related metrics for the classification task.
 
     This class encapsulates detailed information about the model's performance, including correct
-    predictions, misclassifications,
-    (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+    predictions, misclassifications, unmatched predictions (subset of false positives), and unmatched ground truths
+    (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
 
     Confusion Matrix Structure:
     {
@@ -358,7 +358,7 @@ class Metric(BaseMetric):
         ...
     }
 
-
+    Unmatched Ground Truths Structure:
     {
         ground_truth_label: {
             'count': int,
@@ -379,7 +379,7 @@ class Metric(BaseMetric):
         A nested dictionary where the first key is the ground truth label value, the second key
         is the prediction label value, and the innermost dictionary contains either a `count`
        or a list of `examples`. Each example includes the datum UID and prediction score.
-
+    unmatched_ground_truths : dict
        A dictionary where each key is a ground truth label value for which the model failed to predict
        (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
        Each example includes the datum UID.
@@ -396,7 +396,7 @@ class Metric(BaseMetric):
             type=MetricType.ConfusionMatrix.value,
             value={
                 "confusion_matrix": confusion_matrix,
-                "
+                "unmatched_ground_truths": unmatched_ground_truths,
             },
             parameters={
                 "score_threshold": score_threshold,
```
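For consumers of the classification metrics, the practical effect of these changes is a renamed key in the `ConfusionMatrix` payload. A standalone sketch of the 0.34.0 value structure, assembled from the structures documented above (the concrete labels and counts are invented):

```python
# Sketch: the shape of a classification ConfusionMatrix metric in 0.34.0,
# with the payload key renamed to "unmatched_ground_truths".
metric = {
    "type": "ConfusionMatrix",
    "value": {
        "confusion_matrix": {"dog": {"dog": {"count": 2}}},
        "unmatched_ground_truths": {"cat": {"count": 1}},  # renamed key
    },
    "parameters": {"score_threshold": 0.5},
}
print(metric["value"]["unmatched_ground_truths"]["cat"]["count"])  # 1
```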
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/classification/utilities.py

```diff
@@ -2,6 +2,7 @@ from collections import defaultdict
 
 import numpy as np
 from numpy.typing import NDArray
+
 from valor_lite.classification.metric import Metric, MetricType
 
 
@@ -153,20 +154,20 @@ def _unpack_confusion_matrix_value(
     }
 
 
-def
-
+def _unpack_unmatched_ground_truths_value(
+    unmatched_ground_truths: NDArray[np.int32],
     number_of_labels: int,
     number_of_examples: int,
     index_to_uid: dict[int, str],
     index_to_label: dict[int, str],
 ) -> dict[str, dict[str, int | list[dict[str, str]]]]:
     """
-    Unpacks a numpy array of
+    Unpacks a numpy array of unmatched ground truth counts and examples.
     """
 
     datum_idx = (
         lambda gt_label_idx, example_idx: int(  # noqa: E731 - lambda fn
-
+            unmatched_ground_truths[
                 gt_label_idx,
                 example_idx + 1,
             ]
@@ -176,7 +177,7 @@ def _unpack_missing_predictions_value(
     return {
         index_to_label[gt_label_idx]: {
             "count": max(
-                int(
+                int(unmatched_ground_truths[gt_label_idx, 0]),
                 0,
             ),
             "examples": [
@@ -197,7 +198,7 @@ def unpack_confusion_matrix_into_metric_list(
     index_to_label: dict[int, str],
 ) -> list[Metric]:
 
-    (confusion_matrix,
+    (confusion_matrix, unmatched_ground_truths) = results
     n_scores, n_labels, _, _ = confusion_matrix.shape
     return [
         Metric.confusion_matrix(
@@ -210,8 +211,10 @@ def unpack_confusion_matrix_into_metric_list(
                 index_to_label=index_to_label,
                 index_to_uid=index_to_uid,
             ),
-
-
+            unmatched_ground_truths=_unpack_unmatched_ground_truths_value(
+                unmatched_ground_truths=unmatched_ground_truths[
+                    score_idx, :, :
+                ],
                 number_of_labels=n_labels,
                 number_of_examples=number_of_examples,
                 index_to_label=index_to_label,
```
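The renamed unpacking helper assumes a fixed per-label layout: slot 0 holds the count and the remaining slots hold example datum indices, with `-1` as padding (hence the `max(..., 0)` guard). A standalone sketch of that decode:

```python
import numpy as np

# Per-label layout assumed by _unpack_unmatched_ground_truths_value:
# [count, example datum idx, example datum idx, ...], padded with -1.
unmatched_ground_truths = np.array(
    [
        [2, 7, 3],     # label 0: two unmatched ground truths, datums 7 and 3
        [-1, -1, -1],  # label 1: nothing recorded
    ],
    dtype=np.int32,
)
for gt_label_idx in range(unmatched_ground_truths.shape[0]):
    count = max(int(unmatched_ground_truths[gt_label_idx, 0]), 0)
    examples = [int(d) for d in unmatched_ground_truths[gt_label_idx, 1:] if d >= 0]
    print(gt_label_idx, count, examples)  # 0 2 [7, 3] then 1 0 []
```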
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/computation.py

```diff
@@ -381,9 +381,9 @@ def compute_precion_recall(
         _, indices_gt_unique = np.unique(
             tp_candidates[:, [0, 1, 4]], axis=0, return_index=True
         )
-        mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=
+        mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=np.bool_)
         mask_gt_unique[indices_gt_unique] = True
-        true_positives_mask = np.zeros(n_rows, dtype=
+        true_positives_mask = np.zeros(n_rows, dtype=np.bool_)
         true_positives_mask[mask_tp_inner] = mask_gt_unique
 
         # calculate intermediates
@@ -452,9 +452,9 @@ def compute_precion_recall(
         _, indices_gt_unique = np.unique(
             tp_candidates[:, [0, 1, 4]], axis=0, return_index=True
         )
-        mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=
+        mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=np.bool_)
         mask_gt_unique[indices_gt_unique] = True
-        true_positives_mask = np.zeros(n_rows, dtype=
+        true_positives_mask = np.zeros(n_rows, dtype=np.bool_)
         true_positives_mask[mask_tp_outer] = mask_gt_unique
 
         # count running tp and total for AP
```
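Both hunks pin the mask buffers to an explicit `np.bool_` dtype (the old spellings are truncated in this view, so they are not recoverable here). The masks mark the first occurrence of each unique ground-truth pairing and scatter that selection into a full-length row mask; a miniature version:

```python
import numpy as np

# Boolean masks as used above: flag the first occurrence of each unique row,
# then scatter that selection into a full-length boolean row mask.
tp_candidates = np.array([[0, 1], [0, 1], [2, 3]])
_, indices_gt_unique = np.unique(tp_candidates, axis=0, return_index=True)
mask_gt_unique = np.zeros(tp_candidates.shape[0], dtype=np.bool_)
mask_gt_unique[indices_gt_unique] = True
print(mask_gt_unique)  # [ True False  True]
```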
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/computation.py (continued)

```diff
@@ -501,8 +501,8 @@ def compute_precion_recall(
     )
 
     # calculate average precision
-    running_max_precision = np.zeros((n_ious, n_labels))
-    running_max_score = np.zeros((n_labels))
+    running_max_precision = np.zeros((n_ious, n_labels), dtype=np.float64)
+    running_max_score = np.zeros((n_labels), dtype=np.float64)
     for recall in range(100, -1, -1):
 
         # running max precision
@@ -528,8 +528,12 @@ def compute_precion_recall(
 
     # calculate mAP and mAR
     if unique_pd_labels.size > 0:
-        mAP = average_precision[:, unique_pd_labels].mean(
-
+        mAP: NDArray[np.float64] = average_precision[:, unique_pd_labels].mean(
+            axis=1
+        )
+        mAR: NDArray[np.float64] = average_recall[:, unique_pd_labels].mean(
+            axis=1
+        )
     else:
         mAP = np.zeros(n_ious, dtype=np.float64)
         mAR = np.zeros(n_scores, dtype=np.float64)
```
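The expanded lines make the reduction explicit: mAP and mAR are per-label means restricted to the labels that actually appear in the predictions, producing one value per threshold. A toy version with invented numbers:

```python
import numpy as np

# mAP as a label-wise mean over the evaluated label subset; rows are IoU
# thresholds, columns are labels.
average_precision = np.array([[0.9, 0.5, 0.7],
                              [0.8, 0.4, 0.6]])
unique_pd_labels = np.array([0, 2])  # only labels 0 and 2 were predicted
mAP = average_precision[:, unique_pd_labels].mean(axis=1)
print(mAP)  # [0.8 0.7]
```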
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/computation.py (continued)

```diff
@@ -561,14 +565,14 @@ def compute_precion_recall(
         accuracy,
         counts,
         pr_curve,
-    )
+    )  # type: ignore[reportReturnType]
 
 
 def _count_with_examples(
     data: NDArray[np.float64],
     unique_idx: int | list[int],
     label_idx: int | list[int],
-) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.
+) -> tuple[NDArray[np.float64], NDArray[np.int32], NDArray[np.intp]]:
     """
     Helper function for counting occurences of unique detailed pairs.
 
@@ -587,7 +591,7 @@ def _count_with_examples(
         Examples drawn from the data input.
     NDArray[np.int32]
         Unique label indices.
-    NDArray[np.
+    NDArray[np.intp]
         Counts for each unique label index.
     """
     unique_rows, indices = np.unique(
@@ -669,9 +673,9 @@ def compute_confusion_matrix(
     NDArray[np.float64]
         Confusion matrix.
     NDArray[np.float64]
-
+        Unmatched Predictions.
     NDArray[np.int32]
-
+        Unmatched Ground Truths.
     """
 
     n_labels = label_metadata.shape[0]
@@ -683,12 +687,12 @@ def compute_confusion_matrix(
         (n_ious, n_scores, n_labels, n_labels, 4 * n_examples + 1),
         dtype=np.float32,
     )
-
+    unmatched_predictions = -1 * np.ones(
         # (datum idx, pd idx, pd score) * n_examples + count
         (n_ious, n_scores, n_labels, 3 * n_examples + 1),
         dtype=np.float32,
     )
-
+    unmatched_ground_truths = -1 * np.ones(
         # (datum idx, gt idx) * n_examples + count
         (n_ious, n_scores, n_labels, 2 * n_examples + 1),
         dtype=np.int32,
@@ -793,7 +797,7 @@ def compute_confusion_matrix(
         data[mask_misclf], unique_idx=[0, 1, 2, 4, 5], label_idx=[3, 4]
     )
 
-    # count
+    # count unmatched predictions
     (
         halluc_examples,
         halluc_labels,
@@ -802,7 +806,7 @@ def compute_confusion_matrix(
         data[mask_halluc], unique_idx=[0, 2, 5], label_idx=2
     )
 
-    # count
+    # count unmatched ground truths
     (
         misprd_examples,
         misprd_labels,
@@ -822,13 +826,13 @@ def compute_confusion_matrix(
             misclf_labels[:, 1],
             0,
         ] = misclf_counts
-
+        unmatched_predictions[
            iou_idx,
            score_idx,
            halluc_labels,
            0,
        ] = halluc_counts
-
+        unmatched_ground_truths[
            iou_idx,
            score_idx,
            misprd_labels,
@@ -877,26 +881,26 @@ def compute_confusion_matrix(
                     :, [0, 1, 2, 6]
                 ].flatten()
 
-                #
+                # unmatched prediction examples
                 mask_halluc_label = halluc_examples[:, 5] == label_idx
                 if mask_halluc_label.sum() > 0:
                     halluc_label_examples = halluc_examples[
                         mask_halluc_label
                     ][:n_examples]
-
+                    unmatched_predictions[
                         iou_idx,
                         score_idx,
                         label_idx,
                         1 : 3 * halluc_label_examples.shape[0] + 1,
                     ] = halluc_label_examples[:, [0, 2, 6]].flatten()
 
-                #
+                # unmatched ground truth examples
                 mask_misprd_label = misprd_examples[:, 4] == label_idx
                 if misprd_examples.size > 0:
                     misprd_label_examples = misprd_examples[
                         mask_misprd_label
                     ][:n_examples]
-
+                    unmatched_ground_truths[
                         iou_idx,
                         score_idx,
                         label_idx,
@@ -905,6 +909,6 @@ def compute_confusion_matrix(
 
     return (
         confusion_matrix,
-
-
-    )
+        unmatched_predictions,
+        unmatched_ground_truths,
+    )  # type: ignore[reportReturnType]
```
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/manager.py

```diff
@@ -4,6 +4,7 @@ from dataclasses import dataclass
 import numpy as np
 from numpy.typing import NDArray
 from tqdm import tqdm
+
 from valor_lite.object_detection.annotation import Detection
 from valor_lite.object_detection.computation import (
     compute_bbox_iou,
@@ -42,8 +43,8 @@ filtered_metrics = evaluator.evaluate(iou_thresholds=[0.5], filter_mask=filter_m
 
 @dataclass
 class Filter:
-    ranked_indices: NDArray[np.
-    detailed_indices: NDArray[np.
+    ranked_indices: NDArray[np.intp]
+    detailed_indices: NDArray[np.intp]
     label_metadata: NDArray[np.int32]
 
 
@@ -569,7 +570,8 @@ class DataLoader:
                     [gt.extrema, pd.extrema]
                     for pd in detection.predictions
                     for gt in detection.groundtruths
-                ]
+                ],
+                dtype=np.float64,
             )
         ).reshape(len(detection.predictions), len(detection.groundtruths))
         for detection in detections
```
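The `DataLoader` change adds an explicit dtype to the flattened extrema buffer. One plausible motivation (an assumption, the diff does not say) is that a datum with no predictions or no ground truths yields an empty list, and pinning `float64` keeps the buffer well typed and reshapeable for the IoU computation:

```python
import numpy as np

# With no predictions (or no ground truths) the comprehension yields an empty
# list; an explicit dtype keeps the buffer a well-typed float64 array that
# still reshapes cleanly to a zero-row matrix.
pairs = []  # e.g. a datum with no predictions
buffer = np.array(pairs, dtype=np.float64)
print(buffer.dtype, buffer.reshape(0, 4).shape)  # float64 (0, 4)
```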
{valor_lite-0.33.18 → valor_lite-0.34.0}/valor_lite/object_detection/metric.py

```diff
@@ -619,7 +619,7 @@ class Metric(BaseMetric):
                 ],
             ],
         ],
-
+        unmatched_predictions: dict[
             str,  # prediction label value
             dict[
                 str,  # either `count` or `examples`
@@ -636,7 +636,7 @@ class Metric(BaseMetric):
                 ],
             ],
         ],
-
+        unmatched_ground_truths: dict[
             str,  # ground truth label value
             dict[
                 str,  # either `count` or `examples`
@@ -660,8 +660,8 @@ class Metric(BaseMetric):
         Confusion matrix for object detection tasks.
 
         This class encapsulates detailed information about the model's performance, including correct
-        predictions, misclassifications,
-        (false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
+        predictions, misclassifications, unmatched_predictions (subset of false positives), and unmatched ground truths
+        (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis.
 
         Confusion Matrix Format:
         {
@@ -683,7 +683,7 @@ class Metric(BaseMetric):
             ...
         }
 
-
+        Unmatched Predictions Format:
         {
             <prediction label>: {
                 'count': int,
@@ -699,7 +699,7 @@ class Metric(BaseMetric):
             ...
         }
 
-
+        Unmatched Ground Truths Format:
        {
            <ground truth label>: {
                'count': int,
@@ -721,13 +721,13 @@ class Metric(BaseMetric):
            is the prediction label value, and the innermost dictionary contains either a `count`
            or a list of `examples`. Each example includes the datum UID, ground truth bounding box,
            predicted bounding box, and prediction scores.
-
+        unmatched_predictions : dict
            A dictionary where each key is a prediction label value with no corresponding ground truth
-           (false positives). The value is a dictionary containing either a `count` or a list of
+           (subset of false positives). The value is a dictionary containing either a `count` or a list of
            `examples`. Each example includes the datum UID, predicted bounding box, and prediction score.
-
+        unmatched_ground_truths : dict
            A dictionary where each key is a ground truth label value for which the model failed to predict
-           (false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
+           (subset of false negatives). The value is a dictionary containing either a `count` or a list of `examples`.
            Each example includes the datum UID and ground truth bounding box.
         score_threshold : float
             The confidence score threshold used to filter predictions.
@@ -744,8 +744,8 @@ class Metric(BaseMetric):
             type=MetricType.ConfusionMatrix.value,
             value={
                 "confusion_matrix": confusion_matrix,
-                "
-                "
+                "unmatched_predictions": unmatched_predictions,
+                "unmatched_ground_truths": unmatched_ground_truths,
             },
             parameters={
                 "score_threshold": score_threshold,
```
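As on the classification side, the downstream effect is renamed keys in the detection `ConfusionMatrix` payload, now two of them. A standalone sketch of the 0.34.0 value structure, following the formats documented in the docstring above (labels and counts invented; the old key names are truncated in this diff):

```python
# Sketch: a detection ConfusionMatrix metric in 0.34.0 with the two renamed
# payload keys; both are subsets of FP/FN respectively, per the docstring.
metric = {
    "type": "ConfusionMatrix",
    "value": {
        "confusion_matrix": {"car": {"car": {"count": 10}}},
        "unmatched_predictions": {"car": {"count": 2}},
        "unmatched_ground_truths": {"truck": {"count": 1}},
    },
    "parameters": {"score_threshold": 0.5},
}
print(metric["value"]["unmatched_predictions"]["car"]["count"])      # 2
print(metric["value"]["unmatched_ground_truths"]["truck"]["count"])  # 1
```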