dataeval 0.72.0__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +10 -11
- dataeval/{_internal/detectors → detectors}/drift/base.py +51 -102
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +9 -8
- dataeval/{_internal/detectors → detectors}/drift/ks.py +11 -10
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +33 -34
- dataeval/{_internal/detectors → detectors}/drift/torch.py +15 -13
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +12 -9
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +47 -45
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +20 -10
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +19 -26
- dataeval/detectors/ood/__init__.py +8 -16
- dataeval/{_internal/detectors → detectors}/ood/ae.py +9 -9
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +10 -30
- dataeval/{_internal/detectors → detectors}/ood/base.py +27 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +27 -23
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +11 -13
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +70 -4
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +10 -8
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +54 -20
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +21 -17
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +31 -28
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +15 -16
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +8 -6
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +66 -40
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +19 -15
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +19 -17
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +12 -10
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +8 -6
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +12 -11
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +14 -13
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -4
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/utils/split_dataset.py +486 -0
- dataeval/utils/tensorflow/__init__.py +9 -7
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +64 -68
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +10 -9
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +18 -22
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +18 -18
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +49 -43
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +12 -141
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +42 -37
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/METADATA +7 -5
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -7
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.0.dist-info/RECORD +0 -80
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.0.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
"""
|
2
2
|
This module contains the implementation of the
|
3
3
|
FR Test Statistic based estimate and the
|
4
|
-
KNN based estimate for the Bayes Error Rate
|
4
|
+
KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
|
5
5
|
|
6
6
|
Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
|
7
7
|
https://arxiv.org/abs/1811.06419
|
@@ -9,6 +9,8 @@ https://arxiv.org/abs/1811.06419
|
|
9
9
|
|
10
10
|
from __future__ import annotations
|
11
11
|
|
12
|
+
__all__ = ["BEROutput", "ber"]
|
13
|
+
|
12
14
|
from dataclasses import dataclass
|
13
15
|
from typing import Literal
|
14
16
|
|
@@ -17,9 +19,9 @@ from numpy.typing import ArrayLike, NDArray
|
|
17
19
|
from scipy.sparse import coo_matrix
|
18
20
|
from scipy.stats import mode
|
19
21
|
|
20
|
-
from dataeval.
|
21
|
-
from dataeval.
|
22
|
-
from dataeval.
|
22
|
+
from dataeval.interop import as_numpy
|
23
|
+
from dataeval.output import OutputMetadata, set_metadata
|
24
|
+
from dataeval.utils.shared import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
|
23
25
|
|
24
26
|
|
25
27
|
@dataclass(frozen=True)
|
@@ -30,7 +32,7 @@ class BEROutput(OutputMetadata):
|
|
30
32
|
Attributes
|
31
33
|
----------
|
32
34
|
ber : float
|
33
|
-
The upper bounds of the Bayes Error Rate
|
35
|
+
The upper bounds of the :term:`Bayes error rate<Bayes Error Rate (BER)>`
|
34
36
|
ber_lower : float
|
35
37
|
The lower bounds of the Bayes Error Rate
|
36
38
|
"""
|
@@ -39,51 +41,55 @@ class BEROutput(OutputMetadata):
|
|
39
41
|
ber_lower: float
|
40
42
|
|
41
43
|
|
42
|
-
def ber_mst(
|
43
|
-
"""Calculates the Bayes Error Rate using a minimum spanning tree
|
44
|
+
def ber_mst(images: NDArray[np.float64], labels: NDArray[np.int_], k: int = 1) -> tuple[float, float]:
|
45
|
+
"""Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree
|
44
46
|
|
45
47
|
Parameters
|
46
48
|
----------
|
47
|
-
|
49
|
+
images : NDArray, shape - (N, ... )
|
48
50
|
n_samples containing n_features
|
49
|
-
|
51
|
+
labels : NDArray, shape - (N, 1)
|
50
52
|
Labels corresponding to each sample
|
53
|
+
k : int
|
54
|
+
Unused
|
51
55
|
|
52
56
|
Returns
|
53
57
|
-------
|
54
58
|
Tuple[float, float]
|
55
59
|
The upper and lower bounds of the bayes error rate
|
56
60
|
"""
|
57
|
-
M, N = get_classes_counts(
|
61
|
+
M, N = get_classes_counts(labels)
|
58
62
|
|
59
|
-
tree = coo_matrix(minimum_spanning_tree(
|
60
|
-
matches = np.sum([
|
63
|
+
tree = coo_matrix(minimum_spanning_tree(images))
|
64
|
+
matches = np.sum([labels[tree.row[i]] != labels[tree.col[i]] for i in range(N - 1)])
|
61
65
|
deltas = matches / (2 * N)
|
62
66
|
upper = 2 * deltas
|
63
67
|
lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
|
64
68
|
return upper, lower
|
65
69
|
|
66
70
|
|
67
|
-
def ber_knn(
|
68
|
-
"""Calculates the Bayes Error Rate using K-nearest neighbors
|
71
|
+
def ber_knn(images: NDArray[np.float64], labels: NDArray[np.int_], k: int) -> tuple[float, float]:
|
72
|
+
"""Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using K-nearest neighbors
|
69
73
|
|
70
74
|
Parameters
|
71
75
|
----------
|
72
|
-
|
76
|
+
images : NDArray, shape - (N, ... )
|
73
77
|
n_samples containing n_features
|
74
|
-
|
78
|
+
labels : NDArray, shape - (N, 1)
|
75
79
|
Labels corresponding to each sample
|
80
|
+
k : int
|
81
|
+
The number of neighbors to find
|
76
82
|
|
77
83
|
Returns
|
78
84
|
-------
|
79
85
|
Tuple[float, float]
|
80
86
|
The upper and lower bounds of the bayes error rate
|
81
87
|
"""
|
82
|
-
M, N = get_classes_counts(
|
83
|
-
nn_indices = compute_neighbors(
|
88
|
+
M, N = get_classes_counts(labels)
|
89
|
+
nn_indices = compute_neighbors(images, images, k=k)
|
84
90
|
nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
|
85
|
-
modal_class = mode(
|
86
|
-
upper = float(np.count_nonzero(modal_class -
|
91
|
+
modal_class = mode(labels[nn_indices], axis=1, keepdims=True).mode.squeeze()
|
92
|
+
upper = float(np.count_nonzero(modal_class - labels) / N)
|
87
93
|
lower = knn_lowerbound(upper, M, k)
|
88
94
|
return upper, lower
|
89
95
|
|
@@ -108,18 +114,15 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
|
|
108
114
|
return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
|
109
115
|
|
110
116
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
@set_metadata("dataeval.metrics")
|
117
|
+
@set_metadata()
|
115
118
|
def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
|
116
119
|
"""
|
117
|
-
An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis
|
120
|
+
An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using FR or KNN test statistic basis
|
118
121
|
|
119
122
|
Parameters
|
120
123
|
----------
|
121
124
|
images : ArrayLike (N, ... )
|
122
|
-
Array of images or image embeddings
|
125
|
+
Array of images or image :term:`embeddings<Embeddings>`
|
123
126
|
labels : ArrayLike (N, 1)
|
124
127
|
Array of labels for each image or image embedding
|
125
128
|
k : int, default 1
|
@@ -146,8 +149,8 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
|
|
146
149
|
>>> ber(images, labels)
|
147
150
|
BEROutput(ber=0.04, ber_lower=0.020416847668728033)
|
148
151
|
"""
|
149
|
-
ber_fn = get_method(
|
152
|
+
ber_fn = get_method({"KNN": ber_knn, "MST": ber_mst}, method)
|
150
153
|
X = as_numpy(images)
|
151
154
|
y = as_numpy(labels)
|
152
|
-
upper, lower = ber_fn(X, y, k)
|
155
|
+
upper, lower = ber_fn(X, y, k)
|
153
156
|
return BEROutput(upper, lower)
|
@@ -1,19 +1,21 @@
|
|
1
1
|
"""
|
2
|
-
This module contains the implementation of HP Divergence
|
2
|
+
This module contains the implementation of HP :term:`divergence<Divergence>`
|
3
3
|
using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
|
4
4
|
"""
|
5
5
|
|
6
6
|
from __future__ import annotations
|
7
7
|
|
8
|
+
__all__ = ["DivergenceOutput", "divergence"]
|
9
|
+
|
8
10
|
from dataclasses import dataclass
|
9
11
|
from typing import Literal
|
10
12
|
|
11
13
|
import numpy as np
|
12
14
|
from numpy.typing import ArrayLike, NDArray
|
13
15
|
|
14
|
-
from dataeval.
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.
|
16
|
+
from dataeval.interop import as_numpy
|
17
|
+
from dataeval.output import OutputMetadata, set_metadata
|
18
|
+
from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
|
17
19
|
|
18
20
|
|
19
21
|
@dataclass(frozen=True)
|
@@ -24,7 +26,7 @@ class DivergenceOutput(OutputMetadata):
|
|
24
26
|
Attributes
|
25
27
|
----------
|
26
28
|
divergence : float
|
27
|
-
Divergence value calculated between 2 datasets ranging between 0.0 and 1.0
|
29
|
+
:term:`Divergence` value calculated between 2 datasets ranging between 0.0 and 1.0
|
28
30
|
errors : int
|
29
31
|
The number of differing edges between the datasets
|
30
32
|
"""
|
@@ -33,7 +35,7 @@ class DivergenceOutput(OutputMetadata):
|
|
33
35
|
errors: int
|
34
36
|
|
35
37
|
|
36
|
-
def divergence_mst(data: NDArray, labels: NDArray) -> int:
|
38
|
+
def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
37
39
|
"""
|
38
40
|
Calculates the estimated label errors based on the minimum spanning tree
|
39
41
|
|
@@ -55,7 +57,7 @@ def divergence_mst(data: NDArray, labels: NDArray) -> int:
|
|
55
57
|
return errors
|
56
58
|
|
57
59
|
|
58
|
-
def divergence_fnn(data: NDArray, labels: NDArray) -> int:
|
60
|
+
def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
59
61
|
"""
|
60
62
|
Calculates the estimated label errors based on their nearest neighbors
|
61
63
|
|
@@ -76,13 +78,10 @@ def divergence_fnn(data: NDArray, labels: NDArray) -> int:
|
|
76
78
|
return errors
|
77
79
|
|
78
80
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
@set_metadata("dataeval.metrics")
|
81
|
+
@set_metadata()
|
83
82
|
def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
|
84
83
|
"""
|
85
|
-
Calculates the divergence and any errors between the datasets
|
84
|
+
Calculates the :term:`divergence<Divergence>` and any errors between the datasets
|
86
85
|
|
87
86
|
Parameters
|
88
87
|
----------
|
@@ -93,7 +92,7 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
|
|
93
92
|
A dataset in an ArrayLike format to compare.
|
94
93
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
95
94
|
method : Literal["MST", "FNN"], default "FNN"
|
96
|
-
Method used to estimate dataset divergence
|
95
|
+
Method used to estimate dataset :term:`divergence<Divergence>`
|
97
96
|
|
98
97
|
Returns
|
99
98
|
-------
|
@@ -124,16 +123,16 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
|
|
124
123
|
Evaluate the datasets:
|
125
124
|
|
126
125
|
>>> divergence(datasetA, datasetB)
|
127
|
-
DivergenceOutput(divergence=0.28, errors=36
|
126
|
+
DivergenceOutput(divergence=0.28, errors=36)
|
128
127
|
"""
|
129
|
-
div_fn = get_method(
|
128
|
+
div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
|
130
129
|
a = as_numpy(data_a)
|
131
130
|
b = as_numpy(data_b)
|
132
131
|
N = a.shape[0]
|
133
132
|
M = b.shape[0]
|
134
133
|
|
135
134
|
stacked_data = np.vstack((a, b))
|
136
|
-
labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
|
135
|
+
labels = np.vstack([np.zeros([N, 1], dtype=np.int_), np.ones([M, 1], dtype=np.int_)])
|
137
136
|
|
138
137
|
errors = div_fn(stacked_data, labels)
|
139
138
|
dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
|
@@ -1,18 +1,20 @@
|
|
1
1
|
"""
|
2
2
|
This module contains the implementation of the
|
3
|
-
FR Test Statistic based estimate for the
|
4
|
-
average precision using empirical mean precision
|
3
|
+
FR Test Statistic based estimate for the :term:`upper-bound
|
4
|
+
average precision<Upper-Bound Average Precision (UAP)>` using empirical mean precision
|
5
5
|
"""
|
6
6
|
|
7
7
|
from __future__ import annotations
|
8
8
|
|
9
|
+
__all__ = ["UAPOutput", "uap"]
|
10
|
+
|
9
11
|
from dataclasses import dataclass
|
10
12
|
|
11
13
|
from numpy.typing import ArrayLike
|
12
14
|
from sklearn.metrics import average_precision_score
|
13
15
|
|
14
|
-
from dataeval.
|
15
|
-
from dataeval.
|
16
|
+
from dataeval.interop import as_numpy
|
17
|
+
from dataeval.output import OutputMetadata, set_metadata
|
16
18
|
|
17
19
|
|
18
20
|
@dataclass(frozen=True)
|
@@ -29,7 +31,7 @@ class UAPOutput(OutputMetadata):
|
|
29
31
|
uap: float
|
30
32
|
|
31
33
|
|
32
|
-
@set_metadata(
|
34
|
+
@set_metadata()
|
33
35
|
def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
|
34
36
|
"""
|
35
37
|
FR Test Statistic based estimate of the empirical mean precision for
|
@@ -38,7 +40,7 @@ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
|
|
38
40
|
Parameters
|
39
41
|
----------
|
40
42
|
labels : ArrayLike
|
41
|
-
A
|
43
|
+
A :term:`NumPy` array of n_samples of class labels with M unique classes.
|
42
44
|
scores : ArrayLike
|
43
45
|
A 2D array of class probabilities per image
|
44
46
|
|
@@ -3,18 +3,18 @@ Statistics metrics calculate a variety of image properties and pixel statistics
|
|
3
3
|
and label statistics against the images and labels of a dataset.
|
4
4
|
"""
|
5
5
|
|
6
|
-
from dataeval.
|
7
|
-
from dataeval.
|
6
|
+
from dataeval.metrics.stats.boxratiostats import boxratiostats
|
7
|
+
from dataeval.metrics.stats.datasetstats import (
|
8
8
|
ChannelStatsOutput,
|
9
9
|
DatasetStatsOutput,
|
10
10
|
channelstats,
|
11
11
|
datasetstats,
|
12
12
|
)
|
13
|
-
from dataeval.
|
14
|
-
from dataeval.
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
13
|
+
from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
|
14
|
+
from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
|
15
|
+
from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
|
16
|
+
from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
|
17
|
+
from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
|
18
18
|
|
19
19
|
__all__ = [
|
20
20
|
"boxratiostats",
|
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
__all__ = []
|
4
|
+
|
3
5
|
import re
|
4
6
|
import warnings
|
5
7
|
from dataclasses import dataclass
|
@@ -12,14 +14,17 @@ import numpy as np
|
|
12
14
|
import tqdm
|
13
15
|
from numpy.typing import ArrayLike, NDArray
|
14
16
|
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
17
|
+
from dataeval.interop import to_numpy_iter
|
18
|
+
from dataeval.output import OutputMetadata
|
19
|
+
from dataeval.utils.image import normalize_image_shape, rescale
|
18
20
|
|
19
21
|
DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
|
20
22
|
SOURCE_INDEX = "source_index"
|
21
23
|
BOX_COUNT = "box_count"
|
22
24
|
|
25
|
+
# TODO: Replace with global config
|
26
|
+
DEFAULT_PROCESSES: int | None = None
|
27
|
+
|
23
28
|
OptionalRange = Optional[Union[int, Iterable[int]]]
|
24
29
|
|
25
30
|
|
@@ -29,6 +34,19 @@ def matches(index: int | None, opt_range: OptionalRange) -> bool:
|
|
29
34
|
return index in opt_range if isinstance(opt_range, Iterable) else index == opt_range
|
30
35
|
|
31
36
|
|
37
|
+
def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
38
|
+
"""
|
39
|
+
Normalizes the bounding box shape into (N,4).
|
40
|
+
"""
|
41
|
+
ndim = bounding_box.ndim
|
42
|
+
if ndim == 1:
|
43
|
+
return np.expand_dims(bounding_box, axis=0)
|
44
|
+
elif ndim > 2:
|
45
|
+
raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
|
46
|
+
else:
|
47
|
+
return bounding_box
|
48
|
+
|
49
|
+
|
32
50
|
class SourceIndex(NamedTuple):
|
33
51
|
"""
|
34
52
|
Attributes
|
@@ -101,39 +119,39 @@ TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput, covariant=True)
|
|
101
119
|
class StatsProcessor(Generic[TStatsOutput]):
|
102
120
|
output_class: type[TStatsOutput]
|
103
121
|
cache_keys: list[str] = []
|
104
|
-
image_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
|
105
|
-
channel_function_map: dict[str, Callable[[StatsProcessor], Any]] = {}
|
122
|
+
image_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
|
123
|
+
channel_function_map: dict[str, Callable[[StatsProcessor[TStatsOutput]], Any]] = {}
|
106
124
|
|
107
|
-
def __init__(self, image: NDArray, box: NDArray | None, per_channel: bool):
|
125
|
+
def __init__(self, image: NDArray[Any], box: NDArray[Any] | None, per_channel: bool) -> None:
|
108
126
|
self.raw = image
|
109
|
-
self.width = image.shape[-1]
|
110
|
-
self.height = image.shape[-2]
|
111
|
-
self.box = np.array([0, 0, self.width, self.height]) if box is None else box
|
112
|
-
self.
|
127
|
+
self.width: int = image.shape[-1]
|
128
|
+
self.height: int = image.shape[-2]
|
129
|
+
self.box: NDArray[Any] = np.array([0, 0, self.width, self.height]) if box is None else box
|
130
|
+
self._per_channel = per_channel
|
113
131
|
self._image = None
|
114
132
|
self._shape = None
|
115
133
|
self._scaled = None
|
116
|
-
self.
|
117
|
-
self.
|
118
|
-
self.
|
134
|
+
self._cache = {}
|
135
|
+
self._fn_map = self.channel_function_map if per_channel else self.image_function_map
|
136
|
+
self._is_valid_slice = box is None or bool(
|
119
137
|
box[0] >= 0 and box[1] >= 0 and box[2] <= image.shape[-1] and box[3] <= image.shape[-2]
|
120
138
|
)
|
121
139
|
|
122
|
-
def get(self, fn_key: str) -> NDArray:
|
140
|
+
def get(self, fn_key: str) -> NDArray[Any]:
|
123
141
|
if fn_key in self.cache_keys:
|
124
|
-
if fn_key not in self.
|
125
|
-
self.
|
126
|
-
return self.
|
142
|
+
if fn_key not in self._cache:
|
143
|
+
self._cache[fn_key] = self._fn_map[fn_key](self)
|
144
|
+
return self._cache[fn_key]
|
127
145
|
else:
|
128
|
-
return self.
|
146
|
+
return self._fn_map[fn_key](self)
|
129
147
|
|
130
|
-
def process(self) -> dict:
|
131
|
-
return {k: self.
|
148
|
+
def process(self) -> dict[str, Any]:
|
149
|
+
return {k: self._fn_map[k](self) for k in self._fn_map}
|
132
150
|
|
133
151
|
@property
|
134
|
-
def image(self) -> NDArray:
|
152
|
+
def image(self) -> NDArray[Any]:
|
135
153
|
if self._image is None:
|
136
|
-
if self.
|
154
|
+
if self._is_valid_slice:
|
137
155
|
norm = normalize_image_shape(self.raw)
|
138
156
|
self._image = norm[:, self.box[1] : self.box[3], self.box[0] : self.box[2]]
|
139
157
|
else:
|
@@ -141,16 +159,16 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
141
159
|
return self._image
|
142
160
|
|
143
161
|
@property
|
144
|
-
def shape(self) -> tuple:
|
162
|
+
def shape(self) -> tuple[int, ...]:
|
145
163
|
if self._shape is None:
|
146
164
|
self._shape = self.image.shape
|
147
165
|
return self._shape
|
148
166
|
|
149
167
|
@property
|
150
|
-
def scaled(self) -> NDArray:
|
168
|
+
def scaled(self) -> NDArray[Any]:
|
151
169
|
if self._scaled is None:
|
152
170
|
self._scaled = rescale(self.image)
|
153
|
-
if self.
|
171
|
+
if self._per_channel:
|
154
172
|
self._scaled = self._scaled.reshape(self.image.shape[0], -1)
|
155
173
|
return self._scaled
|
156
174
|
|
@@ -175,25 +193,25 @@ class StatsProcessorOutput(NamedTuple):
|
|
175
193
|
results: list[dict[str, Any]]
|
176
194
|
source_indices: list[SourceIndex]
|
177
195
|
box_counts: list[int]
|
178
|
-
warnings_list: list[tuple[int, int, NDArray, tuple[int, ...]]]
|
196
|
+
warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]]
|
179
197
|
|
180
198
|
|
181
199
|
def process_stats(
|
182
200
|
i: int,
|
183
|
-
image_boxes: tuple[NDArray, NDArray | None],
|
201
|
+
image_boxes: tuple[NDArray[Any], NDArray[Any] | None],
|
184
202
|
per_channel: bool,
|
185
|
-
stats_processor_cls: Iterable[type[StatsProcessor]],
|
203
|
+
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
186
204
|
) -> StatsProcessorOutput:
|
187
205
|
image, boxes = image_boxes
|
188
206
|
results_list: list[dict[str, Any]] = []
|
189
207
|
source_indices: list[SourceIndex] = []
|
190
208
|
box_counts: list[int] = []
|
191
|
-
warnings_list: list[tuple[int, int, NDArray, tuple[int, ...]]] = []
|
209
|
+
warnings_list: list[tuple[int, int, NDArray[np.float64], tuple[int, ...]]] = []
|
192
210
|
nboxes = [None] if boxes is None else normalize_box_shape(boxes)
|
193
211
|
for i_b, box in enumerate(nboxes):
|
194
212
|
i_b = None if box is None else i_b
|
195
213
|
processor_list = [p(image, box, per_channel) for p in stats_processor_cls]
|
196
|
-
if any(not p.
|
214
|
+
if any(not p._is_valid_slice for p in processor_list) and i_b is not None and box is not None:
|
197
215
|
warnings_list.append((i, i_b, box, image.shape))
|
198
216
|
results_list.append({k: v for p in processor_list for k, v in p.process().items()})
|
199
217
|
if per_channel:
|
@@ -204,7 +222,11 @@ def process_stats(
|
|
204
222
|
return StatsProcessorOutput(results_list, source_indices, box_counts, warnings_list)
|
205
223
|
|
206
224
|
|
207
|
-
def process_stats_unpack(
|
225
|
+
def process_stats_unpack(
|
226
|
+
args: tuple[int, tuple[NDArray[Any], NDArray[Any] | None]],
|
227
|
+
per_channel: bool,
|
228
|
+
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
229
|
+
) -> StatsProcessorOutput:
|
208
230
|
return process_stats(*args, per_channel=per_channel, stats_processor_cls=stats_processor_cls)
|
209
231
|
|
210
232
|
|
@@ -215,7 +237,7 @@ def run_stats(
|
|
215
237
|
stats_processor_cls: Iterable[type[StatsProcessor[TStatsOutput]]],
|
216
238
|
) -> list[TStatsOutput]:
|
217
239
|
"""
|
218
|
-
Compute specified statistics on a set of images.
|
240
|
+
Compute specified :term:`statistics<Statistics>` on a set of images.
|
219
241
|
|
220
242
|
This function applies a set of statistical operations to each image in the input iterable,
|
221
243
|
based on the specified output class. The function determines which statistics to apply
|
@@ -225,7 +247,7 @@ def run_stats(
|
|
225
247
|
----------
|
226
248
|
images : Iterable[ArrayLike]
|
227
249
|
An iterable of images (e.g., list of arrays), where each image is represented as an
|
228
|
-
array-like structure (e.g., NumPy arrays).
|
250
|
+
array-like structure (e.g., :term:`NumPy` arrays).
|
229
251
|
bboxes : Iterable[ArrayLike]
|
230
252
|
An iterable of bounding boxes (e.g. list of arrays) where each bounding box is represented
|
231
253
|
as an array-like structure in the format of (X0, Y0, X1, Y1). The length of the bounding boxes
|
@@ -234,24 +256,28 @@ def run_stats(
|
|
234
256
|
A flag which determines if the stats should be evaluated on a per-channel basis or not.
|
235
257
|
stats_processor_cls : Iterable[type[StatsProcessor]]
|
236
258
|
An iterable of stats processor classes that calculate stats and return output classes.
|
259
|
+
processes : int | None, default None
|
260
|
+
Number of processes to use, defaults to None which uses all available CPU cores.
|
237
261
|
|
238
262
|
Returns
|
239
263
|
-------
|
240
|
-
|
241
|
-
A
|
264
|
+
dict[str, NDArray]
|
265
|
+
A dictionary containing the computed statistics for each image.
|
266
|
+
The dictionary keys correspond to the names of the statistics, and the values are :term:`NumPy` arrays
|
267
|
+
with the results of the computations.
|
242
268
|
|
243
269
|
Note
|
244
270
|
----
|
245
271
|
- The function performs image normalization (rescaling the image values)
|
246
272
|
before applying some of the statistics.
|
247
|
-
- Pixel-level statistics (e.g., brightness
|
273
|
+
- Pixel-level statistics (e.g., :term:`brightness<Brightness>`, entropy) are computed after
|
248
274
|
rescaling and, optionally, flattening the images.
|
249
275
|
- For statistics like histograms and entropy, intermediate results may
|
250
276
|
be reused to avoid redundant computation.
|
251
277
|
"""
|
252
|
-
results_list: list[dict[str, NDArray]] = []
|
253
|
-
source_index = []
|
254
|
-
box_count = []
|
278
|
+
results_list: list[dict[str, NDArray[np.float64]]] = []
|
279
|
+
source_index: list[SourceIndex] = []
|
280
|
+
box_count: list[int] = []
|
255
281
|
bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
|
256
282
|
|
257
283
|
warning_list = []
|
@@ -259,7 +285,7 @@ def run_stats(
|
|
259
285
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
260
286
|
|
261
287
|
# TODO: Introduce global controls for CPU job parallelism and GPU configurations
|
262
|
-
with Pool(
|
288
|
+
with Pool(processes=DEFAULT_PROCESSES) as p:
|
263
289
|
for r in tqdm.tqdm(
|
264
290
|
p.imap(
|
265
291
|
partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
|
@@ -1,14 +1,16 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
__all__ = ["boxratiostats"]
|
4
|
+
|
3
5
|
import copy
|
4
|
-
from typing import Callable, Generic, TypeVar, cast
|
6
|
+
from typing import Any, Callable, Generic, TypeVar, cast
|
5
7
|
|
6
8
|
import numpy as np
|
7
9
|
from numpy.typing import NDArray
|
8
10
|
|
9
|
-
from dataeval.
|
10
|
-
from dataeval.
|
11
|
-
from dataeval.
|
11
|
+
from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
|
12
|
+
from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
|
13
|
+
from dataeval.output import set_metadata
|
12
14
|
|
13
15
|
TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
|
14
16
|
ArraySlice = tuple[int, int]
|
@@ -39,14 +41,16 @@ class BoxImageStatsOutputSlice(Generic[TStatOutput]):
|
|
39
41
|
self.img = self.StatSlicer(img_stats, img_slice)
|
40
42
|
|
41
43
|
|
42
|
-
RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[
|
43
|
-
DimensionStatsOutput:
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
44
|
+
RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
|
45
|
+
DimensionStatsOutput: dict[str, Callable[[BoxImageStatsOutputSlice[DimensionStatsOutput]], NDArray[Any]]](
|
46
|
+
{
|
47
|
+
"left": lambda x: x.box["left"] / x.img["width"],
|
48
|
+
"top": lambda x: x.box["top"] / x.img["height"],
|
49
|
+
"channels": lambda x: x.box["channels"],
|
50
|
+
"depth": lambda x: x.box["depth"],
|
51
|
+
"distance": lambda x: x.box["distance"],
|
52
|
+
}
|
53
|
+
)
|
50
54
|
}
|
51
55
|
|
52
56
|
|
@@ -60,7 +64,7 @@ def get_index_map(stats: BaseStatsOutput) -> list[int]:
|
|
60
64
|
return index_map
|
61
65
|
|
62
66
|
|
63
|
-
def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray:
|
67
|
+
def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsOutput) -> NDArray[np.float64]:
|
64
68
|
if not hasattr(box_stats, key) or not hasattr(img_stats, key):
|
65
69
|
raise KeyError("Invalid key for provided stats output object.")
|
66
70
|
|
@@ -92,13 +96,13 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
|
|
92
96
|
return out_stats
|
93
97
|
|
94
98
|
|
95
|
-
@set_metadata(
|
99
|
+
@set_metadata()
|
96
100
|
def boxratiostats(
|
97
101
|
boxstats: TStatOutput,
|
98
102
|
imgstats: TStatOutput,
|
99
103
|
) -> TStatOutput:
|
100
104
|
"""
|
101
|
-
Calculates ratio statistics of box outputs over image outputs
|
105
|
+
Calculates ratio :term:`statistics<Statistics>` of box outputs over image outputs
|
102
106
|
|
103
107
|
Parameters
|
104
108
|
----------
|