dataeval 0.76.1__py3-none-any.whl → 0.81.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/{output.py → _output.py} +14 -0
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +41 -30
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +0 -3
- dataeval/detectors/linters/duplicates.py +17 -8
- dataeval/detectors/linters/outliers.py +23 -14
- dataeval/detectors/ood/ae.py +29 -8
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/metadata_ks_compare.py +1 -1
- dataeval/detectors/ood/mixin.py +20 -5
- dataeval/detectors/ood/output.py +1 -1
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +5 -0
- dataeval/metadata/_ood.py +238 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +5 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
- dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
- dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
- dataeval/metrics/bias/{parity.py → _parity.py} +89 -61
- dataeval/metrics/estimators/__init__.py +14 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
- dataeval/metrics/estimators/_clusterer.py +104 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/metrics/stats/{base.py → _base.py} +52 -16
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
- dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
- dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
- dataeval/metrics/stats/{labelstats.py → _labelstats.py} +4 -4
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
- dataeval/typing.py +54 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +18 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +4 -4
- dataeval/utils/data/__init__.py +22 -0
- dataeval/utils/data/_embeddings.py +105 -0
- dataeval/utils/data/_images.py +65 -0
- dataeval/utils/data/_metadata.py +352 -0
- dataeval/utils/data/_selection.py +119 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
- dataeval/utils/data/_targets.py +73 -0
- dataeval/utils/data/_types.py +58 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +60 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/sufficiency.py +10 -9
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/METADATA +4 -1
- dataeval-0.81.0.dist-info/RECORD +94 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
@@ -2,8 +2,18 @@
|
|
2
2
|
Estimators calculate performance bounds and the statistical distance between datasets.
|
3
3
|
"""
|
4
4
|
|
5
|
-
__all__ = [
|
5
|
+
__all__ = [
|
6
|
+
"ber",
|
7
|
+
"clusterer",
|
8
|
+
"divergence",
|
9
|
+
"uap",
|
10
|
+
"BEROutput",
|
11
|
+
"ClustererOutput",
|
12
|
+
"DivergenceOutput",
|
13
|
+
"UAPOutput",
|
14
|
+
]
|
6
15
|
|
7
|
-
from dataeval.metrics.estimators.
|
8
|
-
from dataeval.metrics.estimators.
|
9
|
-
from dataeval.metrics.estimators.
|
16
|
+
from dataeval.metrics.estimators._ber import BEROutput, ber
|
17
|
+
from dataeval.metrics.estimators._clusterer import ClustererOutput, clusterer
|
18
|
+
from dataeval.metrics.estimators._divergence import DivergenceOutput, divergence
|
19
|
+
from dataeval.metrics.estimators._uap import UAPOutput, uap
|
@@ -16,19 +16,21 @@ from dataclasses import dataclass
|
|
16
16
|
from typing import Literal
|
17
17
|
|
18
18
|
import numpy as np
|
19
|
-
from numpy.typing import
|
19
|
+
from numpy.typing import NDArray
|
20
20
|
from scipy.sparse import coo_matrix
|
21
21
|
from scipy.stats import mode
|
22
22
|
|
23
|
-
from dataeval.
|
24
|
-
from dataeval.
|
25
|
-
from dataeval.utils.
|
23
|
+
from dataeval._output import Output, set_metadata
|
24
|
+
from dataeval.typing import ArrayLike
|
25
|
+
from dataeval.utils._array import as_numpy, ensure_embeddings
|
26
|
+
from dataeval.utils._method import get_method
|
27
|
+
from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
|
26
28
|
|
27
29
|
|
28
30
|
@dataclass(frozen=True)
|
29
31
|
class BEROutput(Output):
|
30
32
|
"""
|
31
|
-
Output class for :func
|
33
|
+
Output class for :func:`.ber` estimator metric.
|
32
34
|
|
33
35
|
Attributes
|
34
36
|
----------
|
@@ -116,18 +118,21 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
|
|
116
118
|
return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
|
117
119
|
|
118
120
|
|
121
|
+
_BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
|
122
|
+
|
123
|
+
|
119
124
|
@set_metadata
|
120
|
-
def ber(
|
125
|
+
def ber(embeddings: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
|
121
126
|
"""
|
122
127
|
An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
|
123
128
|
using FR or KNN test statistic basis.
|
124
129
|
|
125
130
|
Parameters
|
126
131
|
----------
|
127
|
-
|
128
|
-
Array of
|
132
|
+
embeddings : ArrayLike (N, ... )
|
133
|
+
Array of image :term:`embeddings<Embeddings>`
|
129
134
|
labels : ArrayLike (N, 1)
|
130
|
-
Array of labels for each image
|
135
|
+
Array of labels for each image
|
131
136
|
k : int, default 1
|
132
137
|
Number of nearest neighbors for KNN estimator -- ignored by MST estimator
|
133
138
|
method : Literal["KNN", "MST"], default "KNN"
|
@@ -152,8 +157,34 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
|
|
152
157
|
>>> ber(images, labels)
|
153
158
|
BEROutput(ber=0.04, ber_lower=0.020416847668728033)
|
154
159
|
"""
|
155
|
-
ber_fn = get_method(
|
156
|
-
X =
|
160
|
+
ber_fn = get_method(_BER_FN_MAP, method)
|
161
|
+
X = ensure_embeddings(embeddings, dtype=np.float64)
|
157
162
|
y = as_numpy(labels)
|
158
163
|
upper, lower = ber_fn(X, y, k)
|
159
164
|
return BEROutput(upper, lower)
|
165
|
+
|
166
|
+
|
167
|
+
def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
    """
    Counts the distinct classes and the total number of labels in an array

    Parameters
    ----------
    labels : NDArray
        Numpy labels array

    Returns
    -------
    tuple[int, int]
        Number of unique classes and total number of labels

    Raises
    ------
    ValueError
        If the number of unique classes is less than 2
    """
    unique_labels, per_class = np.unique(labels, return_counts=True)
    n_classes = unique_labels.size
    if n_classes < 2:
        raise ValueError("Label vector contains less than 2 classes!")
    # The per-class counts add up to the total number of labels supplied.
    return n_classes, int(per_class.sum())
|
@@ -0,0 +1,104 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
from numpy.typing import NDArray
|
9
|
+
|
10
|
+
from dataeval._output import Output
|
11
|
+
from dataeval.typing import ArrayLike
|
12
|
+
from dataeval.utils._array import as_numpy
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass(frozen=True)
class ClustererOutput(Output):
    """
    Output class for :func:`.clusterer`.

    Attributes
    ----------
    clusters : NDArray[int]
        Assigned clusters; a value of -1 is reported as an outlier by
        :meth:`find_outliers`
    mst : NDArray[float]
        The minimum spanning tree of the data
    linkage_tree : NDArray[float]
        The linkage array of the data
    condensed_tree : NDArray[float]
        The condensed tree of the data
    membership_strengths : NDArray[float]
        The strength of the data point belonging to the assigned cluster
    """

    clusters: NDArray[np.int_]
    mst: NDArray[np.double]
    linkage_tree: NDArray[np.double]
    condensed_tree: NDArray[np.double]
    membership_strengths: NDArray[np.double]

    def find_outliers(self) -> NDArray[np.int_]:
        """
        Retrieves the outliers, i.e. the samples whose assigned cluster is -1

        Returns
        -------
        NDArray[int]
            A numpy array of the outlier indices
        """
        unassigned = self.clusters == -1
        # Single-argument ``np.where`` is shorthand for ``nonzero``; keep the first axis.
        return np.where(unassigned)[0]

    def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
        """
        Finds duplicate and near duplicate data from the minimum spanning tree
        and the cluster assignments

        Returns
        -------
        Tuple[List[List[int]], List[List[int]]]
            The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
        """
        # Delay load numba compiled functions
        from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find

        exact_links, near_links = compare_links_to_cluster_std(self.mst, self.clusters)
        exact_groups = sorted_union_find(exact_links)
        near_groups = sorted_union_find(near_links)

        # Convert every index to a builtin int so the result holds plain Python values.
        exact, near = ([[int(idx) for idx in group] for group in groups] for groups in (exact_groups, near_groups))
        return exact, near
|
69
|
+
|
70
|
+
|
71
|
+
def clusterer(data: ArrayLike) -> ClustererOutput:
    """
    Runs hierarchical clustering on the flattened data and returns the
    resulting clustering information.

    Parameters
    ----------
    data : ArrayLike, shape - (N, ...)
        A dataset in an ArrayLike format. Function expects the data to have 2
        or more dimensions which will flatten to (N, P), where N is the number
        of observations in a P-dimensional space.

    Returns
    -------
    :class:`.ClustererOutput`

    Note
    ----
    The clusterer works best when the length of the feature dimension, P, is
    less than 500. If flattening a CxHxW image results in a dimension larger
    than 500, then it is recommended to reduce the dimensions.

    Example
    -------
    >>> clusterer(clusterer_images).clusters
    array([ 2, 0, 0, 0, 0, 0, 4, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
           4, 2, 0, 0, 1, 2, 0, 1, 3, 0, 3, 3, 4, 0, 0, 3, 0,
           3, -1, 0, 0, 2, 4, 3, 4, 0, 1, 0, -1, 3, 0, 0, 0])
    """
    # Delay load numba compiled functions
    from dataeval.utils._clusterer import cluster

    result = cluster(data)
    return ClustererOutput(
        clusters=result.clusters,
        mst=result.mst,
        linkage_tree=result.linkage_tree,
        # Only the condensed tree is passed through ``as_numpy`` before being stored.
        condensed_tree=as_numpy(result.condensed_tree),
        membership_strengths=result.membership_strengths,
    )
|
@@ -11,17 +11,19 @@ from dataclasses import dataclass
|
|
11
11
|
from typing import Literal
|
12
12
|
|
13
13
|
import numpy as np
|
14
|
-
from numpy.typing import
|
14
|
+
from numpy.typing import NDArray
|
15
15
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
18
|
-
from dataeval.utils.
|
16
|
+
from dataeval._output import Output, set_metadata
|
17
|
+
from dataeval.typing import ArrayLike
|
18
|
+
from dataeval.utils._array import ensure_embeddings
|
19
|
+
from dataeval.utils._method import get_method
|
20
|
+
from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
|
19
21
|
|
20
22
|
|
21
23
|
@dataclass(frozen=True)
|
22
24
|
class DivergenceOutput(Output):
|
23
25
|
"""
|
24
|
-
Output class for :func
|
26
|
+
Output class for :func:`.divergence` estimator metric.
|
25
27
|
|
26
28
|
Attributes
|
27
29
|
----------
|
@@ -78,18 +80,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
|
78
80
|
return errors
|
79
81
|
|
80
82
|
|
83
|
+
_DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
|
84
|
+
|
85
|
+
|
81
86
|
@set_metadata
|
82
|
-
def divergence(
|
87
|
+
def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
|
83
88
|
"""
|
84
89
|
Calculates the :term:`divergence` and any errors between the datasets.
|
85
90
|
|
86
91
|
Parameters
|
87
92
|
----------
|
88
|
-
|
89
|
-
|
93
|
+
emb_a : ArrayLike, shape - (N, P)
|
94
|
+
Image embeddings in an ArrayLike format to compare.
|
90
95
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
91
|
-
|
92
|
-
|
96
|
+
emb_b : ArrayLike, shape - (N, P)
|
97
|
+
Image embeddings in an ArrayLike format to compare.
|
93
98
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
94
99
|
method : Literal["MST", "FNN"], default "FNN"
|
95
100
|
Method used to estimate dataset :term:`divergence<Divergence>`
|
@@ -125,9 +130,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
|
|
125
130
|
>>> divergence(datasetA, datasetB)
|
126
131
|
DivergenceOutput(divergence=0.28, errors=36)
|
127
132
|
"""
|
128
|
-
div_fn = get_method(
|
129
|
-
a =
|
130
|
-
b =
|
133
|
+
div_fn = get_method(_DIVERGENCE_FN_MAP, method)
|
134
|
+
a = ensure_embeddings(emb_a, dtype=np.float64)
|
135
|
+
b = ensure_embeddings(emb_b, dtype=np.float64)
|
131
136
|
N = a.shape[0]
|
132
137
|
M = b.shape[0]
|
133
138
|
|
@@ -10,17 +10,17 @@ __all__ = []
|
|
10
10
|
|
11
11
|
from dataclasses import dataclass
|
12
12
|
|
13
|
-
from numpy.typing import ArrayLike
|
14
13
|
from sklearn.metrics import average_precision_score
|
15
14
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
15
|
+
from dataeval._output import Output, set_metadata
|
16
|
+
from dataeval.typing import ArrayLike
|
17
|
+
from dataeval.utils._array import as_numpy
|
18
18
|
|
19
19
|
|
20
20
|
@dataclass(frozen=True)
|
21
21
|
class UAPOutput(Output):
|
22
22
|
"""
|
23
|
-
Output class for :func
|
23
|
+
Output class for :func:`.uap` estimator metric.
|
24
24
|
|
25
25
|
Attributes
|
26
26
|
----------
|
@@ -21,15 +21,15 @@ __all__ = [
|
|
21
21
|
"visualstats",
|
22
22
|
]
|
23
23
|
|
24
|
-
from dataeval.metrics.stats.
|
25
|
-
from dataeval.metrics.stats.
|
24
|
+
from dataeval.metrics.stats._boxratiostats import boxratiostats
|
25
|
+
from dataeval.metrics.stats._datasetstats import (
|
26
26
|
ChannelStatsOutput,
|
27
27
|
DatasetStatsOutput,
|
28
28
|
channelstats,
|
29
29
|
datasetstats,
|
30
30
|
)
|
31
|
-
from dataeval.metrics.stats.
|
32
|
-
from dataeval.metrics.stats.
|
33
|
-
from dataeval.metrics.stats.
|
34
|
-
from dataeval.metrics.stats.
|
35
|
-
from dataeval.metrics.stats.
|
31
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, dimensionstats
|
32
|
+
from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
|
33
|
+
from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
|
34
|
+
from dataeval.metrics.stats._pixelstats import PixelStatsOutput, pixelstats
|
35
|
+
from dataeval.metrics.stats._visualstats import VisualStatsOutput, visualstats
|
@@ -1,32 +1,31 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from dataeval.utils.plot import histogram_plot
|
4
|
-
|
5
3
|
__all__ = []
|
6
4
|
|
7
5
|
import re
|
8
6
|
import warnings
|
7
|
+
from copy import deepcopy
|
9
8
|
from dataclasses import dataclass
|
10
9
|
from functools import partial
|
11
10
|
from itertools import repeat
|
12
11
|
from multiprocessing import Pool
|
13
|
-
from typing import Any, Callable, Generic, Iterable,
|
12
|
+
from typing import Any, Callable, Generic, Iterable, Optional, Sequence, Sized, TypeVar, Union
|
14
13
|
|
15
14
|
import numpy as np
|
16
15
|
import tqdm
|
17
|
-
from numpy.typing import
|
16
|
+
from numpy.typing import NDArray
|
18
17
|
|
19
|
-
from dataeval.
|
20
|
-
from dataeval.
|
21
|
-
from dataeval.
|
18
|
+
from dataeval._output import Output
|
19
|
+
from dataeval.config import get_max_processes
|
20
|
+
from dataeval.typing import ArrayLike
|
21
|
+
from dataeval.utils._array import to_numpy_iter
|
22
|
+
from dataeval.utils._image import normalize_image_shape, rescale
|
23
|
+
from dataeval.utils._plot import histogram_plot
|
22
24
|
|
23
25
|
DTYPE_REGEX = re.compile(r"NDArray\[np\.(.*?)\]")
|
24
26
|
SOURCE_INDEX = "source_index"
|
25
27
|
BOX_COUNT = "box_count"
|
26
28
|
|
27
|
-
# TODO: Replace with global config
|
28
|
-
DEFAULT_PROCESSES: int | None = None
|
29
|
-
|
30
29
|
OptionalRange = Optional[Union[int, Iterable[int]]]
|
31
30
|
|
32
31
|
|
@@ -49,7 +48,8 @@ def normalize_box_shape(bounding_box: NDArray[Any]) -> NDArray[Any]:
|
|
49
48
|
return bounding_box
|
50
49
|
|
51
50
|
|
52
|
-
|
51
|
+
@dataclass
|
52
|
+
class SourceIndex:
|
53
53
|
"""
|
54
54
|
Attributes
|
55
55
|
----------
|
@@ -205,7 +205,8 @@ class StatsProcessor(Generic[TStatsOutput]):
|
|
205
205
|
return cls.output_class(**output, source_index=source_index, box_count=np.asarray(box_count, dtype=np.uint16))
|
206
206
|
|
207
207
|
|
208
|
-
|
208
|
+
@dataclass
|
209
|
+
class StatsProcessorOutput:
|
209
210
|
results: list[dict[str, Any]]
|
210
211
|
source_indices: list[SourceIndex]
|
211
212
|
box_counts: list[int]
|
@@ -272,8 +273,6 @@ def run_stats(
|
|
272
273
|
A flag which determines if the stats should be evaluated on a per-channel basis or not.
|
273
274
|
stats_processor_cls : Iterable[type[StatsProcessor]]
|
274
275
|
An iterable of stats processor classes that calculate stats and return output classes.
|
275
|
-
processes : int | None, default None
|
276
|
-
Number of processes to use, defaults to None which uses all available CPU cores.
|
277
276
|
|
278
277
|
Returns
|
279
278
|
-------
|
@@ -297,11 +296,11 @@ def run_stats(
|
|
297
296
|
bbox_iter = repeat(None) if bboxes is None else to_numpy_iter(bboxes)
|
298
297
|
|
299
298
|
warning_list = []
|
300
|
-
total_for_status =
|
299
|
+
total_for_status = len(images) if isinstance(images, Sized) else None
|
301
300
|
stats_processor_cls = stats_processor_cls if isinstance(stats_processor_cls, Iterable) else [stats_processor_cls]
|
302
301
|
|
303
302
|
# TODO: Introduce global controls for CPU job parallelism and GPU configurations
|
304
|
-
with Pool(processes=
|
303
|
+
with Pool(processes=get_max_processes()) as p:
|
305
304
|
for r in tqdm.tqdm(
|
306
305
|
p.imap(
|
307
306
|
partial(process_stats_unpack, per_channel=per_channel, stats_processor_cls=stats_processor_cls),
|
@@ -330,3 +329,40 @@ def run_stats(
|
|
330
329
|
|
331
330
|
outputs = [s.convert_output(output, source_index, box_count) for s in stats_processor_cls]
|
332
331
|
return outputs
|
332
|
+
|
333
|
+
|
334
|
+
def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
    """
    Concatenates two stats outputs of the same type into a new output

    Every field returned by ``a.dict()`` is joined with the field of the same
    name from ``b.dict()``: list fields are extended, all other fields are
    joined with ``np.concatenate``.

    Parameters
    ----------
    a : TStatsOutput
        The output whose values come first in the result
    b : TStatsOutput
        The output whose values are appended; must be exactly the same type as ``a``

    Returns
    -------
    TStatsOutput
        A new instance of ``type(a)`` built from the combined fields

    Raises
    ------
    TypeError
        If ``a`` and ``b`` are not of the same type
    """
    if type(a) is not type(b):
        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")

    # Deep copy so that extending the list fields never mutates ``a``.
    sum_dict = deepcopy(a.dict())
    # Build the dictionary view of ``b`` once instead of once per field.
    b_dict = b.dict()

    for k in sum_dict:
        if isinstance(sum_dict[k], list):
            sum_dict[k].extend(b_dict[k])
        else:
            # NOTE(review): non-list fields are assumed to be array-like - confirm
            sum_dict[k] = np.concatenate((sum_dict[k], b_dict[k]))

    return type(a)(**sum_dict)
|
347
|
+
|
348
|
+
|
349
|
+
def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
    """
    Merges a sequence of stats outputs into one and records where each ends

    Parameters
    ----------
    stats : Sequence[TStatsOutput]
        The stats outputs to merge, in order

    Returns
    -------
    tuple[TStatsOutput, list[int]]
        The merged output and the running total of ``len()`` after each input

    Raises
    ------
    TypeError
        If ``stats`` is empty
    """
    combined = None
    boundaries = []
    running_total = 0
    for item in stats:
        if combined is None:
            # The first item seeds the result as-is; later items are added onto it.
            combined = item
        else:
            combined = add_stats(combined, item)
        running_total += len(item)
        boundaries.append(running_total)
    if combined is None:
        raise TypeError("Cannot combine empty sequence of stats.")
    return combined, boundaries
|
360
|
+
|
361
|
+
|
362
|
+
def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
    """
    Locates the step that contains an index and the offset within that step

    Parameters
    ----------
    idx : int
        The index to look up
    dataset_steps : list[int]
        End boundary of each step, e.g. the running totals returned by
        ``combine_stats``

    Returns
    -------
    tuple[int, int]
        The position of the first step whose boundary is greater than ``idx``
        and ``idx`` minus the previous boundary, or ``(-1, idx)`` if no
        boundary is greater than ``idx``
    """
    # Pair each boundary with the one before it (0 for the first step).
    for position, (start, stop) in enumerate(zip([0, *dataset_steps], dataset_steps)):
        if idx < stop:
            return position, idx - start
    return -1, idx
|
@@ -8,9 +8,9 @@ from typing import Any, Callable, Generic, TypeVar, cast
|
|
8
8
|
import numpy as np
|
9
9
|
from numpy.typing import NDArray
|
10
10
|
|
11
|
-
from dataeval.
|
12
|
-
from dataeval.metrics.stats.
|
13
|
-
from dataeval.
|
11
|
+
from dataeval._output import set_metadata
|
12
|
+
from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, BaseStatsOutput
|
13
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
|
14
14
|
|
15
15
|
TStatOutput = TypeVar("TStatOutput", bound=BaseStatsOutput, contravariant=True)
|
16
16
|
ArraySlice = tuple[int, int]
|
@@ -50,7 +50,7 @@ RATIOSTATS_OVERRIDE_MAP: dict[type, dict[str, Callable[..., NDArray[Any]]]] = {
|
|
50
50
|
"depth": lambda x: x.box["depth"],
|
51
51
|
"distance": lambda x: x.box["distance"],
|
52
52
|
}
|
53
|
-
)
|
53
|
+
),
|
54
54
|
}
|
55
55
|
|
56
56
|
|
@@ -87,11 +87,8 @@ def calculate_ratios(key: str, box_stats: BaseStatsOutput, img_stats: BaseStatsO
|
|
87
87
|
stats = BoxImageStatsOutputSlice(box_stats, (box_i, box_j), img_stats, (img_i, img_j))
|
88
88
|
out_type = type(box_stats)
|
89
89
|
use_override = out_type in RATIOSTATS_OVERRIDE_MAP and key in RATIOSTATS_OVERRIDE_MAP[out_type]
|
90
|
-
|
91
|
-
RATIOSTATS_OVERRIDE_MAP[out_type][key](stats)
|
92
|
-
if use_override
|
93
|
-
else np.nan_to_num(stats.box[key] / stats.img[key])
|
94
|
-
)
|
90
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
91
|
+
ratio = RATIOSTATS_OVERRIDE_MAP[out_type][key](stats) if use_override else stats.box[key] / stats.img[key]
|
95
92
|
out_stats[box_i:box_j] = ratio.reshape(-1, *out_stats[box_i].shape)
|
96
93
|
return out_stats
|
97
94
|
|
@@ -5,24 +5,20 @@ __all__ = []
|
|
5
5
|
from dataclasses import dataclass
|
6
6
|
from typing import Any, Iterable
|
7
7
|
|
8
|
-
from
|
9
|
-
|
10
|
-
from dataeval.metrics.stats.
|
11
|
-
from dataeval.metrics.stats.
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.metrics.stats.pixelstats import PixelStatsOutput, PixelStatsProcessor
|
17
|
-
from dataeval.metrics.stats.visualstats import VisualStatsOutput, VisualStatsProcessor
|
18
|
-
from dataeval.output import Output, set_metadata
|
19
|
-
from dataeval.utils.plot import channel_histogram_plot
|
8
|
+
from dataeval._output import Output, set_metadata
|
9
|
+
from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, _is_plottable, run_stats
|
10
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, DimensionStatsProcessor
|
11
|
+
from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
|
12
|
+
from dataeval.metrics.stats._pixelstats import PixelStatsOutput, PixelStatsProcessor
|
13
|
+
from dataeval.metrics.stats._visualstats import VisualStatsOutput, VisualStatsProcessor
|
14
|
+
from dataeval.typing import ArrayLike
|
15
|
+
from dataeval.utils._plot import channel_histogram_plot
|
20
16
|
|
21
17
|
|
22
18
|
@dataclass(frozen=True)
|
23
19
|
class DatasetStatsOutput(Output, HistogramPlotMixin):
|
24
20
|
"""
|
25
|
-
Output class for :func
|
21
|
+
Output class for :func:`.datasetstats` stats metric.
|
26
22
|
|
27
23
|
This class represents the outputs of various stats functions against a single
|
28
24
|
dataset, such that each index across all stat outputs are representative of
|
@@ -82,7 +78,7 @@ def _get_channels(cls, channel_limit: int | None = None, channel_index: int | It
|
|
82
78
|
@dataclass(frozen=True)
|
83
79
|
class ChannelStatsOutput(Output):
|
84
80
|
"""
|
85
|
-
Output class for :func
|
81
|
+
Output class for :func:`.channelstats` stats metric.
|
86
82
|
|
87
83
|
This class represents the outputs of various per-channel stats functions against
|
88
84
|
a single dataset, such that each index across all stat outputs are representative
|
@@ -6,17 +6,18 @@ from dataclasses import dataclass
|
|
6
6
|
from typing import Any, Callable, Iterable
|
7
7
|
|
8
8
|
import numpy as np
|
9
|
-
from numpy.typing import
|
9
|
+
from numpy.typing import NDArray
|
10
10
|
|
11
|
-
from dataeval.
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.
|
11
|
+
from dataeval._output import set_metadata
|
12
|
+
from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
|
13
|
+
from dataeval.typing import ArrayLike
|
14
|
+
from dataeval.utils._image import get_bitdepth
|
14
15
|
|
15
16
|
|
16
17
|
@dataclass(frozen=True)
|
17
18
|
class DimensionStatsOutput(BaseStatsOutput, HistogramPlotMixin):
|
18
19
|
"""
|
19
|
-
Output class for :func
|
20
|
+
Output class for :func:`.dimensionstats` stats metric.
|
20
21
|
|
21
22
|
Attributes
|
22
23
|
----------
|
@@ -9,14 +9,14 @@ from typing import Callable, Iterable
|
|
9
9
|
|
10
10
|
import numpy as np
|
11
11
|
import xxhash as xxh
|
12
|
-
from numpy.typing import ArrayLike
|
13
12
|
from PIL import Image
|
14
13
|
from scipy.fftpack import dct
|
15
14
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.metrics.stats.
|
18
|
-
from dataeval.
|
19
|
-
from dataeval.utils.
|
15
|
+
from dataeval._output import set_metadata
|
16
|
+
from dataeval.metrics.stats._base import BaseStatsOutput, StatsProcessor, run_stats
|
17
|
+
from dataeval.typing import ArrayLike
|
18
|
+
from dataeval.utils._array import as_numpy
|
19
|
+
from dataeval.utils._image import normalize_image_shape, rescale
|
20
20
|
|
21
21
|
HASH_SIZE = 8
|
22
22
|
MAX_FACTOR = 4
|
@@ -25,7 +25,7 @@ MAX_FACTOR = 4
|
|
25
25
|
@dataclass(frozen=True)
|
26
26
|
class HashStatsOutput(BaseStatsOutput):
|
27
27
|
"""
|
28
|
-
Output class for :func
|
28
|
+
Output class for :func:`.hashstats` stats metric.
|
29
29
|
|
30
30
|
Attributes
|
31
31
|
----------
|
@@ -8,10 +8,10 @@ from dataclasses import dataclass
|
|
8
8
|
from typing import Any, Iterable, Mapping, TypeVar
|
9
9
|
|
10
10
|
import numpy as np
|
11
|
-
from numpy.typing import ArrayLike
|
12
11
|
|
13
|
-
from dataeval.
|
14
|
-
from dataeval.
|
12
|
+
from dataeval._output import Output, set_metadata
|
13
|
+
from dataeval.typing import ArrayLike
|
14
|
+
from dataeval.utils._array import as_numpy
|
15
15
|
|
16
16
|
with contextlib.suppress(ImportError):
|
17
17
|
import pandas as pd
|
@@ -20,7 +20,7 @@ with contextlib.suppress(ImportError):
|
|
20
20
|
@dataclass(frozen=True)
|
21
21
|
class LabelStatsOutput(Output):
|
22
22
|
"""
|
23
|
-
Output class for :func
|
23
|
+
Output class for :func:`.labelstats` stats metric.
|
24
24
|
|
25
25
|
Attributes
|
26
26
|
----------
|
@@ -6,17 +6,18 @@ from dataclasses import dataclass
|
|
6
6
|
from typing import Any, Callable, Iterable
|
7
7
|
|
8
8
|
import numpy as np
|
9
|
-
from numpy.typing import
|
9
|
+
from numpy.typing import NDArray
|
10
10
|
from scipy.stats import entropy, kurtosis, skew
|
11
11
|
|
12
|
-
from dataeval.
|
13
|
-
from dataeval.
|
12
|
+
from dataeval._output import set_metadata
|
13
|
+
from dataeval.metrics.stats._base import BaseStatsOutput, HistogramPlotMixin, StatsProcessor, run_stats
|
14
|
+
from dataeval.typing import ArrayLike
|
14
15
|
|
15
16
|
|
16
17
|
@dataclass(frozen=True)
|
17
18
|
class PixelStatsOutput(BaseStatsOutput, HistogramPlotMixin):
|
18
19
|
"""
|
19
|
-
Output class for :func
|
20
|
+
Output class for :func:`.pixelstats` stats metric.
|
20
21
|
|
21
22
|
Attributes
|
22
23
|
----------
|