dataeval 0.69.4__py3-none-any.whl → 0.70.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +8 -8
- dataeval/_internal/datasets.py +235 -131
- dataeval/_internal/detectors/clusterer.py +2 -0
- dataeval/_internal/detectors/drift/base.py +7 -8
- dataeval/_internal/detectors/drift/mmd.py +4 -4
- dataeval/_internal/detectors/duplicates.py +64 -45
- dataeval/_internal/detectors/merged_stats.py +23 -54
- dataeval/_internal/detectors/ood/ae.py +8 -6
- dataeval/_internal/detectors/ood/aegmm.py +6 -4
- dataeval/_internal/detectors/ood/base.py +12 -7
- dataeval/_internal/detectors/ood/llr.py +6 -4
- dataeval/_internal/detectors/ood/vae.py +5 -3
- dataeval/_internal/detectors/ood/vaegmm.py +6 -4
- dataeval/_internal/detectors/outliers.py +137 -63
- dataeval/_internal/interop.py +11 -7
- dataeval/_internal/metrics/balance.py +13 -11
- dataeval/_internal/metrics/ber.py +5 -3
- dataeval/_internal/metrics/coverage.py +4 -0
- dataeval/_internal/metrics/divergence.py +9 -5
- dataeval/_internal/metrics/diversity.py +14 -12
- dataeval/_internal/metrics/parity.py +32 -22
- dataeval/_internal/metrics/stats/base.py +231 -0
- dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
- dataeval/_internal/metrics/stats/datasetstats.py +99 -0
- dataeval/_internal/metrics/stats/dimensionstats.py +113 -0
- dataeval/_internal/metrics/stats/hashstats.py +75 -0
- dataeval/_internal/metrics/stats/labelstats.py +125 -0
- dataeval/_internal/metrics/stats/pixelstats.py +119 -0
- dataeval/_internal/metrics/stats/visualstats.py +124 -0
- dataeval/_internal/metrics/uap.py +8 -4
- dataeval/_internal/metrics/utils.py +30 -15
- dataeval/_internal/models/pytorch/autoencoder.py +5 -5
- dataeval/_internal/models/tensorflow/pixelcnn.py +1 -4
- dataeval/_internal/output.py +3 -18
- dataeval/_internal/utils.py +11 -16
- dataeval/_internal/workflows/sufficiency.py +152 -151
- dataeval/detectors/__init__.py +4 -0
- dataeval/detectors/drift/__init__.py +8 -3
- dataeval/detectors/drift/kernels/__init__.py +4 -0
- dataeval/detectors/drift/updates/__init__.py +4 -0
- dataeval/detectors/linters/__init__.py +15 -4
- dataeval/detectors/ood/__init__.py +14 -2
- dataeval/metrics/__init__.py +5 -0
- dataeval/metrics/bias/__init__.py +13 -4
- dataeval/metrics/estimators/__init__.py +8 -8
- dataeval/metrics/stats/__init__.py +25 -3
- dataeval/utils/__init__.py +16 -3
- dataeval/utils/tensorflow/__init__.py +11 -0
- dataeval/utils/torch/__init__.py +12 -0
- dataeval/utils/torch/datasets/__init__.py +7 -0
- dataeval/workflows/__init__.py +6 -2
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/METADATA +12 -4
- dataeval-0.70.1.dist-info/RECORD +80 -0
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/WHEEL +1 -1
- dataeval/_internal/flags.py +0 -77
- dataeval/_internal/metrics/stats.py +0 -397
- dataeval/flags/__init__.py +0 -3
- dataeval/tensorflow/__init__.py +0 -3
- dataeval/torch/__init__.py +0 -3
- dataeval-0.69.4.dist-info/RECORD +0 -74
- /dataeval/{tensorflow → utils/tensorflow}/loss/__init__.py +0 -0
- /dataeval/{tensorflow → utils/tensorflow}/models/__init__.py +0 -0
- /dataeval/{tensorflow → utils/tensorflow}/recon/__init__.py +0 -0
- /dataeval/{torch → utils/torch}/models/__init__.py +0 -0
- /dataeval/{torch → utils/torch}/trainer/__init__.py +0 -0
- {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/LICENSE.txt +0 -0
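
The moved and deleted entries above imply an import-path migration for downstream code: the dataeval.flags module (home of the ImageStat flag enum) is deleted outright, and the framework-specific modules move under dataeval.utils. A minimal before/after sketch, assuming only the module moves listed above (the exact re-exported names are not shown in this diff):

    # 0.69.4 module paths (removed in 0.70.1):
    #   dataeval.flags                  # ImageStat flags, deleted outright
    #   dataeval.torch.models, dataeval.torch.trainer
    #   dataeval.tensorflow.loss, dataeval.tensorflow.models, dataeval.tensorflow.recon

    # 0.70.1 module paths, per the renames in the file list:
    import dataeval.utils.torch.models
    import dataeval.utils.torch.trainer
    import dataeval.utils.tensorflow.models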
dataeval/_internal/detectors/outliers.py CHANGED
@@ -1,39 +1,47 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Iterable, Literal, Sequence,
-from warnings import warn
+from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
 from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.
-from dataeval._internal.metrics.stats import
+from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
 from dataeval._internal.output import OutputMetadata, set_metadata
 
 IndexIssueMap = dict[int, dict[str, float]]
-
-"""
-Mapping of image indices to a dictionary of issue types and calculated values
-"""
+OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
 
 
 @dataclass(frozen=True)
-class OutliersOutput(OutputMetadata):
+class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
     """
+    Output class for :class:`Outliers` lint detector
+
     Attributes
     ----------
-    issues : dict[int, dict[str, float]] |
+    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
         Indices of image outliers with their associated issue type and calculated values.
 
     - For a single dataset, a dictionary containing the indices of outliers and
      a dictionary showing the issues and calculated values for the given index.
-    - For multiple
-      and their associated issues and calculated values.
+    - For multiple stats outputs, a list of dictionaries containing the indices of
+      outliers and their associated issues and calculated values.
     """
 
-    issues:
+    issues: TIndexIssueMap
+
+    def __len__(self):
+        if isinstance(self.issues, dict):
+            return len(self.issues)
+        else:
+            return sum(len(d) for d in self.issues)
 
 
 def _get_outlier_mask(
@@ -43,7 +51,7 @@ def _get_outlier_mask(
         threshold = threshold if threshold else 3.0
         std = np.std(values)
         abs_diff = np.abs(values - np.mean(values))
-        return (abs_diff / std) > threshold
+        return std != 0 and (abs_diff / std) > threshold
     elif method == "modzscore":
         threshold = threshold if threshold else 3.5
         abs_diff = np.abs(values - np.median(values))
@@ -65,9 +73,6 @@ class Outliers:
 
     Parameters
    ----------
-    flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
-        Metric(s) to calculate for each image - calculates all metrics if None
-        Only supports ImageStat.ALL_STATS
     outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
         Statistical method used to identify outliers
     outlier_threshold : float, optional - default None
@@ -76,15 +81,15 @@ class Outliers:
 
     Attributes
     ----------
-    stats :
-
+    stats : tuple[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+        Various stats output classes that hold the value of each metric for each image
 
     See Also
     --------
     Duplicates
 
-
-
+    Note
+    ----
     There are 3 different statistical methods:
 
     - zscore
@@ -109,52 +114,61 @@ class Outliers:
 
     >>> outliers = Outliers()
 
-    Specifying specific metrics to analyze:
-
-    >>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)
-
     Specifying an outlier method:
 
     >>> outliers = Outliers(outlier_method="iqr")
 
     Specifying an outlier method and threshold:
 
-    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=
+    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
     """
 
     def __init__(
         self,
-
+        use_dimension: bool = True,
+        use_pixel: bool = True,
+        use_visual: bool = True,
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
     ):
-
-        self.
+        self.stats: DatasetStatsOutput
+        self.use_dimension = use_dimension
+        self.use_pixel = use_pixel
+        self.use_visual = use_visual
         self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
         self.outlier_threshold = outlier_threshold
 
-    def _get_outliers(self) -> dict:
-        flagged_images = {}
-
-
-
-        if
-            mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
+    def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
+        flagged_images: dict[int, dict[str, float]] = {}
+        for stat, values in stats.items():
+            if stat in (SOURCE_INDEX, BOX_COUNT):
+                continue
+            if values.ndim == 1:
+                mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
                 indices = np.flatnonzero(mask)
                 for i, value in zip(indices, values[mask]):
-                    flagged_images.setdefault(i, {}).update({stat:
+                    flagged_images.setdefault(i, {}).update({stat: value})
 
         return dict(sorted(flagged_images.items()))
 
-    @
-    def
+    @overload
+    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+
+    @overload
+    def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
+
+    @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
+    def from_stats(
+        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+    ) -> OutliersOutput:
         """
         Returns indices of outliers with the issues identified for each
 
         Parameters
         ----------
-
-
+        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+            The output(s) from a dimensionstats, pixelstats, or visualstats metric
+            analysis or an aggregate DatasetStatsOutput
 
         Returns
        -------
@@ -162,36 +176,96 @@ class Outliers:
 
         Output class containing the indices of outliers and a dictionary showing
         the issues and calculated values for the given index.
 
+        See Also
+        --------
+        dimensionstats
+        pixelstats
+        visualstats
+
         Example
         -------
         Evaluate the dataset:
 
-        >>> outliers.
-
+        >>> results = outliers.from_stats([stats1, stats2])
+        >>> len(results)
+        2
+        >>> results.issues[0]
+        {10: {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}}
+        >>> results.issues[1]
+        {}
         """  # noqa: E501
-        stats,
-
-
-
-
-
-
-
-
+        if isinstance(stats, DatasetStatsOutput):
+            outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
+            return OutliersOutput(outliers)
+
+        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+            return OutliersOutput(self._get_outliers(stats.dict()))
+
+        if not isinstance(stats, Sequence):
+            raise TypeError(
+                "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
+            )
+
+        stats_map: dict[type, list[int]] = {}
+        for i, stats_output in enumerate(stats):
+            if not isinstance(
+                stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+            ):
+                raise TypeError(
+                    "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
                 )
-
-        else:
-            self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
-
-        outliers = self._get_outliers()
+            stats_map.setdefault(type(stats_output), []).append(i)
 
-
-
-
+        output_list: list[dict[int, dict[str, float]]] = [{} for _ in stats]
+        for _, indices in stats_map.items():
+            substats, dataset_steps = combine_stats([stats[i] for i in indices])
+            outliers = self._get_outliers(substats.dict())
             for idx, issue in outliers.items():
                 k, v = get_dataset_step_from_idx(idx, dataset_steps)
-
-
+                output_list[indices[k]][v] = issue
+
+        return OutliersOutput(output_list)
+
+    @set_metadata(
+        "dataeval.detectors",
+        [
+            "use_dimension",
+            "use_pixel",
+            "use_visual",
+            "outlier_method",
+            "outlier_threshold",
+        ],
+    )
+    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
+        """
+        Returns indices of outliers with the issues identified for each
+
+        Parameters
+        ----------
+        data : Iterable[ArrayLike], shape - (C, H, W)
+            A dataset of images in an ArrayLike format
 
+        Returns
+        -------
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
+            the issues and calculated values for the given index.
+
+        Example
+        -------
+        Evaluate the dataset:
+
+        >>> results = outliers.evaluate(images)
+        >>> list(results.issues)
+        [10, 12]
+        >>> results.issues[10]
+        {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
+        """
+        self.stats = datasetstats(
+            images=data,
+            use_dimension=self.use_dimension,
+            use_pixel=self.use_pixel,
+            use_visual=self.use_visual,
+        )
+        outliers = self._get_outliers({k: v for o in self.stats.outputs() for k, v in o.dict().items()})
         return OutliersOutput(outliers)
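Taken together, these hunks replace the ImageStat flag configuration with per-category booleans and split detection into evaluate (computes datasetstats internally) and from_stats (reuses precomputed stats outputs). A usage sketch based only on the signatures and doctests above; the random images are placeholder data:

    import numpy as np
    from dataeval._internal.detectors.outliers import Outliers
    from dataeval._internal.metrics.stats.datasetstats import datasetstats

    images = np.random.default_rng(0).random((16, 1, 32, 32))  # placeholder (N, C, H, W) data

    outliers = Outliers(use_dimension=True, use_pixel=True, use_visual=True, outlier_method="modzscore")
    results = outliers.evaluate(images)   # computes datasetstats internally

    # Reuse precomputed stats instead of re-running them per detector
    stats = datasetstats(images=images)
    results = outliers.from_stats(stats)  # OutliersOutput[IndexIssueMap]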

dataeval/_internal/interop.py CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from importlib import import_module
-from typing import Iterable
+from typing import Any, Iterable, Iterator
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
@@ -22,24 +22,28 @@ def try_import(module_name):
     return module
 
 
-def
+def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
+    return to_numpy(array, copy=False)
+
+
+def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
     if array is None:
         return np.ndarray([])
 
     if isinstance(array, np.ndarray):
-        return array
+        return array.copy() if copy else array
 
     tf = try_import("tensorflow")
     if tf and tf.is_tensor(array):
-        return array.numpy()  # type: ignore
+        return array.numpy().copy() if copy else array.numpy()  # type: ignore
 
     torch = try_import("torch")
     if torch and isinstance(array, torch.Tensor):
-        return array.detach().cpu().numpy()  # type: ignore
+        return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
 
-    return np.
+    return np.array(array, copy=copy)
 
 
-def to_numpy_iter(iterable: Iterable[ArrayLike]):
+def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
     for array in iterable:
         yield to_numpy(array)

dataeval/_internal/metrics/balance.py CHANGED
@@ -2,10 +2,10 @@ from __future__ import annotations
 
 import warnings
 from dataclasses import dataclass
-from typing import
+from typing import Mapping
 
 import numpy as np
-from numpy.typing import NDArray
+from numpy.typing import ArrayLike, NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
 from dataeval._internal.metrics.utils import entropy, preprocess_metadata
@@ -15,6 +15,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
 class BalanceOutput(OutputMetadata):
     """
+    Output class for :func:`balance` bias metric
+
     Attributes
     ----------
     balance : NDArray[np.float64]
@@ -51,16 +53,16 @@ def validate_num_neighbors(num_neighbors: int) -> int:
 
 
 @set_metadata("dataeval.metrics")
-def balance(class_labels:
+def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
     """
     Mutual information (MI) between factors (class label, metadata, label/image properties)
 
     Parameters
     ----------
-    class_labels:
+    class_labels: ArrayLike
         List of class labels for each image
-    metadata:
-
+    metadata: Mapping[str, ArrayLike]
+        Dict of lists of metadata factors for each image
     num_neighbors: int, default 5
         Number of nearest neighbors to use for computing MI between discrete
         and continuous variables.
@@ -71,8 +73,8 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
         (num_factors+1) x (num_factors+1) estimate of mutual information
         between num_factors metadata factors and class label. Symmetry is enforced.
 
-
-
+    Note
+    ----
     We use `mutual_info_classif` from sklearn since class label is categorical.
     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
     seed. MI is computed differently for categorical and continuous variables, and
@@ -90,9 +92,9 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
     Return intra/interfactor balance (mutual information)
 
     >>> bal.factors
-    array([[0.99999843, 0.
-           [0.
-           [0.09725766, 0.
+    array([[0.99999843, 0.04133555, 0.09725766],
+           [0.04133555, 0.08433558, 0.1301489 ],
+           [0.09725766, 0.1301489 , 0.99999856]])
 
     Return classwise balance (mutual information) of factors with individual class_labels
 
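The signature change means balance now takes the labels as a single ArrayLike and the metadata as a mapping from factor name to per-image values, rather than a list of per-image dicts. A sketch with hypothetical factor data:

    from dataeval._internal.metrics.balance import balance

    class_labels = [0, 1, 0, 1, 1, 0]                 # one label per image
    metadata = {                                      # hypothetical factors
        "altitude": [100, 200, 150, 120, 210, 90],
        "weather": ["rain", "sun", "sun", "rain", "sun", "rain"],
    }

    bal = balance(class_labels, metadata, num_neighbors=5)
    print(bal.factors)  # (num_factors+1) x (num_factors+1) mutual information matrix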

dataeval/_internal/metrics/ber.py CHANGED
@@ -17,7 +17,7 @@ from numpy.typing import ArrayLike, NDArray
 from scipy.sparse import coo_matrix
 from scipy.stats import mode
 
-from dataeval._internal.interop import
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
 from dataeval._internal.output import OutputMetadata, set_metadata
 
@@ -25,6 +25,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
 class BEROutput(OutputMetadata):
     """
+    Output class for :func:`ber` estimator metric
+
     Attributes
     ----------
     ber : float
@@ -145,7 +147,7 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
     BEROutput(ber=0.04, ber_lower=0.020416847668728033)
     """
     ber_fn = get_method(BER_FN_MAP, method)
-    X =
-    y =
+    X = as_numpy(images)
+    y = as_numpy(labels)
     upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
     return BEROutput(upper, lower)
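Since ber now routes its inputs through as_numpy, NumPy arrays and torch/tensorflow tensors are accepted without an explicit conversion. A sketch with placeholder data:

    import numpy as np
    from dataeval._internal.metrics.ber import ber

    rng = np.random.default_rng(0)
    images = rng.random((100, 16, 16))  # placeholder images or embeddings
    labels = rng.integers(0, 2, 100)    # placeholder binary labels

    result = ber(images, labels, k=1, method="KNN")
    print(result.ber, result.ber_lower)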

dataeval/_internal/metrics/coverage.py CHANGED
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import math
 from dataclasses import dataclass
 from typing import Literal
@@ -14,6 +16,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
 class CoverageOutput(OutputMetadata):
     """
+    Output class for :func:`coverage` bias metric
+
     Attributes
     ----------
     indices : NDArray

dataeval/_internal/metrics/divergence.py CHANGED
@@ -3,13 +3,15 @@ This module contains the implementation of HP Divergence
 using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
 """
 
+from __future__ import annotations
+
 from dataclasses import dataclass
 from typing import Literal
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.interop import
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
 from dataeval._internal.output import OutputMetadata, set_metadata
 
@@ -17,6 +19,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
 class DivergenceOutput(OutputMetadata):
     """
+    Output class for :func:`divergence` estimator metric
+
     Attributes
     ----------
     divergence : float
@@ -96,8 +100,8 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
     DivergenceOutput
         The divergence value (0.0..1.0) and the number of differing edges between the datasets
 
-
-
+    Note
+    ----
     The divergence value indicates how similar the 2 datasets are
     with 0 indicating approximately identical data distributions.
 
@@ -123,8 +127,8 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
     DivergenceOutput(divergence=0.28, errors=36.0)
     """
     div_fn = get_method(DIVERGENCE_FN_MAP, method)
-    a =
-    b =
+    a = as_numpy(data_a)
+    b = as_numpy(data_b)
     N = a.shape[0]
     M = b.shape[0]
 
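As with ber, divergence now converts both inputs via as_numpy. A sketch with placeholder embeddings:

    import numpy as np
    from dataeval._internal.metrics.divergence import divergence

    rng = np.random.default_rng(0)
    data_a = rng.random((100, 8))  # placeholder embeddings for dataset A
    data_b = rng.random((120, 8))  # placeholder embeddings for dataset B

    result = divergence(data_a, data_b, method="FNN")
    print(result.divergence, result.errors)  # divergence near 0.0 ~ identical distributions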

dataeval/_internal/metrics/diversity.py CHANGED
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Literal,
+from typing import Literal, Mapping
 
 import numpy as np
-from numpy.typing import NDArray
+from numpy.typing import ArrayLike, NDArray
 
 from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
 from dataeval._internal.output import OutputMetadata, set_metadata
@@ -13,6 +13,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
 @dataclass(frozen=True)
 class DiversityOutput(OutputMetadata):
     """
+    Output class for :func:`diversity` bias metric
+
     Attributes
     ----------
     diversity_index : NDArray[np.float64]
@@ -52,8 +54,8 @@ def diversity_shannon(
     subset_mask: NDArray[np.bool_] | None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
-
-
+    Note
+    ----
     For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
 
     Returns
@@ -103,8 +105,8 @@ def diversity_simpson(
     subset_mask: NDArray[np.bool_] | None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
-
-
+    Note
+    ----
     For continuous variables, histogram bins are chosen automatically. See
     numpy.histogram for details.
     If there is only one category, the diversity index takes a value of 0.
@@ -142,7 +144,7 @@ DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
 
 @set_metadata("dataeval.metrics")
 def diversity(
-    class_labels:
+    class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
 ) -> DiversityOutput:
     """
     Compute diversity and classwise diversity for discrete/categorical variables and, through standard
@@ -155,15 +157,15 @@ def diversity(
 
     Parameters
     ----------
-    class_labels:
+    class_labels: ArrayLike
        List of class labels for each image
-    metadata:
-
+    metadata: Mapping[str, ArrayLike]
+        Dict of list of metadata factors for each image
     method: Literal["shannon", "simpson"], default "simpson"
         Indicates which diversity index should be computed
 
-
-
+    Note
+    ----
     - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
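diversity follows the same signature migration as balance: ArrayLike class labels plus a mapping of metadata factors. A sketch with hypothetical factor data:

    from dataeval._internal.metrics.diversity import diversity

    class_labels = [0, 1, 0, 1, 1, 0]  # placeholder labels
    metadata = {"weather": ["rain", "sun", "sun", "rain", "sun", "rain"]}  # hypothetical factor

    div = diversity(class_labels, metadata, method="simpson")
    print(div.diversity_index)  # per-factor diversity index over [0, 1]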