dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- dataeval/__init__.py +27 -23
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +3 -2
- dataeval/detectors/drift/mmd.py +9 -7
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +5 -4
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +5 -9
- dataeval/detectors/linters/duplicates.py +10 -14
- dataeval/detectors/linters/outliers.py +100 -5
- dataeval/detectors/ood/__init__.py +4 -11
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +47 -160
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +7 -6
- dataeval/{logging.py → log.py} +2 -0
- dataeval/metrics/__init__.py +3 -3
- dataeval/metrics/bias/__init__.py +10 -13
- dataeval/metrics/bias/balance.py +13 -11
- dataeval/metrics/bias/coverage.py +53 -5
- dataeval/metrics/bias/diversity.py +56 -24
- dataeval/metrics/bias/parity.py +20 -17
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +7 -4
- dataeval/metrics/estimators/divergence.py +4 -4
- dataeval/metrics/estimators/uap.py +4 -4
- dataeval/metrics/stats/__init__.py +19 -19
- dataeval/metrics/stats/base.py +28 -12
- dataeval/metrics/stats/boxratiostats.py +13 -14
- dataeval/metrics/stats/datasetstats.py +49 -20
- dataeval/metrics/stats/dimensionstats.py +8 -8
- dataeval/metrics/stats/hashstats.py +14 -10
- dataeval/metrics/stats/labelstats.py +94 -11
- dataeval/metrics/stats/pixelstats.py +11 -14
- dataeval/metrics/stats/visualstats.py +10 -13
- dataeval/output.py +23 -14
- dataeval/utils/__init__.py +5 -14
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +317 -14
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +18 -8
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
- dataeval-0.76.0.dist-info/METADATA +137 -0
- dataeval-0.76.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval-0.74.2.dist-info/METADATA +0 -120
- dataeval-0.74.2.dist-info/RECORD +0 -66
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
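Several of the renames above change public import paths. Below is a hypothetical compatibility sketch for downstream code; the module paths come from the rename entries in the list, but the symbols each module re-exports are not visible in this diff, so only whole modules are imported:

```python
# Hypothetical compatibility shim for the module moves in this release.
# Module paths are taken from the rename entries above; the symbols each
# module re-exports are not shown in this diff, so whole modules are used.
try:
    # 0.76.0 layout
    from dataeval import log
    from dataeval.utils.dataset import split
except ImportError:
    # 0.74.2 layout
    from dataeval import logging as log
    from dataeval.utils import split_dataset as split
```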
dataeval/detectors/ood/base_torch.py
DELETED
@@ -1,109 +0,0 @@
-"""
-Source code derived from Alibi-Detect 0.11.4
-https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
-
-Original code Copyright (c) 2023 Seldon Technologies Ltd
-Licensed under Apache Software License (Apache 2.0)
-"""
-
-from __future__ import annotations
-
-from typing import Callable, cast
-
-import torch
-from numpy.typing import ArrayLike
-
-from dataeval.detectors.drift.torch import get_device
-from dataeval.detectors.ood.base import OODBaseMixin, OODFitMixin, OODGMMMixin
-from dataeval.interop import to_numpy
-from dataeval.utils.torch.gmm import gmm_params
-from dataeval.utils.torch.trainer import trainer
-
-
-class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.nn.Module], torch.optim.Optimizer]):
-    def __init__(self, model: torch.nn.Module, device: str | torch.device | None = None) -> None:
-        self.device: torch.device = get_device(device)
-        super().__init__(model)
-
-    def fit(
-        self,
-        x_ref: ArrayLike,
-        threshold_perc: float,
-        loss_fn: Callable[..., torch.nn.Module] | None,
-        optimizer: torch.optim.Optimizer | None,
-        epochs: int,
-        batch_size: int,
-        verbose: bool,
-    ) -> None:
-        """
-        Train the model and infer the threshold value.
-
-        Parameters
-        ----------
-        x_ref : ArrayLike
-            Training data.
-        threshold_perc : float, default 100.0
-            Percentage of reference data that is normal.
-        loss_fn : Callable | None, default None
-            Loss function used for training.
-        optimizer : Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
-
-        # Train the model
-        trainer(
-            model=self.model,
-            x_train=to_numpy(x_ref),
-            y_train=None,
-            loss_fn=loss_fn,
-            optimizer=optimizer,
-            preprocess_fn=None,
-            epochs=epochs,
-            batch_size=batch_size,
-            device=self.device,
-            verbose=verbose,
-        )
-
-        # Infer the threshold values
-        self._ref_score = self.score(x_ref, batch_size)
-        self._threshold_perc = threshold_perc
-
-
-class OODBaseGMM(OODBase, OODGMMMixin[torch.Tensor]):
-    def fit(
-        self,
-        x_ref: ArrayLike,
-        threshold_perc: float,
-        loss_fn: Callable[..., torch.nn.Module] | None,
-        optimizer: torch.optim.Optimizer | None,
-        epochs: int,
-        batch_size: int,
-        verbose: bool,
-    ) -> None:
-        # Train the model
-        trainer(
-            model=self.model,
-            x_train=to_numpy(x_ref),
-            y_train=None,
-            loss_fn=loss_fn,
-            optimizer=optimizer,
-            preprocess_fn=None,
-            epochs=epochs,
-            batch_size=batch_size,
-            device=self.device,
-            verbose=verbose,
-        )
-
-        # Calculate the GMM parameters
-        _, z, gamma = cast(tuple[torch.Tensor, torch.Tensor, torch.Tensor], self.model(x_ref))
-        self._gmm_params = gmm_params(z, gamma)
-
-        # Infer the threshold values
-        self._ref_score = self.score(x_ref, batch_size)
-        self._threshold_perc = threshold_perc
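For context, the removed `OODBase.fit` wired the shared `trainer` helper to threshold inference; in 0.76.0 this responsibility moves into the new `dataeval/detectors/ood/mixin.py` and the renamed `ae.py` (see the file list). A minimal, hypothetical sketch of how `fit` was invoked under 0.74.2; the stand-in model and data shapes are illustrative assumptions, and the behavior of `None` for `loss_fn`/`optimizer` depends on defaults inside `trainer`, which this diff does not show:

```python
import numpy as np
import torch

# Illustrative stand-in: any torch module accepted by OODBaseMixin would do.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(16, 16))

detector = OODBase(model, device="cpu")  # class removed in 0.76.0
detector.fit(
    x_ref=np.random.rand(128, 16).astype(np.float32),  # in-distribution reference data
    threshold_perc=99.0,  # treat the top 1% of reference scores as outliers
    loss_fn=None,         # signature permits None; actual default lives in trainer
    optimizer=None,
    epochs=5,
    batch_size=32,
    verbose=False,
)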
dataeval/metrics/bias/metadata_preprocessing.py
DELETED
@@ -1,285 +0,0 @@
-from __future__ import annotations
-
-__all__ = ["MetadataOutput", "metadata_preprocessing"]
-
-import warnings
-from dataclasses import dataclass
-from typing import Any, Iterable, Literal, Mapping, TypeVar
-
-import numpy as np
-from numpy.typing import ArrayLike, NDArray
-from scipy.stats import wasserstein_distance as wd
-
-from dataeval.interop import as_numpy, to_numpy
-from dataeval.output import Output, set_metadata
-from dataeval.utils.metadata import merge_metadata
-
-TNum = TypeVar("TNum", int, float)
-DISCRETE_MIN_WD = 0.054
-CONTINUOUS_MIN_SAMPLE_SIZE = 20
-
-
-@dataclass(frozen=True)
-class MetadataOutput(Output):
-    """
-    Output class for :func:`metadata_binning` function
-
-    Attributes
-    ----------
-    discrete_factor_names : list[str]
-        List containing factor names for the original data that was discrete and the binned continuous data
-    discrete_data : NDArray[np.int]
-        Array containing values for the original data that was discrete and the binned continuous data
-    continuous_factor_names : list[str]
-        List containing factor names for the original continuous data
-    continuous_data : NDArray[np.int or np.double] | None
-        Array containing values for the original continuous data or None if there was no continuous data
-    class_labels : NDArray[np.int]
-        Numerical class labels for the images/objects
-    class_names : NDArray[Any]
-        Array of unique class names (for use with plotting)
-    total_num_factors : int
-        Sum of discrete_factor_names and continuous_factor_names plus 1 for class
-    """
-
-    discrete_factor_names: list[str]
-    discrete_data: NDArray[np.int_]
-    continuous_factor_names: list[str]
-    continuous_data: NDArray[np.int_ | np.double] | None
-    class_labels: NDArray[np.int_]
-    class_names: NDArray[Any]
-    total_num_factors: int
-
-
-@set_metadata
-def metadata_preprocessing(
-    raw_metadata: Iterable[Mapping[str, Any]],
-    class_labels: ArrayLike | str,
-    continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
-    auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
-    exclude: Iterable[str] | None = None,
-) -> MetadataOutput:
-    """
-    Restructures the metadata to be in the correct format for the bias functions.
-
-    This identifies whether the incoming metadata is discrete or continuous,
-    and whether the data is already binned or still needs binning.
-    It accepts a list of dictionaries containing the per image metadata and
-    automatically adjusts for multiple targets in an image.
-
-    Parameters
-    ----------
-    raw_metadata : Iterable[Mapping[str, Any]]
-        Iterable collection of metadata dictionaries to flatten and merge.
-    class_labels : ArrayLike or string or None
-        If arraylike, expects the labels for each image (image classification) or each object (object detection).
-        If the labels are included in the metadata dictionary, pass in the key value.
-    continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
-        User provided dictionary specifying how to bin the continuous metadata factors
-    auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
-        Method by which the function will automatically bin continuous metadata factors. It is recommended
-        that the user provide the bins through the `continuous_factor_bins`.
-    exclude : Iterable[str] or None, default None
-        User provided collection of metadata keys to exclude when processing metadata.
-
-    Returns
-    -------
-    MetadataOutput
-        Output class containing the binned metadata
-    """
-    # Transform metadata into single, flattened dictionary
-    metadata, image_repeats = merge_metadata(raw_metadata)
-
-    # Drop any excluded metadata keys
-    if exclude:
-        for k in list(metadata):
-            if k in exclude:
-                metadata.pop(k)
-
-    # Get the class label array in numeric form
-    class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
-    if class_array.ndim > 1:
-        raise ValueError(
-            f"Got class labels with {class_array.ndim}-dimensional "
-            f"shape {class_array.shape}, but expected a 1-dimensional array."
-        )
-    if not np.issubdtype(class_array.dtype, np.int_):
-        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
-    else:
-        numerical_labels = class_array
-        unique_classes = np.unique(class_array)
-
-    # Bin according to user supplied bins
-    continuous_metadata = {}
-    discrete_metadata = {}
-    if continuous_factor_bins is not None and continuous_factor_bins != {}:
-        invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
-        if invalid_keys:
-            raise KeyError(
-                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
-                "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
-                "or add corresponding entries to the `metadata` dictionary."
-            )
-        for factor, grouping in continuous_factor_bins.items():
-            discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
-            continuous_metadata[factor] = metadata[factor]
-
-    # Determine category of the rest of the keys
-    remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
-    for key in remaining_keys:
-        data = to_numpy(metadata[key])
-        if np.issubdtype(data.dtype, np.number):
-            result = _is_continuous(data, image_repeats)
-            if result:
-                continuous_metadata[key] = data
-            unique_samples, ordinal_data = np.unique(data, return_inverse=True)
-            if unique_samples.size <= np.max([20, data.size * 0.01]):
-                discrete_metadata[key] = ordinal_data
-            else:
-                warnings.warn(
-                    f"A user defined binning was not provided for {key}. "
-                    f"Using the {auto_bin_method} method to discretize the data. "
-                    "It is recommended that the user rerun and supply the desired "
-                    "bins using the continuous_factor_bins parameter.",
-                    UserWarning,
-                )
-                discrete_metadata[key] = _binning_function(data, auto_bin_method)
-        else:
-            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
-
-    # splitting out the dictionaries into the keys and values
-    discrete_factor_names = list(discrete_metadata.keys())
-    discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
-    continuous_factor_names = list(continuous_metadata.keys())
-    continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
-    total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
-
-    return MetadataOutput(
-        discrete_factor_names,
-        discrete_data,
-        continuous_factor_names,
-        continuous_data,
-        numerical_labels,
-        unique_classes,
-        total_num_factors,
-    )
-
-
-def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
-    """
-    Digitizes a list of values into a given number of bins.
-
-    Parameters
-    ----------
-    data : list | NDArray
-        The values to be digitized.
-    binning : int | list[tuple[TNum, TNum]]
-        The number of bins for the discrete values that data will be digitized into.
-
-    Returns
-    -------
-    NDArray[np.intp]
-        The digitized values
-    """
-
-    if not np.all([np.issubdtype(type(n), np.number) for n in data]):
-        raise TypeError(
-            "Encountered a data value with non-numeric type when digitizing a factor. "
-            "Ensure all occurrences of continuous factors are numeric types."
-        )
-    if type(binning) is int:
-        _, bin_edges = np.histogram(data, bins=binning)
-        bin_edges[-1] = np.inf
-        bin_edges[0] = -np.inf
-    else:
-        bin_edges = binning
-    return np.digitize(data, bin_edges)
-
-
-def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
-    """
-    Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
-    """
-    if bin_method == "clusters":
-        # bin_edges = _binning_by_clusters(data)
-        warnings.warn(
-            "Binning by clusters is currently unavailable until changes to the clustering function go through.",
-            UserWarning,
-        )
-        bin_method = "uniform_width"
-
-    if bin_method != "clusters":
-        counts, bin_edges = np.histogram(data, bins="auto")
-        n_bins = counts.size
-        if counts[counts > 0].min() < 10:
-            for _ in range(20):
-                n_bins -= 1
-                counts, bin_edges = np.histogram(data, bins=n_bins)
-                if counts[counts > 0].min() >= 10 or n_bins < 2:
-                    break
-
-        if bin_method == "uniform_count":
-            quantiles = np.linspace(0, 100, n_bins + 1)
-            bin_edges = np.asarray(np.percentile(data, quantiles))
-
-    bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
-    bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
-    return np.digitize(data, bin_edges)  # type: ignore
-
-
-def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
-    """
-    Determines whether the data is continuous or discrete using the Wasserstein distance.
-
-    Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
-    a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
-    we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
-    neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
-    ignored, depending on context). These normalized locations will be much more uniformly distributed
-    for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
-    Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
-
-    The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
-    how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
-    least 20 points, and furthermore at least half as many points as there are discrete values, we can
-    reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
-    measured from a uniform distribution is greater or less than 0.054, respectively.
-    """
-    # Check if the metadata is image specific
-    _, data_indicies_unsorted = np.unique(data, return_index=True)
-    if data_indicies_unsorted.size == image_indicies.size:
-        data_indicies = np.sort(data_indicies_unsorted)
-        if (data_indicies == image_indicies).all():
-            data = data[data_indicies]
-
-    # OLD METHOD
-    # uvals = np.unique(data)
-    # pct_unique = uvals.size / data.size
-    # return pct_unique < threshold
-
-    n_examples = len(data)
-
-    if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
-        warnings.warn(
-            f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
-        )
-        return False
-
-    # Require at least 3 unique values before bothering with NNN
-    xu = np.unique(data, axis=None)
-    if xu.size < 3:
-        return False
-
-    Xs = np.sort(data)
-
-    X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
-
-    dx = np.zeros(n_examples - 2)  # no dx at end points
-    gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
-    dx[np.logical_not(gtz)] = 0.0
-
-    dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
-
-    shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
-
-    return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
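The Normalized Near Neighbor heuristic in `_is_continuous` above (presumably absorbed into the expanded `dataeval/utils/metadata.py` in 0.76.0) is self-contained enough to demonstrate in isolation. A minimal sketch on synthetic data; the helper name and toy inputs are illustrative, while the 0.054 threshold is `DISCRETE_MIN_WD` from the deleted module:

```python
import numpy as np
from scipy.stats import wasserstein_distance

def nnn_shift(x: np.ndarray) -> float:
    """Distance of the Normalized Near Neighbor distribution from uniform on [0, 1]."""
    xs = np.sort(x)
    x0, x1 = xs[:-2], xs[2:]            # left and right neighbors of each interior point
    dx = np.zeros(xs.size - 2)
    gtz = (x1 - x0) > 0                 # duplicates contribute zero
    dx[gtz] = (xs[1:-1] - x0)[gtz] / (x1 - x0)[gtz]
    return wasserstein_distance(dx, np.linspace(0, 1, dx.size))

rng = np.random.default_rng(0)
print(nnn_shift(rng.uniform(0, 10, 500)) < 0.054)                # True: looks continuous
print(nnn_shift(rng.integers(0, 5, 500).astype(float)) < 0.054)  # False: looks discrete
```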
dataeval/utils/gmm.py
DELETED
@@ -1,26 +0,0 @@
-from dataclasses import dataclass
-from typing import Generic, TypeVar
-
-TGMMData = TypeVar("TGMMData")
-
-
-@dataclass
-class GaussianMixtureModelParams(Generic[TGMMData]):
-    """
-    phi : TGMMData
-        Mixture component distribution weights.
-    mu : TGMMData
-        Mixture means.
-    cov : TGMMData
-        Mixture covariance.
-    L : TGMMData
-        Cholesky decomposition of `cov`.
-    log_det_cov : TGMMData
-        Log of the determinant of `cov`.
-    """
-
-    phi: TGMMData
-    mu: TGMMData
-    cov: TGMMData
-    L: TGMMData
-    log_det_cov: TGMMData
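This generic container appears to have been folded into the torch-specific `dataeval/utils/torch/gmm.py` (see the +29/-6 entry in the file list). A hypothetical instantiation with torch tensors, assuming the dataclass above is in scope, just to show what each field holds; the shapes (2 components over 8-dimensional latents) are illustrative assumptions:

```python
import torch

# Illustrative only: 2 mixture components over 8-dimensional latents.
params = GaussianMixtureModelParams(
    phi=torch.tensor([0.6, 0.4]),                            # component weights, sum to 1
    mu=torch.zeros(2, 8),                                    # per-component means
    cov=torch.eye(8).expand(2, 8, 8),                        # per-component covariances
    L=torch.linalg.cholesky(torch.eye(8)).expand(2, 8, 8),   # Cholesky factors of cov
    log_det_cov=torch.zeros(2),                              # log-determinants of cov
)
```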
dataeval-0.74.2.dist-info/METADATA
DELETED
@@ -1,120 +0,0 @@
-Metadata-Version: 2.1
-Name: dataeval
-Version: 0.74.2
-Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
-Home-page: https://dataeval.ai/
-License: MIT
-Author: Andrew Weng
-Author-email: andrew.weng@ariacoustics.com
-Maintainer: ARiA
-Maintainer-email: dataeval@ariacoustics.com
-Requires-Python: >=3.9,<3.13
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Topic :: Scientific/Engineering
-Provides-Extra: all
-Provides-Extra: torch
-Requires-Dist: matplotlib ; extra == "all"
-Requires-Dist: numpy (>=1.24.3)
-Requires-Dist: pillow (>=10.3.0)
-Requires-Dist: scikit-learn (>=1.5.0)
-Requires-Dist: scipy (>=1.10)
-Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
-Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
-Requires-Dist: tqdm
-Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "3.10"
-Requires-Dist: xxhash (>=3.3)
-Project-URL: Documentation, https://dataeval.readthedocs.io/
-Project-URL: Repository, https://github.com/aria-ml/dataeval/
-Description-Content-Type: text/markdown
-
-# DataEval
-
-## About DataEval
-
-DataEval focuses on characterizing image data and its impact on model performance across Classification and object-detection tasks.
-
-<!-- start about -->
-
-**Model-agnostic metrics that bound real-world performance**
-- relevance/completeness/coverage
-- metafeatures (data complexity)
-
-**Model-specific metrics that guide model selection and training**
-- dataset sufficiency
-- data/model complexity mismatch
-
-**Metrics for post-deployment monitoring of data with bounds on model performance to guide retraining**
-- dataset-shift metrics
-- model performance bounds under covariate shift
-- guidance on sampling to assess model error and model retraining
-
-<!-- end about -->
-
-## Getting Started
-
-### Requirements
-- Python 3.9-3.11
-
-### Installing DataEval
-
-You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `torch`, `tensorflow` and `all`. Using `torch` enables Sufficiency metrics, and `tensorflow` enables OOD Detection.
-
-```
-pip install dataeval[all]
-```
-
-### Installing DataEval in Conda/Mamba
-
-DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
-are installed from the `pytorch` channel, the channel is specified in the below example.
-
-```
-micromamba create -f environment\environment.yaml -c pytorch
-```
-
-### Installing DataEval from GitHub
-
-To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
-
-```
-sudo apt-get install git-lfs
-pip install poetry
-```
-
-Pull the source down and change to the DataEval project directory.
-```
-git clone https://github.com/aria-ml/dataeval.git
-cd dataeval
-```
-
-
-
-Install DataEval with optional dependencies for development.
-```
-poetry install --all-extras --with dev
-```
-
-Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
-```
-poetry shell
-```
-
-### Documentation and Tutorials
-For more ideas on getting started using DataEval in your workflow, additional information and tutorials are in our Sphinx documentation hosted on [Read the Docs](https://dataeval.readthedocs.io/).
-
-## Attribution
-This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) python library developed by SeldonIO. Additional documentation from the developers are also available [here](https://docs.seldon.io/projects/alibi-detect/en/stable/).
-
-## POCs
-- **POC**: Scott Swan @scott.swan
-- **DPOC**: Andrew Weng @aweng
-
dataeval-0.74.2.dist-info/RECORD
DELETED
@@ -1,66 +0,0 @@
-dataeval/__init__.py,sha256=w_On8sJ5o_f8PboMo6LLErdFSqDAQ1Jg_e0mcp-5FRU,959
-dataeval/detectors/__init__.py,sha256=Y-0bbyWyuMvZU80bCx6WPt3IV_r2hu9ymzpA8uzMqoI,206
-dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
-dataeval/detectors/drift/base.py,sha256=QDGHMu1WADD-38MEIOwjQMEQM3DE7B0yFHO3hsMbV-E,14481
-dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
-dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
-dataeval/detectors/drift/mmd.py,sha256=C0FX5v9ZJzmKNYEcYUaC7sDtMpJ2dZpwikNDu-AEWiI,7584
-dataeval/detectors/drift/torch.py,sha256=igEQ2DV9JmcpTdUKCOHBi5LxtoNeCAslJS2Ldulg1hw,7585
-dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
-dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
-dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
-dataeval/detectors/linters/clusterer.py,sha256=hK-ak02GaxwWuufesZMKDsvoE5fMdXO7UWsLiK8hfY0,21008
-dataeval/detectors/linters/duplicates.py,sha256=2bmPTFqoefeiAQV9y4CGlHV_mJNrysJSEFLXLd2DO4I,5661
-dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
-dataeval/detectors/linters/outliers.py,sha256=X48bzTfTr1LqC6WKVKBRfvpjcQRgmb93cNLT7Oipe3M,10113
-dataeval/detectors/ood/__init__.py,sha256=-D4Fq-ysFylNNMqjHG1ALbB9qBCm_UinkCAgsK9HGg0,408
-dataeval/detectors/ood/ae_torch.py,sha256=pO9w5221bXR9lEBkE7oakXeE7PXUUR--xcTpmHvOCSk,2142
-dataeval/detectors/ood/base.py,sha256=UzcDbXl8Gv43VFzjrOegTnKSIoEYmfDP7fAySeWyWPw,6955
-dataeval/detectors/ood/base_torch.py,sha256=yFbSfQsBMwZeVf8mrixmkZYBGChhV5oAHtkgzWnMzsA,3405
-dataeval/detectors/ood/metadata_ks_compare.py,sha256=LNDNWGEDKTW8_-djgmK53sn9EZzzXq1Sgwc47k0QI-Y,5380
-dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
-dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
-dataeval/interop.py,sha256=5lACbR7bZYGCagiwbXzAWvWeHRj8kWBmsTC9oEjFh78,2249
-dataeval/logging.py,sha256=uGxXPqGpn5guQjuHtm25rzILaz7nCQUsy2o7tFo91OI,343
-dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
-dataeval/metrics/bias/__init__.py,sha256=dYiPHenS8J7pgRMMW2jNkTBmTbPoYTxT04fZu9PFats,747
-dataeval/metrics/bias/balance.py,sha256=_TZEe17AT-qOvPp-QFrQfTqNwh8uVVCYjC4Sv6JBx9o,9118
-dataeval/metrics/bias/coverage.py,sha256=o65_IgrWSlGnYeYZFABjwKaxq09uqyy5esHJM67PJ-k,4528
-dataeval/metrics/bias/diversity.py,sha256=WL1NbZiRrv0SIq97FY3womZNCSl_EBMVlBWQZAUtjk8,7701
-dataeval/metrics/bias/metadata_preprocessing.py,sha256=ekUFiirkmaHDiH7nJjkNpiUQD7OolAPhHorjLxpXv_Y,12248
-dataeval/metrics/bias/metadata_utils.py,sha256=HmTjlRRTdM9566oKUDDdVMJ8luss4DYykFOiS2FQzhM,6558
-dataeval/metrics/bias/parity.py,sha256=hnA7qQH4Uy3tl_krluZ9BPD5zYjjagUxZt2fEiIa2yE,12745
-dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
-dataeval/metrics/estimators/ber.py,sha256=fs3_e9pgu7I50QIALWtF2aidkBZhTCKVE2pA7PyB5Go,5019
-dataeval/metrics/estimators/divergence.py,sha256=r_SKSurf1TdI5E1ivENqDnz8cQ3_sxVGKAqmF9cqcT4,4275
-dataeval/metrics/estimators/uap.py,sha256=Aw5ReoWNK73Tq96r__qN_-cvHrELauqtDX3Af_QxX4s,2157
-dataeval/metrics/stats/__init__.py,sha256=igLRaAt1nX6yRwC4xI0zNPBADi3u7EsSxWP3OZ8AqcU,1086
-dataeval/metrics/stats/base.py,sha256=_C05KUAuDrfX3N-19o25V3vmXr0-45A5fc57cXyV8qs,12161
-dataeval/metrics/stats/boxratiostats.py,sha256=bZunY-b8Y2IQqHlTusQN77ujLOHftogEQIARDpdVv6A,6463
-dataeval/metrics/stats/datasetstats.py,sha256=rZUDiciHwEpnXmkI8-uJNiYwUuTL9ssZMKMx73hVX-Y,6219
-dataeval/metrics/stats/dimensionstats.py,sha256=xITgQF_oomb6Ty_dJcbT3ARGGNp4QRcYSgnkjB4f-YE,4054
-dataeval/metrics/stats/hashstats.py,sha256=vxw_K74EJM9CZy-EV617vdrysFO8nEspVWqIYsIHC-c,4958
-dataeval/metrics/stats/labelstats.py,sha256=K0hJTphMe7htSjyss8GPtKDiHepTuU60_hX0xRA-uAg,4096
-dataeval/metrics/stats/pixelstats.py,sha256=2zr9i3GLNx1i_SCtbfdtZNxXBEc_9wCe4qDpmXLVbKY,4576
-dataeval/metrics/stats/visualstats.py,sha256=vLIC4sMo796axWl-4e4RzT33ll-_6ki54Dirn3V-EL8,4948
-dataeval/output.py,sha256=hR5TJ67f7FgrZO9Du46aw-jvRpMjOimSgJSau4ZNK44,3565
-dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dataeval/utils/__init__.py,sha256=z7HxSijjycey-rGdQkgVOdpvT0oO2pKAuT4uYyxYGMs,555
-dataeval/utils/gmm.py,sha256=YuLsJKsVWgH_wHr1u_hSRH5Yeexdj8exht8h99L7bLo,561
-dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
-dataeval/utils/metadata.py,sha256=0A--iru0zEmi044mKz5P35q69KrI30yoiRSlvs7TSdQ,9418
-dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
-dataeval/utils/split_dataset.py,sha256=KYIl2ueLN0BeBoEvbUP5FdwVcMYW_l-ES1nQf_zKpQA,18776
-dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
-dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
-dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
-dataeval/utils/torch/gmm.py,sha256=VbLlUQohwToApT493_tjQBWy2UM5R-3ppS9Dp-eP7BA,3240
-dataeval/utils/torch/models.py,sha256=sdGeo7a8vshCTGA4lYyVxxb_aDWUlxdtIVxrddS-_ls,8542
-dataeval/utils/torch/trainer.py,sha256=8BEXr6xtk-CHJTcNxOBnWgkFWfJUAiBy28cEdBhLMRU,7883
-dataeval/utils/torch/utils.py,sha256=nWRcT6z6DbFVrL1RyxCOX3DPoCrv9G0B-VI_9LdGCQQ,5784
-dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
-dataeval/workflows/sufficiency.py,sha256=v9AV3BZT0NW-zD2VNIL_5aWspvoscrxRIUKcUdpy7HI,18540
-dataeval-0.74.2.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
-dataeval-0.74.2.dist-info/METADATA,sha256=Rcnn55cRPZ2JZ1jn8YamuVDxmQVDKEItK4oqZyAYkHM,4298
-dataeval-0.74.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-dataeval-0.74.2.dist-info/RECORD,,
{dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL
File without changes