dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +33 -10
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +1 -1
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +1 -1
- dataeval/detectors/drift/mmd.py +6 -5
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +3 -2
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +2 -7
- dataeval/detectors/linters/duplicates.py +6 -10
- dataeval/detectors/linters/outliers.py +4 -2
- dataeval/detectors/ood/__init__.py +3 -10
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +64 -161
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +16 -3
- dataeval/log.py +18 -0
- dataeval/metrics/__init__.py +2 -2
- dataeval/metrics/bias/__init__.py +9 -12
- dataeval/metrics/bias/balance.py +10 -8
- dataeval/metrics/bias/coverage.py +52 -4
- dataeval/metrics/bias/diversity.py +42 -14
- dataeval/metrics/bias/parity.py +15 -12
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +3 -1
- dataeval/metrics/estimators/divergence.py +1 -1
- dataeval/metrics/estimators/uap.py +1 -1
- dataeval/metrics/stats/__init__.py +18 -18
- dataeval/metrics/stats/base.py +4 -4
- dataeval/metrics/stats/boxratiostats.py +8 -9
- dataeval/metrics/stats/datasetstats.py +10 -14
- dataeval/metrics/stats/dimensionstats.py +4 -4
- dataeval/metrics/stats/hashstats.py +12 -8
- dataeval/metrics/stats/labelstats.py +5 -5
- dataeval/metrics/stats/pixelstats.py +4 -9
- dataeval/metrics/stats/visualstats.py +4 -9
- dataeval/output.py +1 -1
- dataeval/utils/__init__.py +4 -13
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/dataset/split.py +527 -0
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +310 -5
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +1 -104
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +16 -6
- dataeval-0.75.0.dist-info/METADATA +136 -0
- dataeval-0.75.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval/utils/split_dataset.py +0 -492
- dataeval-0.74.1.dist-info/METADATA +0 -120
- dataeval-0.74.1.dist-info/RECORD +0 -65
- {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/WHEEL +0 -0
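Several of the moves above change import paths. A pure-Python wheel's file layout maps one-to-one onto module paths, so code written against 0.74.1 will likely need updates along these lines; the new module paths are inferred from the file list, and the public symbols inside them are not visible in this diff.

```python
# 0.74.1 module paths and where the file list above says they went in 0.75.0:
#   dataeval/utils/split_dataset.py          -> dataeval/utils/dataset/split.py
#   dataeval/utils/torch/datasets.py         -> dataeval/utils/dataset/datasets.py
#   dataeval/metrics/bias/metadata_utils.py  -> dataeval/utils/plot.py
#   dataeval/detectors/ood/ae_torch.py       -> dataeval/detectors/ood/ae.py

# Corresponding 0.75.0 imports (module paths only; an assumption until
# verified against the installed 0.75.0 wheel):
import dataeval.utils.dataset.split
import dataeval.utils.dataset.datasets
import dataeval.utils.plot
import dataeval.detectors.ood.ae
```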
dataeval/utils/split_dataset.py
DELETED
@@ -1,492 +0,0 @@
from __future__ import annotations

__all__ = ["split_dataset"]

import warnings
from typing import Any

import numpy as np
from numpy.typing import NDArray
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
from sklearn.utils.multiclass import type_of_target


def validate_test_val(num_folds: int, test_frac: float | None, val_frac: float | None) -> tuple[float, float]:
    """Check input fractions to ensure unambiguous splitting arguments are passed, and return the
    calculated test and validation fractions.

    Parameters
    ----------
    num_folds : int
        number of [train, val] cross-validation folds to generate
    test_frac : float, optional
        If specified, also generate a test set containing (test_frac*100)% of the data
    val_frac : float, optional
        Only specify if requesting a single [train, val] split. The validation split will
        contain (val_frac*100)% of any data not already allocated to the test set

    Raises
    ------
    ValueError
        Raised if more than one fold AND the fraction of data to be used for validation are
        both requested. In this case, val_frac is ambiguous, since the validation fraction is
        by definition 1/num_folds
    ValueError
        Raised if num_folds is 1 (or left blank) AND val_frac is unspecified. When only 1 fold is
        requested, we need to know how much of the data should be allocated for validation.
    ValueError
        Raised if the total fraction of data used for evaluation (val + test) meets or exceeds 1.0

    Returns
    -------
    tuple[float, float]
        Tuple of the validated and calculated values as appropriate for test and validation fractions
    """
    if (num_folds > 1) and (val_frac is not None):
        raise ValueError("If specifying val_frac, num_folds must be None or 1")
    if (num_folds == 1) and (val_frac is None):
        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
    t_frac = 0.0 if test_frac is None else test_frac
    v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
    if (t_frac + v_frac) >= 1.0:
        raise ValueError(f"val_frac + test_frac must be less than 1.0, currently {v_frac + t_frac}")
    return t_frac, v_frac


def check_labels(
    labels: list[int] | NDArray[np.int_], total_partitions: int
) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
    """Check to make sure there are more input data than the total number of partitions requested.
    Also converts labels to a numpy array, if it isn't one already.

    Parameters
    ----------
    labels : list or np.ndarray
        all class labels from the input dataset
    total_partitions : int
        number of train-val splits requested (+1 if a test holdout is specified)

    Raises
    ------
    IndexError
        Raised if more partitions are requested than number of labels. This is exceedingly rare and
        usually means you've specified some argument incorrectly.
    ValueError
        Raised if the labels are considered continuous by Scikit-Learn. This does not necessarily
        mean that floats are not accepted as a label format. Rather, this exception implies that
        there are too many unique values in the set relative to its cardinality.

    Returns
    -------
    index : np.ndarray
        Integer index generated based on the total number of labels
    labels : np.ndarray
        labels, converted to an ndarray if passed as a list.
    """
    if len(labels) <= total_partitions:
        raise IndexError(f"""
            Total number of labels must be greater than the number of total partitions.
            Got {len(labels)} labels and {total_partitions} total train/val/test partitions.""")
    if isinstance(labels, list):
        labels = np.array(labels)
    if type_of_target(labels) == "continuous":
        raise ValueError("Detected continuous labels, labels must be discrete for proper stratification")
    index = np.arange(len(labels))
    return index, labels


def check_stratifiable(labels: NDArray[np.int_], total_partitions: int) -> bool:
    """
    Very basic check to see if dataset can be stratified by class label. This is not a
    comprehensive test, as factors such as grouping also affect the ability to stratify by label.

    Parameters
    ----------
    labels : list or np.ndarray
        all class labels from the input dataset
    total_partitions : int
        number of train-val splits requested (+1 if a test holdout is specified)

    Warns
    -----
    UserWarning
        Warns user if the dataset cannot be stratified due to the number of total (train, val, test)
        partitions exceeding the number of instances of the rarest class label.

    Returns
    -------
    stratifiable : bool
        True if dataset can be stratified according to the criteria above.
    """

    stratifiable = True
    _, label_counts = np.unique(labels, return_counts=True)
    rarest_label_count = label_counts.min()
    if rarest_label_count < total_partitions:
        warnings.warn(f"""
            Unable to stratify due to label frequency. The rarest label occurs {rarest_label_count} times,
            which is fewer than the total number of partitions requested. Setting stratify flag to
            false.""")
        stratifiable = False
    return stratifiable


def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
    """
    Warns user if the number of unique group_ids is incompatible with a grouped partition containing
    num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
    group the input data.

    Parameters
    ----------
    group_ids : np.ndarray
        Identifies the group to which a sample at the same index belongs.
    num_partitions : int
        How many total (train, val) folds will be generated (+1 if also specifying a test fold).

    Warns
    -----
    UserWarning
        Warns if there are fewer groups than the minimum required to successfully partition the data
        into num_partitions. The minimum is defined as the number of partitions requested plus one.

    Returns
    -------
    groupable : bool
        True if dataset can be grouped by the given group ids, given the criteria above.
    """

    groupable = True
    num_unique_groups = len(np.unique(group_ids))
    min_unique_groups = num_partitions + 1
    if num_unique_groups < min_unique_groups:
        warnings.warn(f"""
            {min_unique_groups} unique groups required for {num_partitions} partitions.
            Found {num_unique_groups} instead. Reverting to ungrouped partitioning""")
        groupable = False
    else:
        groupable = True
    return groupable


def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
    """
    Find bins of continuous data by iteratively applying k-means clustering, and keeping the
    clustering with the highest silhouette score.

    Parameters
    ----------
    array : np.ndarray
        continuous data to bin

    Returns
    -------
    np.ndarray[int]: bin numbers assigned by the best kmeans clusterer.
    """
    array = np.array(array)
    if array.ndim == 1:
        array = array.reshape([-1, 1])
        best_score = 0.60
    else:
        best_score = 0.50
    bin_index = np.zeros(len(array), dtype=np.int_)
    for k in range(2, 20):
        clusterer = KMeans(n_clusters=k)
        cluster_labels = clusterer.fit_predict(array)
        score = silhouette_score(array, cluster_labels, sample_size=25_000)
        if score > best_score:
            best_score = score
            bin_index = cluster_labels.astype(np.int_)
    return bin_index


def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
    """
    Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.

    Parameters
    ----------
    angles : np.ndarray
        angle data in either radians or degrees

    Returns
    -------
    xy : np.ndarray
        Nx2 array of xy coordinates for each angle (can be radians or degrees)
    """
    is_radians = ((angles >= -np.pi) & (angles <= 2 * np.pi)).all()
    radians = angles if is_radians else np.pi / 180 * angles
    xy = np.stack([np.cos(radians), np.sin(radians)], axis=1)
    return xy


def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
    """Returns individual group numbers based on a subset of metadata defined by group_names.

    Parameters
    ----------
    metadata : dict
        dictionary containing all metadata
    group_names : list
        which groups from the metadata dictionary to consider for dataset grouping
    num_samples : int
        number of labels. Used to ensure agreement between input data/labels and metadata entries.

    Raises
    ------
    IndexError
        raised if an entry in the metadata dictionary doesn't have the same length as num_samples

    Returns
    -------
    group_ids : np.ndarray
        group identifiers from metadata
    """
    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
    if not features2group:
        return np.zeros(num_samples, dtype=np.int_)
    for name, feature in features2group.items():
        if len(feature) != num_samples:
            raise IndexError(f"""Feature length does not match number of labels.
                Got {len(feature)} features and {num_samples} samples""")
        if type_of_target(feature) == "continuous":
            if ("ANGLE" in name.upper()) or ("AZIMUTH" in name.upper()):
                feature = angle2xy(feature)
            features2group[name] = bin_kmeans(feature)
    binned_features = np.stack(list(features2group.values()), axis=1)
    _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
    return group_ids


def make_splits(
    index: NDArray[np.int_],
    labels: NDArray[np.int_],
    n_folds: int,
    groups: NDArray[np.int_] | None = None,
    stratified: bool = False,
) -> list[dict[str, NDArray[np.int_]]]:
    """Split data into n_folds partitions of training and validation data.

    Parameters
    ----------
    index : np.ndarray
        index corresponding to each label (see below)
    labels : np.ndarray
        classification labels
    n_folds : int
        number of train/val folds
    groups : np.ndarray, optional
        group index for grouped partitions. Grouped partitions are split such that no group id is
        present in both a training and validation split.
    stratified : bool, default=False
        If True, maintain dataset class balance within each train/val split

    Returns
    -------
    split_defs : list[dict]
        list of dictionaries, each specifying train index, validation index, and the ratio of
        validation to all data.
    """
    split_defs = []
    index = index.reshape([-1, 1])
    if groups is not None:
        splitter = StratifiedGroupKFold(n_folds) if stratified else GroupKFold(n_folds)
        splits = splitter.split(index, labels, groups)
    else:
        splitter = StratifiedKFold(n_folds) if stratified else KFold(n_folds)
        splits = splitter.split(index, labels)
    for train_idx, eval_idx in splits:
        test_ratio = len(eval_idx) / index.shape[0]
        split_defs.append(
            {
                "train": train_idx.astype(np.int_),
                "eval": eval_idx.astype(np.int_),
                "eval_frac": test_ratio,
            }
        )
    return split_defs


def find_best_split(
    labels: NDArray[np.int_], split_defs: list[dict[str, NDArray[np.int_]]], stratified: bool, eval_frac: float
) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
    """Finds the split that most closely satisfies a criterion determined by the arguments passed.
    If stratified is True, returns the split whose class balance most closely resembles the overall
    class balance. If False, returns the split with the size closest to the desired eval_frac.

    Parameters
    ----------
    labels : np.ndarray
        Labels upon which splits are (optionally) stratified
    split_defs : list[dict]
        List of dictionaries, each specifying train index, validation index, and the ratio of
        validation to all data.
    stratified : bool
        If True, maintain dataset class balance within each train/val split
    eval_frac : float
        Desired fraction of the dataset sequestered for evaluation

    Returns
    -------
    train_index : np.ndarray
        indices of data partitioned for training
    eval_index : np.ndarray
        indices of data partitioned for evaluation
    """

    def class_freq_diff(split):
        train_labels = labels[split["train"]]
        _, train_counts = np.unique(train_labels, return_counts=True)
        train_freq = train_counts / train_counts.sum()
        return np.square(train_freq - class_freq).sum()

    if stratified:
        _, class_counts = np.unique(labels, return_counts=True)
        class_freq = class_counts / class_counts.sum()
        best_split = min(split_defs, key=class_freq_diff)
        return best_split["train"], best_split["eval"]
    elif eval_frac <= 2 / 3:
        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
        return best_split["train"], best_split["eval"]
    else:
        best_split = min(split_defs, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))  # type: ignore
        return best_split["eval"], best_split["train"]


def single_split(
    index: NDArray[np.int_],
    labels: NDArray[np.int_],
    eval_frac: float,
    groups: NDArray[np.int_] | None = None,
    stratified: bool = False,
) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
    """Handles the special case where only 1 partition of the data is desired (such as when
    generating the test holdout split). In this case, the desired fraction of the data to be
    partitioned into the test data must be specified, and a single [train, eval] pair is returned.

    Parameters
    ----------
    index : np.ndarray
        Input dataset index corresponding to each label
    labels : np.ndarray
        Labels upon which splits are (optionally) stratified
    eval_frac : float
        Fraction of incoming data to be set aside for evaluation
    groups : np.ndarray, optional
        Group_ids (same shape as labels) for optional group partitioning
    stratified : bool, default=False
        Generates stratified splits if true (recommended)

    Returns
    -------
    train_index : np.ndarray
        indices of data partitioned for training
    eval_index : np.ndarray
        indices of data partitioned for evaluation
    """
    if groups is not None:
        n_unique_groups = np.unique(groups).shape[0]
        _, label_counts = np.unique(labels, return_counts=True)
        n_folds = min(n_unique_groups, label_counts.min())
    elif eval_frac <= 2 / 3:
        n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
    else:
        n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
    best_train, best_eval = find_best_split(labels, split_candidates, stratified, eval_frac)
    return best_train, best_eval


def split_dataset(
    labels: list[int] | NDArray[np.int_],
    num_folds: int = 1,
    stratify: bool = False,
    split_on: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
    test_frac: float | None = None,
    val_frac: float | None = None,
) -> dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]]:
    """Top level splitting function. Returns a dict with each key-value pair containing
    train and validation indices. Indices for a test holdout may also be optionally included.

    Parameters
    ----------
    labels : Union[list, np.ndarray]
        Classification labels used to generate splits. Determines the size of the dataset
    num_folds : int, default=1
        Number of train/val folds. If 1, returns a single train/val split, and val_frac must be
        specified.
    stratify : bool, default=False
        If true, dataset is split such that the class distribution of the entire dataset is
        preserved within each train/val partition, which is generally recommended.
    split_on : list, optional
        Keys of the metadata dictionary which map to columns upon which to group the dataset.
        A grouped partition is divided such that no group is present within both the training and
        validation set. Split_on groups should be selected to mitigate validation bias. Defaults to
        None, in which case groups will not be considered when partitioning the data.
    metadata : dict, optional
        Dict containing data for potential dataset grouping. See split_on above. Defaults to None.
    test_frac : float, optional
        Fraction of data to be optionally held out for test set. Defaults to None, in which case no
        test set is created.
    val_frac : float, optional
        Fraction of training data to be set aside for validation in the case where a single
        train/val split is desired. Defaults to None.

    Raises
    ------
    UnboundLocalError
        Raised if split_on is passed, but metadata is left as None. This is because split_on
        defines the keys with which the metadata dict must be indexed to determine the group index
        of the data.

    Returns
    -------
    split_defs : dict
        dictionary of folds, each containing indices of training and validation data.
        ex.
        {
            "fold_0": {
                "train": [1,2,3,5,6,7,9,10,11],
                "val": [0, 4, 8, 12]
            },
            "test": [13, 14, 15, 16]
        }
    """

    test_frac, val_frac = validate_test_val(num_folds, test_frac, val_frac)
    total_partitions = num_folds + 1 if test_frac else num_folds
    index, labels = check_labels(labels, total_partitions)
    stratify &= check_stratifiable(labels, total_partitions)
    if split_on:
        if metadata is None:
            raise UnboundLocalError("If split_on is specified, metadata must also be provided")
        groups = get_group_ids(metadata, split_on, len(labels))
        groupable = check_groups(groups, total_partitions)
        if not groupable:
            groups = None
    else:
        groups = None
    split_defs: dict[str, dict[str, NDArray[np.int_]] | NDArray[np.int_]] = {}
    if test_frac:
        tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
        tv_labels = labels[tv_idx]
        tv_groups = groups[tv_idx] if groups is not None else None
        split_defs["test"] = test_idx
    else:
        tv_idx = np.arange(len(labels)).reshape((-1, 1))
        tv_labels = labels
        tv_groups = groups
    if num_folds == 1:
        train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)
        split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
    else:
        tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
        for i, split in enumerate(tv_splits):
            train_split = tv_idx[split["train"]]
            val_split = tv_idx[split["eval"]]
            split_defs[f"fold_{i}"] = {"train": train_split.squeeze(), "val": val_split.squeeze()}
    return split_defs
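Since the hunk above contains the entire removed module, a minimal usage sketch may help gauge the impact of the deletion. It follows the `split_dataset` signature and docstring shown above; the labels, the metadata values, and the `location` grouping key are hypothetical, and the returned keys (`test`, `fold_0`, ...) follow from the code.

```python
import numpy as np

from dataeval.utils.split_dataset import split_dataset  # 0.74.1 import path

rng = np.random.default_rng(0)
labels = rng.integers(0, 3, size=100)                   # hypothetical class labels
metadata = {"location": rng.integers(0, 10, size=100)}  # hypothetical grouping key

# Four stratified train/val folds plus a 20% test holdout. Per validate_test_val,
# each fold's validation share is (1 / num_folds) * (1 - test_frac) = 0.25 * 0.8 = 0.2.
splits = split_dataset(
    labels,
    num_folds=4,
    stratify=True,
    split_on=["location"],  # no location id appears in both train and val
    metadata=metadata,
    test_frac=0.2,
)

print(splits["test"])             # indices held out for testing
print(splits["fold_0"]["train"])  # training indices of the first fold
print(splits["fold_0"]["val"])    # validation indices of the first fold
```

Per the file summary above, 0.75.0 replaces this module with `dataeval/utils/dataset/split.py`, so call sites like this need migration.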
dataeval-0.74.1.dist-info/METADATA
DELETED
@@ -1,120 +0,0 @@
Metadata-Version: 2.1
Name: dataeval
Version: 0.74.1
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
Home-page: https://dataeval.ai/
License: MIT
Author: Andrew Weng
Author-email: andrew.weng@ariacoustics.com
Maintainer: ARiA
Maintainer-email: dataeval@ariacoustics.com
Requires-Python: >=3.9,<3.13
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Topic :: Scientific/Engineering
Provides-Extra: all
Provides-Extra: torch
Requires-Dist: matplotlib ; extra == "all"
Requires-Dist: numpy (>=1.24.3)
Requires-Dist: pillow (>=10.3.0)
Requires-Dist: scikit-learn (>=1.5.0)
Requires-Dist: scipy (>=1.10)
Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
Requires-Dist: tqdm
Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "3.10"
Requires-Dist: xxhash (>=3.3)
Project-URL: Documentation, https://dataeval.readthedocs.io/
Project-URL: Repository, https://github.com/aria-ml/dataeval/
Description-Content-Type: text/markdown

# DataEval

## About DataEval

DataEval focuses on characterizing image data and its impact on model performance across classification and object-detection tasks.

<!-- start about -->

**Model-agnostic metrics that bound real-world performance**
- relevance/completeness/coverage
- metafeatures (data complexity)

**Model-specific metrics that guide model selection and training**
- dataset sufficiency
- data/model complexity mismatch

**Metrics for post-deployment monitoring of data with bounds on model performance to guide retraining**
- dataset-shift metrics
- model performance bounds under covariate shift
- guidance on sampling to assess model error and model retraining

<!-- end about -->

## Getting Started

### Requirements
- Python 3.9-3.12

### Installing DataEval

You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `torch` and `all`; the `torch` extra enables the torch-backed features such as Sufficiency metrics and OOD detection.

```
pip install dataeval[all]
```

### Installing DataEval in Conda/Mamba

DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
are installed from the `pytorch` channel, the channel is specified in the below example.

```
micromamba create -f environment\environment.yaml -c pytorch
```

### Installing DataEval from GitHub

To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.

```
sudo apt-get install git-lfs
pip install poetry
```

Pull the source down and change to the DataEval project directory.
```
git clone https://github.com/aria-ml/dataeval.git
cd dataeval
```

Install DataEval with optional dependencies for development.
```
poetry install --all-extras --with dev
```

Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
```
poetry shell
```

### Documentation and Tutorials
For more ideas on getting started using DataEval in your workflow, additional information and tutorials are in our Sphinx documentation hosted on [Read the Docs](https://dataeval.readthedocs.io/).

## Attribution
This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) python library developed by SeldonIO. Additional documentation from the developers is also available [here](https://docs.seldon.io/projects/alibi-detect/en/stable/).

## POCs
- **POC**: Scott Swan @scott.swan
- **DPOC**: Andrew Weng @aweng
dataeval-0.74.1.dist-info/RECORD
DELETED
@@ -1,65 +0,0 @@
dataeval/__init__.py,sha256=HNOjwnFIQCD7vwBBo0xMexlnNG3xRZ3s3VUMsA4Qozw,392
dataeval/detectors/__init__.py,sha256=Y-0bbyWyuMvZU80bCx6WPt3IV_r2hu9ymzpA8uzMqoI,206
dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
dataeval/detectors/drift/base.py,sha256=QDGHMu1WADD-38MEIOwjQMEQM3DE7B0yFHO3hsMbV-E,14481
dataeval/detectors/drift/cvm.py,sha256=kc59w2_wtxFGNnLcaJRvX5v_38gPXiebSGNiFVdunEQ,4142
dataeval/detectors/drift/ks.py,sha256=gcpe1WIQeNeZdLYkdMZCFLXUp1bHMQUxwJE6-RLVOXs,4229
dataeval/detectors/drift/mmd.py,sha256=C0FX5v9ZJzmKNYEcYUaC7sDtMpJ2dZpwikNDu-AEWiI,7584
dataeval/detectors/drift/torch.py,sha256=igEQ2DV9JmcpTdUKCOHBi5LxtoNeCAslJS2Ldulg1hw,7585
dataeval/detectors/drift/uncertainty.py,sha256=Xz2yzJjtJfw1vLag234jwRvaa_HK36nMajGx8bQaNRs,5322
dataeval/detectors/drift/updates.py,sha256=UJ0z5hlunRi7twnkLABfdJG3tT2EqX4y9IGx8_USYvo,1780
dataeval/detectors/linters/__init__.py,sha256=BvpaB1RUpkEhhXk3Mqi5NYoOcJKZRFSBOJCmQOIfYRU,483
dataeval/detectors/linters/clusterer.py,sha256=hK-ak02GaxwWuufesZMKDsvoE5fMdXO7UWsLiK8hfY0,21008
dataeval/detectors/linters/duplicates.py,sha256=2bmPTFqoefeiAQV9y4CGlHV_mJNrysJSEFLXLd2DO4I,5661
dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
dataeval/detectors/linters/outliers.py,sha256=X48bzTfTr1LqC6WKVKBRfvpjcQRgmb93cNLT7Oipe3M,10113
dataeval/detectors/ood/__init__.py,sha256=-D4Fq-ysFylNNMqjHG1ALbB9qBCm_UinkCAgsK9HGg0,408
dataeval/detectors/ood/ae_torch.py,sha256=pO9w5221bXR9lEBkE7oakXeE7PXUUR--xcTpmHvOCSk,2142
dataeval/detectors/ood/base.py,sha256=UzcDbXl8Gv43VFzjrOegTnKSIoEYmfDP7fAySeWyWPw,6955
dataeval/detectors/ood/base_torch.py,sha256=yFbSfQsBMwZeVf8mrixmkZYBGChhV5oAHtkgzWnMzsA,3405
dataeval/detectors/ood/metadata_ks_compare.py,sha256=LNDNWGEDKTW8_-djgmK53sn9EZzzXq1Sgwc47k0QI-Y,5380
dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
dataeval/interop.py,sha256=SB5Nca12rluZeXrpmmlfY7LFJbN5opYM7jmAb2c29hM,1748
dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
dataeval/metrics/bias/__init__.py,sha256=dYiPHenS8J7pgRMMW2jNkTBmTbPoYTxT04fZu9PFats,747
dataeval/metrics/bias/balance.py,sha256=_TZEe17AT-qOvPp-QFrQfTqNwh8uVVCYjC4Sv6JBx9o,9118
dataeval/metrics/bias/coverage.py,sha256=o65_IgrWSlGnYeYZFABjwKaxq09uqyy5esHJM67PJ-k,4528
dataeval/metrics/bias/diversity.py,sha256=WL1NbZiRrv0SIq97FY3womZNCSl_EBMVlBWQZAUtjk8,7701
dataeval/metrics/bias/metadata_preprocessing.py,sha256=ekUFiirkmaHDiH7nJjkNpiUQD7OolAPhHorjLxpXv_Y,12248
dataeval/metrics/bias/metadata_utils.py,sha256=HmTjlRRTdM9566oKUDDdVMJ8luss4DYykFOiS2FQzhM,6558
dataeval/metrics/bias/parity.py,sha256=hnA7qQH4Uy3tl_krluZ9BPD5zYjjagUxZt2fEiIa2yE,12745
dataeval/metrics/estimators/__init__.py,sha256=O6ocxJq8XDkfJWwXeJnnnzbOyRnFPKF4kTIVTTZYOA8,380
dataeval/metrics/estimators/ber.py,sha256=fs3_e9pgu7I50QIALWtF2aidkBZhTCKVE2pA7PyB5Go,5019
dataeval/metrics/estimators/divergence.py,sha256=r_SKSurf1TdI5E1ivENqDnz8cQ3_sxVGKAqmF9cqcT4,4275
dataeval/metrics/estimators/uap.py,sha256=Aw5ReoWNK73Tq96r__qN_-cvHrELauqtDX3Af_QxX4s,2157
dataeval/metrics/stats/__init__.py,sha256=igLRaAt1nX6yRwC4xI0zNPBADi3u7EsSxWP3OZ8AqcU,1086
dataeval/metrics/stats/base.py,sha256=_C05KUAuDrfX3N-19o25V3vmXr0-45A5fc57cXyV8qs,12161
dataeval/metrics/stats/boxratiostats.py,sha256=bZunY-b8Y2IQqHlTusQN77ujLOHftogEQIARDpdVv6A,6463
dataeval/metrics/stats/datasetstats.py,sha256=rZUDiciHwEpnXmkI8-uJNiYwUuTL9ssZMKMx73hVX-Y,6219
dataeval/metrics/stats/dimensionstats.py,sha256=xITgQF_oomb6Ty_dJcbT3ARGGNp4QRcYSgnkjB4f-YE,4054
dataeval/metrics/stats/hashstats.py,sha256=vxw_K74EJM9CZy-EV617vdrysFO8nEspVWqIYsIHC-c,4958
dataeval/metrics/stats/labelstats.py,sha256=K0hJTphMe7htSjyss8GPtKDiHepTuU60_hX0xRA-uAg,4096
dataeval/metrics/stats/pixelstats.py,sha256=2zr9i3GLNx1i_SCtbfdtZNxXBEc_9wCe4qDpmXLVbKY,4576
dataeval/metrics/stats/visualstats.py,sha256=vLIC4sMo796axWl-4e4RzT33ll-_6ki54Dirn3V-EL8,4948
dataeval/output.py,sha256=SmzH9W9yewdL9SBKVBkUUvOo45oA5lHphE2DYvJJMu0,3573
dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
dataeval/utils/__init__.py,sha256=z7HxSijjycey-rGdQkgVOdpvT0oO2pKAuT4uYyxYGMs,555
dataeval/utils/gmm.py,sha256=YuLsJKsVWgH_wHr1u_hSRH5Yeexdj8exht8h99L7bLo,561
dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
dataeval/utils/metadata.py,sha256=0A--iru0zEmi044mKz5P35q69KrI30yoiRSlvs7TSdQ,9418
dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
dataeval/utils/split_dataset.py,sha256=Ot1ZJhbIhVfcShYXF9MkWXak5odBXyuBdRh-noXh-MI,19555
dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
dataeval/utils/torch/gmm.py,sha256=VbLlUQohwToApT493_tjQBWy2UM5R-3ppS9Dp-eP7BA,3240
dataeval/utils/torch/models.py,sha256=sdGeo7a8vshCTGA4lYyVxxb_aDWUlxdtIVxrddS-_ls,8542
dataeval/utils/torch/trainer.py,sha256=8BEXr6xtk-CHJTcNxOBnWgkFWfJUAiBy28cEdBhLMRU,7883
dataeval/utils/torch/utils.py,sha256=nWRcT6z6DbFVrL1RyxCOX3DPoCrv9G0B-VI_9LdGCQQ,5784
dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
dataeval/workflows/sufficiency.py,sha256=v9AV3BZT0NW-zD2VNIL_5aWspvoscrxRIUKcUdpy7HI,18540
dataeval-0.74.1.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
dataeval-0.74.1.dist-info/METADATA,sha256=nd7os3kaLfp-A5HWH0QYVxe-gQdj5q3dIn9d0fPf-Lk,4298
dataeval-0.74.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
dataeval-0.74.1.dist-info/RECORD,,
File without changes
|
File without changes
|