dataeval 0.72.0__py3-none-any.whl → 0.72.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_internal/datasets.py +1 -1
  3. dataeval/_internal/detectors/clusterer.py +6 -6
  4. dataeval/_internal/detectors/drift/base.py +15 -14
  5. dataeval/_internal/detectors/drift/cvm.py +5 -5
  6. dataeval/_internal/detectors/drift/ks.py +7 -7
  7. dataeval/_internal/detectors/drift/mmd.py +10 -9
  8. dataeval/_internal/detectors/drift/torch.py +2 -2
  9. dataeval/_internal/detectors/drift/uncertainty.py +5 -4
  10. dataeval/_internal/detectors/duplicates.py +1 -1
  11. dataeval/_internal/detectors/ood/ae.py +2 -2
  12. dataeval/_internal/detectors/ood/aegmm.py +2 -2
  13. dataeval/_internal/detectors/ood/base.py +3 -3
  14. dataeval/_internal/detectors/ood/llr.py +3 -3
  15. dataeval/_internal/detectors/ood/vae.py +1 -1
  16. dataeval/_internal/detectors/ood/vaegmm.py +1 -1
  17. dataeval/_internal/detectors/outliers.py +5 -5
  18. dataeval/_internal/metrics/balance.py +5 -5
  19. dataeval/_internal/metrics/ber.py +6 -6
  20. dataeval/_internal/metrics/coverage.py +4 -4
  21. dataeval/_internal/metrics/divergence.py +4 -4
  22. dataeval/_internal/metrics/diversity.py +6 -6
  23. dataeval/_internal/metrics/parity.py +9 -7
  24. dataeval/_internal/metrics/stats/base.py +7 -5
  25. dataeval/_internal/metrics/stats/boxratiostats.py +1 -1
  26. dataeval/_internal/metrics/stats/datasetstats.py +1 -1
  27. dataeval/_internal/metrics/stats/dimensionstats.py +3 -3
  28. dataeval/_internal/metrics/stats/hashstats.py +3 -3
  29. dataeval/_internal/metrics/stats/labelstats.py +3 -3
  30. dataeval/_internal/metrics/stats/pixelstats.py +3 -3
  31. dataeval/_internal/metrics/stats/visualstats.py +4 -4
  32. dataeval/_internal/metrics/uap.py +3 -3
  33. dataeval/_internal/metrics/utils.py +8 -8
  34. dataeval/_internal/models/pytorch/autoencoder.py +8 -8
  35. dataeval/_internal/models/pytorch/utils.py +3 -3
  36. dataeval/_internal/models/tensorflow/autoencoder.py +4 -4
  37. dataeval/_internal/models/tensorflow/losses.py +1 -1
  38. dataeval/_internal/models/tensorflow/pixelcnn.py +2 -2
  39. dataeval/_internal/models/tensorflow/utils.py +1 -1
  40. dataeval/_internal/split_dataset.py +421 -0
  41. dataeval/_internal/workflows/sufficiency.py +3 -3
  42. dataeval/detectors/drift/__init__.py +1 -1
  43. dataeval/detectors/drift/updates/__init__.py +2 -1
  44. dataeval/detectors/ood/__init__.py +2 -10
  45. dataeval/utils/__init__.py +1 -1
  46. dataeval/utils/tensorflow/__init__.py +2 -1
  47. {dataeval-0.72.0.dist-info → dataeval-0.72.1.dist-info}/METADATA +6 -5
  48. dataeval-0.72.1.dist-info/RECORD +81 -0
  49. dataeval-0.72.0.dist-info/RECORD +0 -80
  50. {dataeval-0.72.0.dist-info → dataeval-0.72.1.dist-info}/LICENSE.txt +0 -0
  51. {dataeval-0.72.0.dist-info → dataeval-0.72.1.dist-info}/WHEEL +0 -0
dataeval/_internal/split_dataset.py
@@ -0,0 +1,421 @@
+ from __future__ import annotations
+
+ import warnings
+
+ import numpy as np
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+ from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, StratifiedKFold
+ from sklearn.utils.multiclass import type_of_target
+
+
+ def check_args(num_folds: int = 1, test_frac: float | None = None, val_frac: float | None = None):
+     """Check input arguments to ensure unambiguous splitting arguments are passed.
+
+     Parameters
+     ----------
+     num_folds : int, default 1
+         number of [train, val] cross-validation folds to generate
+     test_frac : float, optional
+         If specified, also generate a test set containing (test_frac*100)% of the data
+     val_frac : float, optional
+         Only specify if requesting a single [train, val] split. The validation split will
+         contain (val_frac*100)% of any data not already allocated to the test set
+
+     Raises
+     ------
+     ValueError
+         Raised if more than one fold AND the fraction of data to be used for validation are
+         both requested. In this case, val_frac is ambiguous, since the validation fraction must
+         by definition be 1/num_folds
+     UnboundLocalError
+         Raised if num_folds is 1 (or left blank) AND val_frac is unspecified. When only 1 fold is
+         requested, we need to know how much of the data should be allocated for validation.
+     ValueError
+         Raised if the total fraction of data used for evaluation (val + test) meets or exceeds 1.0
+
+     Returns
+     -------
+     None
+     """
+     if (num_folds > 1) and (val_frac is not None):
+         raise ValueError("If specifying val_frac, num_folds must be None or 1")
+     if (num_folds == 1) and (val_frac is None):
+         raise UnboundLocalError("If num_folds is None or 1, must assign a value to val_frac")
+     t_frac = 0.0 if test_frac is None else test_frac
+     v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
+     if (t_frac + v_frac) >= 1.0:
+         raise ValueError(f"val_frac + test_frac must be less than 1.0, currently {v_frac+t_frac}")
+
+
+ def check_labels(labels: list | np.ndarray, total_partitions: int):
+     """Check to make sure there are more input data than the total number of partitions requested.
+     Also converts labels to a numpy array, if it isn't already.
+
+     Parameters
+     ----------
+     labels : list or np.ndarray
+         all class labels from the input dataset
+     total_partitions : int
+         number of train-val splits requested (+1 if a test holdout is specified)
+
+     Raises
+     ------
+     IndexError
+         Raised if more partitions are requested than number of labels. This is exceedingly rare and
+         usually means you've specified some argument incorrectly.
+     ValueError
+         Raised if the labels are considered continuous by Scikit-Learn. This does not necessarily
+         mean that floats are not accepted as a label format. Rather, this exception implies that
+         there are too many unique values in the set relative to its cardinality.
+
+     Returns
+     -------
+     index : np.ndarray
+         Integer index generated based on the total number of labels
+     labels : np.ndarray
+         labels, converted to an ndarray if passed as a list.
+     """
+     if len(labels) <= total_partitions:
+         raise IndexError(f"""
+             Total number of labels must be greater than the number of total partitions.
+             Got {len(labels)} labels and {total_partitions} total train/val/test partitions.""")
+     if isinstance(labels, list):
+         labels = np.array(labels)
+     if type_of_target(labels) == "continuous":
+         raise ValueError("Detected continuous labels, labels must be discrete for proper stratification")
+     index = np.arange(len(labels))
+     return index, labels
+
+
+ def check_stratifiable(labels: np.ndarray, total_partitions: int):
+     """
+     Very basic check to see if dataset can be stratified by class label. This is not a
+     comprehensive test, as factors such as grouping also affect the ability to stratify by label
+
+     Parameters
+     ----------
+     labels : list or np.ndarray
+         all class labels from the input dataset
+     total_partitions : int
+         number of train-val splits requested (+1 if a test holdout is specified)
+
+     Warns
+     -----
+     UserWarning
+         Warns user if the dataset cannot be stratified due to the number of total (train, val, test)
+         partitions exceeding the number of instances of the rarest class label.
+
+     Returns
+     -------
+     stratifiable : bool
+         True if dataset can be stratified according to the criteria above.
+     """
+
+     stratifiable = True
+     _, label_counts = np.unique(labels, return_counts=True)
+     rarest_label_count = label_counts.min()
+     if rarest_label_count < total_partitions:
+         warnings.warn(f"""
+             Unable to stratify due to label frequency. The rarest label occurs {rarest_label_count} times,
+             which is fewer than the total number of partitions requested. Setting stratify flag to
+             false.""")
+         stratifiable = False
+     return stratifiable
+
+
+ def check_groups(group_ids: np.ndarray, num_partitions: int):
+     """
+     Warns user if the number of unique group_ids is incompatible with a grouped partition containing
+     num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
+     group the input data.
+
+     Parameters
+     ----------
+     group_ids : np.ndarray
+         Identifies the group to which a sample at the same index belongs.
+     num_partitions : int
+         How many total (train, val) folds will be generated (+1 if also specifying a test fold).
+
+     Warns
+     -----
+     UserWarning
+         Warns if there are fewer groups than the minimum required to successfully partition the data
+         into num_partitions. The minimum is defined as the number of partitions requested plus one.
+
+     Returns
+     -------
+     groupable : bool
+         True if dataset can be grouped by the given group ids, given the criteria above.
+     """
+
+     groupable = True
+     num_unique_groups = len(np.unique(group_ids))
+     min_unique_groups = num_partitions + 1
+     if num_unique_groups < min_unique_groups:
+         warnings.warn(f"""
+             {min_unique_groups} unique groups required for {num_partitions} partitions.
+             Found {num_unique_groups} instead. Reverting to ungrouped partitioning""")
+         groupable = False
+     else:
+         groupable = True
+     return groupable
+
+
+ def bin_kmeans(array: np.ndarray):
+     """
+     Find bins of continuous data by iteratively applying k-means clustering, and keeping the
+     clustering with the highest silhouette score.
+
+     Parameters
+     ----------
+     array : np.ndarray
+         continuous data to bin
+
+     Returns
+     -------
+     np.ndarray[int]: bin numbers assigned by the kmeans best clusterer.
+     """
+     array = np.array(array)
+     if array.ndim == 1:
+         array = array.reshape([-1, 1])
+         best_score = 0.60
+     else:
+         best_score = 0.50
+     bin_index = np.zeros(len(array))
+     for k in range(2, 20):
+         clusterer = KMeans(n_clusters=k)
+         cluster_labels = clusterer.fit_predict(array)
+         score = silhouette_score(array, cluster_labels, sample_size=25_000)
+         if score > best_score:
+             best_score = score
+             bin_index = cluster_labels
+     return bin_index
+
+
+ def angle2xy(angles: np.ndarray):
+     """
+     Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.
+
+     Parameters
+     ----------
+     angles : np.ndarray
+         angle data in either radians or degrees
+
+     Returns
+     -------
+     xy : np.ndarray
+         Nx2 array of xy coordinates for each angle (can be radians or degrees)
+     """
+     is_radians = ((angles >= -np.pi) & (angles <= 2 * np.pi)).all()
+     radians = angles if is_radians else np.pi / 180 * angles
+     xy = np.stack([np.cos(radians), np.sin(radians)], axis=1)
+     return xy
+
+
+ def get_group_ids(metadata: dict, groupnames: list, num_samples: int):
+     """Returns individual group numbers based on a subset of metadata defined by groupnames
+
+     Parameters
+     ----------
+     metadata : dict
+         dictionary containing all metadata
+     groupnames : list
+         which groups from the metadata dictionary to consider for dataset grouping
+     num_samples : int
+         number of labels. Used to ensure agreement between input data/labels and metadata entries.
+
+     Raises
+     ------
+     IndexError
+         raised if an entry in the metadata dictionary doesn't have the same length as num_samples
+
+     Returns
+     -------
+     group_ids : np.ndarray
+         group identifiers from metadata
+     """
+     features2group = {k: np.array(v) for k, v in metadata.items() if k in groupnames}
+     if not features2group:
+         return np.zeros(num_samples, dtype=int)
+     for name, feature in features2group.items():
+         if len(feature) != num_samples:
+             raise IndexError(f"""Feature length does not match number of labels.
+                 Got {len(feature)} features and {num_samples} samples""")
+         if type_of_target(feature) == "continuous":
+             if ("ANGLE" in name.upper()) or ("AZIMUTH" in name.upper()):
+                 feature = angle2xy(feature)
+             features2group[name] = bin_kmeans(feature)
+     binned_features = np.stack(list(features2group.values()), axis=1)
+     _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
+     return group_ids
+
+
+ def make_splits(
+     index: np.ndarray, labels: np.ndarray, n_folds: int, groups: np.ndarray | None = None, stratified: bool = False
+ ):
+     """Split data into n_folds partitions of training and validation data.
+
+     Parameters
+     ----------
+     index : np.ndarray
+         index corresponding to each label (see below)
+     labels : np.ndarray
+         classification labels
+     n_folds : int
+         number of train/val folds
+     groups : np.ndarray, optional
+         group index for grouped partitions. Grouped partitions are split such that no group id is
+         present in both a training and validation split.
+     stratified : bool, default=False
+         If True, maintain dataset class balance within each train/val split
+
+     Returns
+     -------
+     split_defs : list[dict]
+         list of dictionaries, each specifying a train index, validation index, and the ratio of
+         validation to all data.
+     """
+     split_defs = []
+     index = index.reshape([-1, 1])
+     if groups is not None:
+         splitter = StratifiedGroupKFold(n_folds) if stratified else GroupKFold(n_folds)
+         splits = splitter.split(index, labels, groups)
+     else:
+         splitter = StratifiedKFold(n_folds) if stratified else KFold(n_folds)
+         splits = splitter.split(index, labels)
+     for train_idx, eval_idx in splits:
+         test_ratio = len(eval_idx) / index.shape[0]
+         split_defs.append({"train": train_idx.astype(int), "eval": eval_idx.astype(int), "eval_frac": test_ratio})
+     return split_defs
+
+
+ def single_split(
+     index: np.ndarray, labels: np.ndarray, eval_frac: float, groups: np.ndarray | None = None, stratified: bool = False
+ ):
+     """Handles the special case where only 1 partition of the data is desired (such as when
+     generating the test holdout split). In this case, the desired fraction of the data to be
+     partitioned into the test data must be specified, and a single [train, eval] pair is returned.
+
+     Parameters
+     ----------
+     index : np.ndarray
+         Input Dataset index corresponding to each label
+     labels : np.ndarray
+         Labels upon which splits are (optionally) stratified
+     eval_frac : float
+         Fraction of incoming data to be set aside for evaluation
+     groups : np.ndarray, optional
+         Group_ids (same shape as labels) for optional group partitioning
+     stratified : bool, default=False
+         Generates stratified splits if true (recommended)
+
+     Returns
+     -------
+     train_index : np.ndarray
+         indices of data partitioned for training
+     eval_index : np.ndarray
+         indices of data partitioned for evaluation
+     """
+     if eval_frac <= 2 / 3:
+         n_folds = max(2, int(round(1 / (eval_frac + 1e-6))))
+         split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+         best_split = min(split_candidates, key=lambda x: abs(eval_frac - x["eval_frac"]))
+         return best_split["train"], best_split["eval"]
+     else:
+         n_folds = max(2, int(round(1 / (1 - eval_frac + 1e-6))))
+         split_candidates = make_splits(index, labels, n_folds, groups, stratified)
+         best_split = min(split_candidates, key=lambda x: abs(eval_frac - (1 - x["eval_frac"])))
+         return best_split["eval"], best_split["train"]
+
+
+ def split_dataset(
+     labels: list | np.ndarray,
+     num_folds: int = 1,
+     stratify: bool = False,
+     split_on: list | None = None,
+     metadata: dict | None = None,
+     test_frac: float | None = None,
+     val_frac: float | None = None,
+ ):
+     """Top level splitting function. Returns a dict with each key-value pair containing
+     train and validation indices. Indices for a test holdout may also be optionally included
+
+     Parameters
+     ----------
+     labels : list or np.ndarray
+         Classification labels used to generate splits. Determines the size of the dataset
+     num_folds : int, default 1
+         Number of train/val folds. If 1, returns a single train/val split, and val_frac must be
+         specified.
+     stratify : bool, default=False
+         If true, dataset is split such that the class distribution of the entire dataset is
+         preserved within each train/val partition, which is generally recommended.
+     split_on : list, optional
+         Keys of the metadata dictionary which map to columns upon which to group the dataset.
+         A grouped partition is divided such that no group is present within both the training and
+         validation set. Split_on groups should be selected to mitigate validation bias. Defaults to
+         None, in which case groups will not be considered when partitioning the data.
+     metadata : dict, optional
+         Dict containing data for potential dataset grouping. See split_on above. Defaults to None.
+     test_frac : float, optional
+         Fraction of data to be optionally held out for test set. Defaults to None, in which case no
+         test set is created.
+     val_frac : float, optional
+         Fraction of training data to be set aside for validation in the case where a single
+         train/val split is desired. Defaults to None.
+
+     Raises
+     ------
+     UnboundLocalError
+         Raised if split_on is passed, but metadata is left as None. This is because split_on
+         defines the keys with which the metadata dict must be indexed to determine the group index
+         of the data
+
+     Returns
+     -------
+     split_defs : dict
+         dictionary of folds, each containing indices of training and validation data.
+         ex.
+         {
+             "fold_0": {
+                 "train": [1,2,3,5,6,7,9,10,11],
+                 "val": [0, 4, 8, 12]
+             },
+             "test": [13, 14, 15, 16]
+         }
+     """
+
+     check_args(num_folds, test_frac, val_frac)
+     total_partitions = num_folds + 1 if test_frac else num_folds
+     index, labels = check_labels(labels, total_partitions)
+     stratify &= check_stratifiable(labels, total_partitions)
+     if split_on:
+         if metadata is None:
+             raise UnboundLocalError("If split_on is specified, metadata must also be provided")
+         groups = get_group_ids(metadata, split_on, len(labels))
+         groupable = check_groups(groups, total_partitions)
+         if not groupable:
+             groups = None
+     else:
+         groups = None
+     split_defs = {}
+     if test_frac:
+         tv_idx, test_idx = single_split(index, labels, test_frac, groups, stratify)
+         tv_labels = labels[tv_idx]
+         tv_groups = groups[tv_idx] if groups is not None else None
+         split_defs["test"] = test_idx
+     else:
+         tv_idx = np.arange(len(labels)).reshape((-1, 1))
+         tv_labels = labels
+         tv_groups = groups
+     if num_folds == 1:
+         train_idx, val_idx = single_split(tv_idx, tv_labels, val_frac, tv_groups, stratify)  # type: ignore
+         split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
+     else:
+         tv_splits = make_splits(tv_idx, tv_labels, num_folds, tv_groups, stratify)
+         for i, split in enumerate(tv_splits):
+             train_split = tv_idx[split["train"]]
+             val_split = tv_idx[split["eval"]]
+             split_defs[f"fold_{i}"] = {"train": train_split.squeeze(), "val": val_split.squeeze()}
+     return split_defs
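
The new split_dataset module is the headline change of this release (+421 lines). As orientation, here is a minimal usage sketch based only on the docstrings above; the "angle" metadata key is an invented example, and note that the module lives under dataeval._internal in 0.72.1, so this import path is not public API and may change. Internally, single_split approximates a requested holdout fraction with a fold count: test_frac=0.2 yields round(1/0.2) = 5 folds, and the candidate fold whose evaluation share is closest to 0.2 becomes the test set.

import numpy as np

from dataeval._internal.split_dataset import split_dataset  # internal path in 0.72.1

# 100 samples with 3 discrete class labels
labels = np.random.randint(0, 3, size=100)

# Hypothetical metadata; "angle" is continuous, so it is converted to xy
# coordinates (angle2xy) and binned with k-means (bin_kmeans) before grouping.
metadata = {"angle": np.random.uniform(0.0, 360.0, size=100)}

# Four stratified train/val folds plus a ~20% test holdout, grouped so that
# no binned angle group spans both a training and a validation partition.
splits = split_dataset(
    labels,
    num_folds=4,
    stratify=True,
    split_on=["angle"],
    metadata=metadata,
    test_frac=0.2,
)

print(splits["test"])             # indices held out for testing
print(splits["fold_0"]["train"])  # training indices for the first fold
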
dataeval/_internal/workflows/sufficiency.py
@@ -86,7 +86,7 @@ class SufficiencyOutput(OutputMetadata):
          return SufficiencyOutput(projection, self.params, output)
 
      def plot(self, class_names: Sequence[str] | None = None) -> list[Figure]:
-         """Plotting function for data sufficiency tasks
+         """Plotting function for data :term:`sufficiency<Sufficiency>` tasks
 
          Parameters
          ----------
@@ -369,7 +369,7 @@ def plot_measure(
 
  class Sufficiency:
      """
-     Project dataset sufficiency using given a model and evaluation criteria
+     Project dataset :term:`sufficiency<Sufficiency>` given a model and evaluation criteria
 
      Parameters
      ----------
@@ -488,7 +488,7 @@ class Sufficiency:
      ----------
      eval_at : int | Iterable[int] | None, default None
          Specify this to collect accuracies over a specific set of dataset lengths, rather
-         than letting Sufficiency internally create the lengths to evaluate at.
+         than letting :term:`sufficiency<Sufficiency>` internally create the lengths to evaluate at.
      niter : int, default 1000
          Iterations to perform when using the basin-hopping method to curve-fit measure(s).
 
dataeval/detectors/drift/__init__.py
@@ -1,5 +1,5 @@
  """
- Drift detectors identify if the statistical properties of the data has changed.
+ :term:`Drift` detectors identify if the statistical properties of the data have changed.
  """
 
  from dataeval import _IS_TORCH_AVAILABLE
dataeval/detectors/drift/updates/__init__.py
@@ -1,5 +1,6 @@
  """
- Update strategies inform how the drift detector classes update the reference data when monitoring for drift.
+ Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
+ for drift.
  """
 
  from dataeval._internal.detectors.drift.base import LastSeenUpdate, ReservoirSamplingUpdate
dataeval/detectors/ood/__init__.py
@@ -1,5 +1,5 @@
  """
- Out-of-distribution detectors identify data that is different from the data used to train a particular model.
+ Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
  """
 
  from dataeval import _IS_TENSORFLOW_AVAILABLE
@@ -12,12 +12,4 @@ if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
      from dataeval._internal.detectors.ood.vae import OOD_VAE
      from dataeval._internal.detectors.ood.vaegmm import OOD_VAEGMM
 
-     __all__ = [
-         "OOD_AE",
-         "OOD_AEGMM",
-         "OOD_LLR",
-         "OOD_VAE",
-         "OOD_VAEGMM",
-         "OODOutput",
-         "OODScoreOutput",
-     ]
+     __all__ = ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM", "OODOutput", "OODScoreOutput"]
dataeval/utils/__init__.py
@@ -1,7 +1,7 @@
  """
  The utility classes and functions are provided by DataEval to assist users
  in setting up architectures that are guaranteed to work with applicable DataEval
- metrics. Currently DataEval supports both Tensorflow and PyTorch backends.
+ metrics. Currently DataEval supports both :term:`TensorFlow` and PyTorch backends.
  """
 
  from dataeval import _IS_TENSORFLOW_AVAILABLE, _IS_TORCH_AVAILABLE
dataeval/utils/tensorflow/__init__.py
@@ -1,5 +1,6 @@
  """
- Tensorflow models are used in out-of-distribution detectors in the :mod:`dataeval.detectors.ood` module.
+ TensorFlow models are used in :term:`out of distribution<Out-of-distribution (OOD)>` detectors in the
+ :mod:`dataeval.detectors.ood` module.
 
  DataEval provides both basic default models through the utility :func:`dataeval.utils.tensorflow.models.create_model`
  as well as constructors which allow for customization of the encoder, decoder and any other applicable
{dataeval-0.72.0.dist-info → dataeval-0.72.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.72.0
+ Version: 0.72.1
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
@@ -8,7 +8,7 @@ Author: Andrew Weng
  Author-email: andrew.weng@ariacoustics.com
  Maintainer: ARiA
  Maintainer-email: dataeval@ariacoustics.com
- Requires-Python: >=3.9,<3.12
+ Requires-Python: >=3.9,<3.13
  Classifier: Development Status :: 4 - Beta
  Classifier: Intended Audience :: Science/Research
  Classifier: License :: OSI Approved :: MIT License
@@ -17,19 +17,20 @@ Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3 :: Only
  Classifier: Topic :: Scientific/Engineering
  Provides-Extra: all
  Provides-Extra: tensorflow
  Provides-Extra: torch
  Requires-Dist: hdbscan (>=0.8.36)
+ Requires-Dist: markupsafe (<3.0.2) ; extra == "tensorflow" or extra == "all"
  Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
  Requires-Dist: numpy (>1.24.3)
- Requires-Dist: nvidia-cudnn-cu11 (>=8.6.0.163) ; extra == "tensorflow" or extra == "torch" or extra == "all"
  Requires-Dist: pillow (>=10.3.0)
  Requires-Dist: scikit-learn (>=1.5.0)
  Requires-Dist: scipy (>=1.10)
- Requires-Dist: tensorflow (>=2.16) ; extra == "tensorflow" or extra == "all"
+ Requires-Dist: tensorflow (>=2.16,<2.18) ; extra == "tensorflow" or extra == "all"
  Requires-Dist: tensorflow_probability (>=0.24) ; extra == "tensorflow" or extra == "all"
  Requires-Dist: tf-keras (>=2.16) ; extra == "tensorflow" or extra == "all"
  Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
@@ -44,7 +45,7 @@ Description-Content-Type: text/markdown
 
  ## About DataEval
 
- DataEval focuses on characterizing image data and its impact on model performance across classification and object-detection tasks.
+ DataEval focuses on characterizing image data and its impact on model performance across Classification and object-detection tasks.
 
  <!-- start about -->
 
dataeval-0.72.1.dist-info/RECORD
@@ -0,0 +1,81 @@
+ dataeval/__init__.py,sha256=0etPX9QsT5z3_nj5m2TuYfw5PbQOdOj_Il1jgPWjWnw,620
+ dataeval/_internal/datasets.py,sha256=wufvhWPMFsTSTq3P1_-k8TBJGGhc8mo5b8NHRW9vgbs,14646
+ dataeval/_internal/detectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/detectors/clusterer.py,sha256=0c3wGdyBKDkXBgmAJ6Y4jZ44CDDUykgOJTnA9yAD7DA,20830
+ dataeval/_internal/detectors/drift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/detectors/drift/base.py,sha256=gQKY9ahDpVEAhsAc68j2P75KmEKdLTztiyA2ccSqi9k,16236
+ dataeval/_internal/detectors/drift/cvm.py,sha256=p9cbs53Ei5WGaZBtmV5rHRspROmZY1IFDE9EMyA5K00,4105
+ dataeval/_internal/detectors/drift/ks.py,sha256=KkkUTa2dnamFK_aYSDSIcYYJFxCWbY8M1lvbFsttnLc,4193
+ dataeval/_internal/detectors/drift/mmd.py,sha256=v7KPrbnR0-9TQ-obHAaBa0cEELshuQpfTRSXKHomvSY,7679
+ dataeval/_internal/detectors/drift/torch.py,sha256=8Rdcb0Ea7_L_SCHldUD55CKRYd7Em1H_xWCuWFcAzgs,11568
+ dataeval/_internal/detectors/drift/uncertainty.py,sha256=CEFjFndDnK1DaWI_fI6Uh49-Guc2WcWmbRJ3jG1uQrI,5194
+ dataeval/_internal/detectors/duplicates.py,sha256=HCHq-KlqPbjhif_tyvFloE4ypJjDi1DhdAix-GFh6ic,5344
+ dataeval/_internal/detectors/merged_stats.py,sha256=okXGrqAgsqfANMxfIjiUQlZWlaIh5TVvIB9UPsOJZ7k,1351
+ dataeval/_internal/detectors/ood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/detectors/ood/ae.py,sha256=ufzuxFMj4oJgAi5l1NBrdhbmDXeP2dh62f1K9l-g6tM,2207
+ dataeval/_internal/detectors/ood/aegmm.py,sha256=YlVCZ4MBGQZqYAwQxoDuCyzZZ_kEIj5DaxI5Aga7JXI,2558
+ dataeval/_internal/detectors/ood/base.py,sha256=p67sFaDmYX-9xUq2CeNup8Pk0VcilJ1MuSs3rmMIF9w,8650
+ dataeval/_internal/detectors/ood/llr.py,sha256=HsSJZv8YjLucllR-Rlj9syaE7MqMgPVpJdMJkQjWiS8,10295
+ dataeval/_internal/detectors/ood/vae.py,sha256=z5h6E6TmkoCMkG6kRXqf81j4mmB8W_88G7YVITeVABc,3077
+ dataeval/_internal/detectors/ood/vaegmm.py,sha256=ax2GE_TXET6N6m3nHnnMWSh8GfJDyl8t0jrV8xt7Ol8,2982
+ dataeval/_internal/detectors/outliers.py,sha256=jg9vMnk8xep0SbZFgbCcIpgTwDqdCo4xaLHKrqgVwNc,10214
+ dataeval/_internal/interop.py,sha256=FLXJY-5hwJcKCtruyvaarqynXCMfcLbQSFvGnrWQDPo,1338
+ dataeval/_internal/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/metrics/balance.py,sha256=IACHTcfmS927fjgEUJEnOCcLQ05hNKeybHAjNNHUw9Q,6314
+ dataeval/_internal/metrics/ber.py,sha256=b6BRzIKgPuXTVL4Dk4smaY5WOOlwjsTf2UOUxGAQQL4,4899
+ dataeval/_internal/metrics/coverage.py,sha256=ZH57GtMmn0KTaz_eh73kBgLP1tSxAyuS2knzx1-etuw,3630
+ dataeval/_internal/metrics/divergence.py,sha256=BeAtKmUL27u8mWGsEqkoANZBrQLRYCybQiIVS_Z9Hg4,4266
+ dataeval/_internal/metrics/diversity.py,sha256=XLCT_C3Uq_HKOcfHSnk7h35Jcx0XWY7IF5UJcU82qJ4,7743
+ dataeval/_internal/metrics/parity.py,sha256=LMEM08E0ScbT2MLmVN8RIJnPEQQD2zTwMjmpwxOeOKs,16613
+ dataeval/_internal/metrics/stats/base.py,sha256=bwQrL4e0dR_8Yoc3QLEs7DxEFEE36OcWsgTyk1o1su8,11267
+ dataeval/_internal/metrics/stats/boxratiostats.py,sha256=60lhMeIXvbhZsSJkZAddeHsmez-IFNp5jKZjSk-l5og,6362
+ dataeval/_internal/metrics/stats/datasetstats.py,sha256=zTtGnhoVZ2cGpKYThCIPHZkHxc0FhnCyzj4iuaLhETI,6221
+ dataeval/_internal/metrics/stats/dimensionstats.py,sha256=ffcrgo90Fc1hxSSjQDziUKKPtYzDT9h1ExjoVAJiakw,3965
+ dataeval/_internal/metrics/stats/hashstats.py,sha256=Yxvgxmvwd1ySvrC316dFrgNbugcZzadP1c6T-IsBpIE,2130
+ dataeval/_internal/metrics/stats/labelstats.py,sha256=Kd9FCVmyAnyuMTuo26XeJL-hNDG_Nk6175Hbs2WVXC4,4106
+ dataeval/_internal/metrics/stats/pixelstats.py,sha256=u_539KGAV7UTZ3px52n13B8vlEm8qHWx9U88EBOM8Ps,4456
+ dataeval/_internal/metrics/stats/visualstats.py,sha256=EW-JJSQSLAZOMq3EOwKHPX5z7ymIJajPSeIGEtEk930,4838
+ dataeval/_internal/metrics/uap.py,sha256=-vAaqCOEBraaZtc5uLI5wCJoej1hzH6ygRaKbbGX8D0,2181
+ dataeval/_internal/metrics/utils.py,sha256=vmurS57HCwriNZnnDqgXzIYLUB1hO2fQJ6mVAbMiJnw,13575
+ dataeval/_internal/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/models/pytorch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/models/pytorch/autoencoder.py,sha256=SVfwtqZQtTYtqdALLb_NmQChXctDLPB0wazZMhb-_2c,8505
+ dataeval/_internal/models/pytorch/blocks.py,sha256=pm2xwsDZjZJYXrhhiz8husvh2vHmrkFMSYEn-EDUD5Q,1354
+ dataeval/_internal/models/pytorch/utils.py,sha256=cJjxrNNKaVbhJknX0nX_HiFd43LtytEpDH7P6BbyYrw,1699
+ dataeval/_internal/models/tensorflow/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/models/tensorflow/autoencoder.py,sha256=-Yhcp3mo3uMqtED0KvFIMpmlf5ogDz8XnqCaukDFlco,10453
+ dataeval/_internal/models/tensorflow/gmm.py,sha256=QoEgbeax1GETqRmUF7A2ih9uFOZfFAjGzgH2ljExlAc,3669
+ dataeval/_internal/models/tensorflow/losses.py,sha256=aWEb2m3CO-fvPXmYFlZtjry4SyhJcVKIqVodMNmvXxA,3997
+ dataeval/_internal/models/tensorflow/pixelcnn.py,sha256=nIDNnhYtSaZeL2Qo15Btm4B67fH9cvjpIPn0Kf3jx-g,48363
+ dataeval/_internal/models/tensorflow/trainer.py,sha256=LJ3t6Ud95cofKN-cgb5o5nDrYSFse7LSDOYIBkMgDJk,4094
+ dataeval/_internal/models/tensorflow/utils.py,sha256=Rxx5LMXjqpDLS8VFDN6YcCNsbq1Nl1WeITMEZcbIqW4,8749
+ dataeval/_internal/output.py,sha256=qVbOi41dvfQICQ4uxysHPWBRKo1XR61kXHPL_vKOPm0,2545
+ dataeval/_internal/split_dataset.py,sha256=ReGcreMT2bfcK6UNuGCJmYBLb_ylYLegfshVobd4Vrg,16733
+ dataeval/_internal/utils.py,sha256=jo6bGJZAgyuZqRpAAC4gwhAHYE12316na19ZuFwMqes,1504
+ dataeval/_internal/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/_internal/workflows/sufficiency.py,sha256=B9ABh8kt_PWJmgM8if7VdOmoZdUYlOEZoAr9vpjaE1U,18303
+ dataeval/detectors/__init__.py,sha256=8nJk2U99_SQD7RtEKjyS4WJct8gX1HgjEm4oHTnRhTI,320
+ dataeval/detectors/drift/__init__.py,sha256=Ncv7Xn9rEme2aPWEAJBSrjV0Yug3jhpCuzVxH3xmnf0,808
+ dataeval/detectors/drift/kernels/__init__.py,sha256=djIbmvYoHWpWxfdYtiouEC2KqzvgmtEqlg1i5p-UCgM,266
+ dataeval/detectors/drift/updates/__init__.py,sha256=yQexC0K4EQvT5Bmfrb6mtfgnV8iPMcsCdrTkb-_vnUI,282
+ dataeval/detectors/linters/__init__.py,sha256=m5F5JgGBcqGb3J_qXQ3PBkKyePjOklrYbM9dGUsgxFA,489
+ dataeval/detectors/ood/__init__.py,sha256=tRf7xKjMqTEsa4brT9-VP6Ylwqer8lNYYMEFNhD6-4Q,721
+ dataeval/metrics/__init__.py,sha256=U0sRw5eiqeeDLbLPxT_rznZsvtNwONVxKVwfC0qVOgo,223
+ dataeval/metrics/bias/__init__.py,sha256=Wn1Ui_g-9cR4c4IS7RFKJ6UH5DLXKjEBoXTuEYPXSBc,619
+ dataeval/metrics/estimators/__init__.py,sha256=4VFMKLPsJdaWiflf84bXGQ2k8ertFQ4WEPhyWqjFFvE,377
+ dataeval/metrics/stats/__init__.py,sha256=AKlNelORMOM2OA9XIvwZ9nOn6dK6k-r-69ldEAuqgLA,1156
+ dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ dataeval/utils/__init__.py,sha256=EAZyCZsNWnwKPtztjuQQsYN5IA9fXc0z-4cQhT9gZQs,534
+ dataeval/utils/tensorflow/__init__.py,sha256=M_W3frQUpfVZFeKGMjZlPtYD55ES9gRf0gL2a0dXqGw,565
+ dataeval/utils/tensorflow/loss/__init__.py,sha256=s7tD_5dYWcNDmntGiEHhG7bVDsMAY1UO8FpQFe9cUns,195
+ dataeval/utils/tensorflow/models/__init__.py,sha256=1R9Oi5DOYwT0W3JSEfoMsPOvhYFaKqKilwkrUifNnig,385
+ dataeval/utils/tensorflow/recon/__init__.py,sha256=xe6gAQqK9tyAoDQTtaJAxIPK1humt5QzsG_9NPsqx58,116
+ dataeval/utils/torch/__init__.py,sha256=430fNKbqLByuGSeNhnoIJy3g9Z94ckZsAKWUZ15MVP4,575
+ dataeval/utils/torch/datasets/__init__.py,sha256=94k7fMQdxYlQXDYouAHUgrQJ2oBwnvq4koFJpyhlUVA,292
+ dataeval/utils/torch/models/__init__.py,sha256=q1BzoLHWA0uBXzT2glWJDrxVA1BN7xnkT2r_d-7Dlyw,246
+ dataeval/utils/torch/trainer/__init__.py,sha256=hpcrlCCXPzb8b7FOzEAKqFy6Z7Zl4V_cx3yA7n3L1L4,177
+ dataeval/workflows/__init__.py,sha256=VFeJyMhZxvj8WnU5Un32mwO8lNfBQOBjD9IdOqexnAE,320
+ dataeval-0.72.1.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+ dataeval-0.72.1.dist-info/METADATA,sha256=ivnqRxBPNcIoW6g4SPJBdOAnrSlhHra2wcYYVso8egc,4606
+ dataeval-0.72.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ dataeval-0.72.1.dist-info/RECORD,,