dataeval 0.64.0-py3-none-any.whl → 0.66.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +13 -9
- dataeval/_internal/detectors/clusterer.py +63 -49
- dataeval/_internal/detectors/drift/base.py +248 -51
- dataeval/_internal/detectors/drift/cvm.py +28 -26
- dataeval/_internal/detectors/drift/ks.py +31 -28
- dataeval/_internal/detectors/drift/mmd.py +62 -42
- dataeval/_internal/detectors/drift/torch.py +69 -60
- dataeval/_internal/detectors/drift/uncertainty.py +32 -32
- dataeval/_internal/detectors/duplicates.py +67 -31
- dataeval/_internal/detectors/ood/ae.py +15 -29
- dataeval/_internal/detectors/ood/aegmm.py +33 -27
- dataeval/_internal/detectors/ood/base.py +86 -47
- dataeval/_internal/detectors/ood/llr.py +34 -31
- dataeval/_internal/detectors/ood/vae.py +32 -31
- dataeval/_internal/detectors/ood/vaegmm.py +34 -28
- dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
- dataeval/_internal/flags.py +44 -21
- dataeval/_internal/interop.py +5 -3
- dataeval/_internal/metrics/balance.py +42 -5
- dataeval/_internal/metrics/ber.py +11 -8
- dataeval/_internal/metrics/coverage.py +15 -8
- dataeval/_internal/metrics/divergence.py +41 -7
- dataeval/_internal/metrics/diversity.py +57 -19
- dataeval/_internal/metrics/parity.py +141 -66
- dataeval/_internal/metrics/stats.py +330 -313
- dataeval/_internal/metrics/uap.py +33 -4
- dataeval/_internal/metrics/utils.py +79 -40
- dataeval/_internal/models/pytorch/autoencoder.py +127 -22
- dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
- dataeval/_internal/models/tensorflow/gmm.py +4 -2
- dataeval/_internal/models/tensorflow/losses.py +17 -13
- dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
- dataeval/_internal/models/tensorflow/trainer.py +10 -7
- dataeval/_internal/models/tensorflow/utils.py +23 -20
- dataeval/_internal/output.py +85 -0
- dataeval/_internal/utils.py +5 -3
- dataeval/_internal/workflows/sufficiency.py +122 -121
- dataeval/detectors/__init__.py +6 -25
- dataeval/detectors/drift/__init__.py +16 -0
- dataeval/detectors/drift/kernels/__init__.py +6 -0
- dataeval/detectors/drift/updates/__init__.py +3 -0
- dataeval/detectors/linters/__init__.py +5 -0
- dataeval/detectors/ood/__init__.py +11 -0
- dataeval/flags/__init__.py +2 -2
- dataeval/metrics/__init__.py +2 -26
- dataeval/metrics/bias/__init__.py +14 -0
- dataeval/metrics/estimators/__init__.py +9 -0
- dataeval/metrics/stats/__init__.py +6 -0
- dataeval/tensorflow/__init__.py +3 -0
- dataeval/tensorflow/loss/__init__.py +3 -0
- dataeval/tensorflow/models/__init__.py +5 -0
- dataeval/tensorflow/recon/__init__.py +3 -0
- dataeval/torch/__init__.py +3 -0
- dataeval/{models/torch → torch/models}/__init__.py +1 -2
- dataeval/torch/trainer/__init__.py +3 -0
- dataeval/utils/__init__.py +3 -6
- dataeval/workflows/__init__.py +2 -4
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
- dataeval-0.66.0.dist-info/RECORD +72 -0
- dataeval/_internal/metrics/base.py +0 -10
- dataeval/models/__init__.py +0 -15
- dataeval/models/tensorflow/__init__.py +0 -6
- dataeval-0.64.0.dist-info/RECORD +0 -60
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
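Taken together, the file moves above are a reorganization of the public namespace: drift, linter, and OOD detectors each get a dedicated subpackage, the metrics split into bias, estimators, and stats, and framework-specific code moves under dataeval.tensorflow and dataeval.torch. A rough before/after import sketch follows; the subpackage paths come from the file list, but the re-exported names (e.g. Outliers) are assumptions, since this diff does not show the new __init__.py contents:

```python
# 0.64.0 layout (old paths, illustrative)
# from dataeval.detectors import Linter
# from dataeval.models import torch as torch_models

# 0.66.0 layout: subpackage paths follow the file list above;
# the exported names are assumptions, not confirmed by this diff.
from dataeval.detectors.linters import Outliers    # detectors/linter.py -> detectors/outliers.py
from dataeval.detectors.drift import updates       # drift detectors get their own subpackage
from dataeval.metrics import bias, estimators      # metrics split by category
from dataeval.torch import models                  # dataeval/models/torch -> dataeval/torch/models
```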
dataeval/__init__.py
CHANGED
@@ -1,18 +1,22 @@
+__version__ = "0.66.0"
+
 from importlib.util import find_spec
 
-
+_IS_TORCH_AVAILABLE = find_spec("torch") is not None
+_IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("tensorflow_probability") is not None
+
+del find_spec
 
-
+from . import detectors, flags, metrics  # noqa: E402
 
 __all__ = ["detectors", "flags", "metrics"]
 
-if
-    from . import
+if _IS_TORCH_AVAILABLE:  # pragma: no cover
+    from . import torch, utils, workflows
 
-    __all__ += ["
-elif find_spec("tensorflow") is not None:  # pragma: no cover
-    from . import models
+    __all__ += ["torch", "utils", "workflows"]
 
-
+if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+    from . import tensorflow
 
-
+    __all__ += ["tensorflow"]
dataeval/_internal/detectors/clusterer.py
CHANGED
@@ -1,26 +1,52 @@
-from
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, NamedTuple, cast
 
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
 from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import flatten
+from dataeval._internal.output import OutputMetadata, set_metadata
 
 
-def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
+@dataclass(frozen=True)
+class ClustererOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    outliers : List[int]
+        Indices that do not fall within a cluster
+    potential_outliers : List[int]
+        Indices which are near the border between belonging in the cluster and being an outlier
+    duplicates : List[List[int]]
+        Groups of indices that are exact duplicates
+    potential_duplicates : List[List[int]]
+        Groups of indices which are not exact but closely related data points
+    """
+
+    outliers: list[int]
+    potential_outliers: list[int]
+    duplicates: list[list[int]]
+    potential_duplicates: list[list[int]]
+
+
+def extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
 
     Parameters
     ----------
-    link_arr : np.ndarray
+    link_arr : NDArray
         linkage matrix
 
     Returns
     -------
-    np.ndarray
+    NDArray
         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
     """
     # Adjusting linkage matrix to accommodate renumbering
@@ -35,7 +61,7 @@ def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
 class Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples:
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -57,7 +83,7 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) ->
+    def copy(self) -> Cluster:
         return Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
@@ -70,7 +96,7 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class Clusters(
+class Clusters(dict[int, dict[int, Cluster]]):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.max_level: int = 1
@@ -92,10 +118,10 @@ class ClusterMergeEntry:
         self.inner_cluster = inner_cluster
         self.status = status
 
-    def __lt__(self, value:
+    def __lt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value:
+    def __gt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
@@ -130,13 +156,13 @@ class Clusterer:
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data:
+        self._data: NDArray = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr:
-        self._sqdmat:
-        self._larr:
+        self._darr: NDArray = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray = squareform(self._darr)
+        self._larr: NDArray = extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
@@ -146,7 +172,7 @@ class Clusterer:
         self._last_good_merge_levels = None
 
     @property
-    def data(self) ->
+    def data(self) -> NDArray:
         return self._data
 
     @data.setter
@@ -160,16 +186,16 @@ class Clusterer:
         return self._clusters
 
     @property
-    def last_good_merge_levels(self) ->
+    def last_good_merge_levels(self) -> dict[int, int]:
         if self._last_good_merge_levels is None:
             self._last_good_merge_levels = self._get_last_merge_levels()
         return self._last_good_merge_levels
 
     @classmethod
-    def _validate_data(cls, x:
+    def _validate_data(cls, x: NDArray):
         """Checks that the data has the correct size, shape, and format"""
         if not isinstance(x, np.ndarray):
-            raise TypeError(f"Data should be of type
+            raise TypeError(f"Data should be of type NDArray; got {type(x)}")
 
         if x.ndim != 2:
             raise ValueError(
@@ -184,7 +210,7 @@ class Clusterer:
     def _create_clusters(self) -> Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map:
+        cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
         clusters: Clusters = Clusters()
 
         # Walking through the linkage array to generate clusters
@@ -212,7 +238,7 @@ class Clusterer:
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(
+                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -240,7 +266,7 @@ class Clusterer:
             clusters[level_id].setdefault(cid, cluster)
         return clusters
 
-    def _get_cluster_distances(self) ->
+    def _get_cluster_distances(self) -> NDArray:
         """Calculates the minimum distances between clusters are each level"""
         # Cluster distance matrix
         max_level = self.clusters.max_level
@@ -261,7 +287,7 @@ class Clusterer:
 
         return cluster_matrix
 
-    def _calc_merge_indices(self, merge_mean:
+    def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
         """
         Determine what clusters should be merged and return their indices
         """
@@ -284,7 +310,7 @@ class Clusterer:
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix:
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -301,7 +327,7 @@ class Clusterer:
         """
         intra_max = []
         merge_mean = []
-        merge_list:
+        merge_list: list[ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -339,7 +365,7 @@ class Clusterer:
 
         return merge_list
 
-    def _get_last_merge_levels(self) ->
+    def _get_last_merge_levels(self) -> dict[int, int]:
         """
         Creates a dictionary for important cluster ids mapped to their last good merge level
 
@@ -348,7 +374,7 @@ class Clusterer:
         Dict[int, int]
             A mapping of a cluster id to its last good merge level
         """
-        last_merge_levels:
+        last_merge_levels: dict[int, int] = {}
 
         if self._max_clusters <= 1:
             last_merge_levels = {0: int(self._num_samples * 0.1)}
@@ -371,7 +397,7 @@ class Clusterer:
 
         return last_merge_levels
 
-    def find_outliers(self, last_merge_levels:
+    def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
         """
         Retrieves outliers based on when the sample was added to the cluster
        and how far it was from the cluster when it was added
@@ -415,9 +441,9 @@ class Clusterer:
 
         return sorted(outliers), sorted(possible_outliers)
 
-    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) ->
+    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
         """Merges and sorts groups of indices that share any common index"""
-        groups:
+        groups: list[list[int]] = []
         for indices in zip(*index_groups):
             indices = set(indices)
             temp = []
@@ -430,7 +456,7 @@ class Clusterer:
             groups = temp
         return sorted(groups)
 
-    def find_duplicates(self, last_merge_levels:
+    def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
         """
         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
 
@@ -464,35 +490,23 @@ class Clusterer:
 
         return exact_dupes, near_dupes
 
-
+    # TODO: Move data input to evaluate from class
+    @set_metadata("dataeval.detectors", ["data"])
+    def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for outliers and duplicates
 
         Returns
         -------
-
-        outliers
-            List of indices that do not fall within a cluster
-        potential_outliers :
-            List of indices which are near the border between belonging in the cluster and being an outlier
-        duplicates :
-            List of groups of indices that are exact duplicates
-        potential_duplicates :
-            List of groups of indices which are not exact but closely related data points
+        ClustererOutput
+            The outliers and duplicate indices found in the data
 
         Example
         -------
         >>> cluster.evaluate()
-
+        ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
         """  # noqa: E501
 
         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
 
-
-            "outliers": outliers,
-            "potential_outliers": potential_outliers,
-            "duplicates": duplicates,
-            "potential_duplicates": potential_duplicates,
-        }
-
-        return ret
+        return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)