dataeval 0.76.1__py3-none-any.whl → 0.81.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/{output.py → _output.py} +14 -0
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +41 -30
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +0 -3
- dataeval/detectors/linters/duplicates.py +17 -8
- dataeval/detectors/linters/outliers.py +23 -14
- dataeval/detectors/ood/ae.py +29 -8
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/metadata_ks_compare.py +1 -1
- dataeval/detectors/ood/mixin.py +20 -5
- dataeval/detectors/ood/output.py +1 -1
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +5 -0
- dataeval/metadata/_ood.py +238 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +5 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
- dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
- dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
- dataeval/metrics/bias/{parity.py → _parity.py} +89 -61
- dataeval/metrics/estimators/__init__.py +14 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
- dataeval/metrics/estimators/_clusterer.py +104 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/metrics/stats/{base.py → _base.py} +52 -16
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
- dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
- dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
- dataeval/metrics/stats/{labelstats.py → _labelstats.py} +4 -4
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
- dataeval/typing.py +54 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +18 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +4 -4
- dataeval/utils/data/__init__.py +22 -0
- dataeval/utils/data/_embeddings.py +105 -0
- dataeval/utils/data/_images.py +65 -0
- dataeval/utils/data/_metadata.py +352 -0
- dataeval/utils/data/_selection.py +119 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
- dataeval/utils/data/_targets.py +73 -0
- dataeval/utils/data/_types.py +58 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +60 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +51 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/sufficiency.py +10 -9
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/METADATA +4 -1
- dataeval-0.81.0.dist-info/RECORD +94 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.1.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.1.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
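Taken together, the renames above show most implementation modules moving to underscore-prefixed (private) paths and a new dataeval.utils.data subpackage being added. As a minimal sketch of what that implies for imports: the removed clusterer.py below pulls Output and set_metadata from the old dataeval.output module, and under the output.py → _output.py rename the corresponding 0.81.0 internal import path would presumably change as shown (whether both names are still exported from the new module is an assumption, not confirmed by this diff):

from dataeval.output import Output, set_metadata   # 0.76.1 path, as seen in the removed code below
from dataeval._output import Output, set_metadata  # assumed 0.81.0 path after the output.py → _output.py rename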
dataeval/detectors/linters/clusterer.py (removed)
@@ -1,512 +0,0 @@
-from __future__ import annotations
-
-__all__ = []
-
-from dataclasses import dataclass
-from typing import Any, Iterable, NamedTuple, cast
-
-import numpy as np
-from numpy.typing import ArrayLike, NDArray
-from scipy.cluster.hierarchy import linkage
-from scipy.spatial.distance import pdist, squareform
-
-from dataeval.interop import to_numpy
-from dataeval.output import Output, set_metadata
-from dataeval.utils.shared import flatten
-
-
-@dataclass(frozen=True)
-class ClustererOutput(Output):
-    """
-    Output class for :class:`Clusterer` lint detector.
-
-    Attributes
-    ----------
-    outliers : List[int]
-        Indices that do not fall within a cluster
-    potential_outliers : List[int]
-        Indices which are near the border between belonging in the cluster and being an outlier
-    duplicates : List[List[int]]
-        Groups of indices that are exact :term:`duplicates<Duplicates>`
-    potential_duplicates : List[List[int]]
-        Groups of indices which are not exact but closely related data points
-    """
-
-    outliers: list[int]
-    potential_outliers: list[int]
-    duplicates: list[list[int]]
-    potential_duplicates: list[list[int]]
-
-
-def _extend_linkage(link_arr: NDArray) -> NDArray:
-    """
-    Adds a column to the linkage matrix link_arr that tracks the new id assigned
-    to each row
-
-    Parameters
-    ----------
-    link_arr : NDArray
-        linkage matrix
-
-    Returns
-    -------
-    NDArray
-        linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
-    """
-    # Adjusting linkage matrix to accommodate renumbering
-    rows, cols = link_arr.shape
-    arr = np.zeros((rows, cols + 1))
-    arr[:, :-1] = link_arr
-    arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
-
-    return arr
-
-
-class _Cluster:
-    __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
-
-    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
-        self.merged = merged
-        self.samples = np.array(samples, dtype=np.int32)
-        self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
-        self.is_copy = is_copy
-
-        dist = float(self.sample_dist[-1])
-
-        self.count = len(self.samples)
-        if is_copy:
-            self.dist_avg = 0.0
-            self.dist_std = 0.0
-            self.out1 = False
-            self.out2 = False
-        else:
-            self.dist_avg = float(np.mean(self.sample_dist))
-            self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
-            out1 = self.dist_avg + self.dist_std
-            out2 = out1 + self.dist_std
-            self.out1 = dist > out1
-            self.out2 = dist > out2
-
-    def copy(self) -> _Cluster:
-        return _Cluster(False, self.samples, self.sample_dist, True)
-
-    def __repr__(self) -> str:
-        _params = {
-            "merged": self.merged,
-            "samples": self.samples,
-            "sample_dist": self.sample_dist,
-            "is_copy": self.is_copy,
-        }
-        return f"{self.__class__.__name__}(**{repr(_params)})"
-
-
-class _Clusters(dict[int, dict[int, _Cluster]]):
-    def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
-        super().__init__(*args)
-        self.max_level: int = 1
-
-
-class _ClusterPosition(NamedTuple):
-    """Keeps track of a cluster's level and ID"""
-
-    level: int
-    cid: int
-
-
-class _ClusterMergeEntry:
-    __slots__ = "level", "outer_cluster", "inner_cluster", "status"
-
-    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
-        self.level = level
-        self.outer_cluster = outer_cluster
-        self.inner_cluster = inner_cluster
-        self.status = status
-
-    def __lt__(self, value: _ClusterMergeEntry) -> bool:
-        return self.level.__lt__(value.level)
-
-    def __gt__(self, value: _ClusterMergeEntry) -> bool:
-        return self.level.__gt__(value.level)
-
-
-class Clusterer:
-    """
-    Uses hierarchical clustering to flag dataset properties of interest like outliers \
-    and :term:`duplicates<Duplicates>`.
-
-    Parameters
-    ----------
-    dataset : ArrayLike, shape - (N, P)
-        A dataset in an ArrayLike format.
-        Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-
-    Warning
-    -------
-    The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
-
-    Note
-    ----
-    The Clusterer works best when the length of the feature dimension, P, is less than 500.
-    If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
-    """
-
-    def __init__(self, dataset: ArrayLike) -> None:
-        # Allows an update to dataset to reset the state rather than instantiate a new class
-        self._on_init(dataset)
-
-    def _on_init(self, dataset: ArrayLike):
-        self._data: NDArray[Any] = flatten(to_numpy(dataset))
-        self._validate_data(self._data)
-        self._num_samples = len(self._data)
-
-        self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
-        self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
-        self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
-        self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
-
-        min_num = int(self._num_samples * 0.05)
-        self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
-
-        self._clusters: _Clusters | None = None
-        self._last_good_merge_levels: dict[int, int] | None = None
-
-    @property
-    def data(self) -> NDArray[Any]:
-        return self._data
-
-    @data.setter
-    def data(self, x: ArrayLike) -> None:
-        self._on_init(x)
-
-    @property
-    def clusters(self) -> _Clusters:
-        if self._clusters is None:
-            self._clusters = self._create_clusters()
-        return self._clusters
-
-    @property
-    def last_good_merge_levels(self) -> dict[int, int]:
-        if self._last_good_merge_levels is None:
-            self._last_good_merge_levels = self._get_last_merge_levels()
-        return self._last_good_merge_levels
-
-    @classmethod
-    def _validate_data(cls, x: NDArray):
-        """Checks that the data has the correct size, shape, and format"""
-        if not isinstance(x, np.ndarray):
-            raise TypeError(f"Data should be of type NDArray; got {type(x)}")
-
-        if x.ndim != 2:
-            raise ValueError(
-                f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
-            )
-        samples, features = x.shape  # Due to above check, we know shape has a length of 2
-        if samples < 2:
-            raise ValueError(f"Data should have at least 2 samples; got {samples}")
-        if features < 1:
-            raise ValueError(f"Samples should have at least 1 feature; got {features}")
-
-    def _create_clusters(self) -> _Clusters:
-        """Generates clusters based on linkage matrix"""
-        next_cluster_id = 0
-        cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-        clusters: _Clusters = _Clusters()
-
-        # Walking through the linkage array to generate clusters
-        for arr_i in self._larr:
-            left_id = int(arr_i[0])
-            right_id = int(arr_i[1])
-            sample_dist = np.array([arr_i[2]], dtype=np.float32)
-            merged = False
-
-            # Determine if the id is already associated with a cluster
-            left = cluster_map.get(left_id)
-            right = cluster_map.get(right_id)
-
-            if left and right:
-                merged = max([left.cid, right.cid])
-                lc = clusters[left.level][left.cid]
-                rc = clusters[right.level][right.cid]
-                left_first = len(lc.samples) >= len(rc.samples)
-                samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
-                sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
-                level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
-
-                # Only tracking the levels in which clusters merge for the cluster distance matrix
-                clusters.max_level = max(clusters.max_level, left.level, right.level)
-                # Update clusters to include previously skipped levels
-                clusters = self._fill_levels(clusters, left, right)
-            elif left or right:
-                child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
-                cc = clusters[child.level][child.cid]
-                samples = np.concatenate([cc.samples, [other_id]])
-                sample_dist = np.concatenate([cc.sample_dist, sample_dist])
-                level, cid = child.level + 1, child.cid
-            else:
-                samples = np.array([left_id, right_id], dtype=np.int32)
-                level, cid = 0, next_cluster_id
-                next_cluster_id += 1
-
-            # Set the cluster and associate the linkage id with the cluster
-            if level not in clusters:
-                clusters[level] = {}
-
-            clusters[level][cid] = _Cluster(merged, samples, sample_dist)
-            cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
-
-        return clusters
-
-    def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
-        # Sets each level's cluster info if it does not exist
-        if left.level != right.level:
-            (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
-            cluster = clusters[level][cid].copy()
-            for level_id in range(max_level, level, -1):
-                clusters[level_id].setdefault(cid, cluster)
-        return clusters
-
-    def _get_cluster_distances(self) -> NDArray:
-        """Calculates the minimum distances between clusters are each level"""
-        # Cluster distance matrix
-        max_level = self.clusters.max_level
-        cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
-
-        for level, cluster_set in self.clusters.items():
-            if level < max_level:
-                cluster_ids = sorted(cluster_set.keys())
-                for i, cluster_id in enumerate(cluster_ids):
-                    cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
-                    for int_id in range(i + 1, len(cluster_ids)):
-                        compare_id = cluster_ids[int_id]
-                        sample_a = self.clusters[level][cluster_id].samples
-                        sample_b = self.clusters[level][compare_id].samples
-                        min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
-                        cluster_matrix[level, cluster_id, compare_id] = min_mat
-                        cluster_matrix[level, compare_id, cluster_id] = min_mat
-
-        return cluster_matrix
-
-    def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
-        """
-        Determine what clusters should be merged and return their indices
-        """
-        intra_max_uniques = np.unique(intra_max)
-        intra_log_values = np.log(intra_max_uniques)
-        two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
-        merge_value = np.log(merge_mean)
-        # Mask of indices we know we want to merge
-        desired_merge = merge_value < two_std_all
-
-        # List[Values] for indices we might want to merge
-        check = merge_value[~desired_merge]
-        # Check distance from value to 2 stds of all values
-        check = np.abs((check - two_std_all) / two_std_all)
-        # Mask List[Values < 1]
-        mask = check < 1
-        one_std_check = check[mask].mean() + check[mask].std()
-        # Mask of indices that should also be merged
-        mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
-        mask2 = mask2_vals < one_std_check
-        return np.logical_or(desired_merge, mask2)
-
-    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
-        """
-        Runs through the clusters dictionary determining when clusters merge,
-        and how close are those clusters when they merge.
-
-        Parameters
-        ----------
-        cluster_matrix:
-            The distance matrix for all clusters to all others
-
-        Returns
-        -------
-        List[ClusterMergeEntry]:
-            A list with each cluster's merge history
-        """
-        intra_max = []
-        merge_mean = []
-        merge_list: list[_ClusterMergeEntry] = []
-
-        for level, cluster_set in self.clusters.items():
-            for outer_cluster, cluster in cluster_set.items():
-                inner_cluster = cluster.merged
-                if not inner_cluster:
-                    continue
-                # Extract necessary information
-                num_samples = len(cluster.samples)
-                out1 = cluster.out1
-                out2 = cluster.out2
-
-                # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
-                aggregate_func = (
-                    np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
-                )
-
-                distances = cluster_matrix[:level, outer_cluster, inner_cluster]
-                intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
-                positive_mask = intra_distance >= 0
-                intra_filtered = intra_distance[positive_mask]
-
-                # TODO: Append now, take max over axis later?
-                intra_max.append(np.max(intra_filtered))
-                # Calculate the corresponding distance stats
-                distance_stats_arr = aggregate_func(distances)
-                merge_mean.append(distance_stats_arr)
-                merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
-
-        all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
-
-        for i, is_mergeable in enumerate(all_merge_indices):
-            merge_list[i].status = is_mergeable
-
-        merge_list = sorted(merge_list, reverse=True)
-
-        return merge_list
-
-    def _get_last_merge_levels(self) -> dict[int, int]:
-        """
-        Creates a dictionary for important cluster ids mapped to their last good merge level
-
-        Returns
-        -------
-        Dict[int, int]
-            A mapping of a cluster id to its last good merge level
-        """
-        last_merge_levels: dict[int, int] = {}
-
-        if self._max_clusters <= 1:
-            last_merge_levels = {0: int(self._num_samples * 0.1)}
-        else:
-            cluster_matrix = self._get_cluster_distances()
-            merge_list = self._generate_merge_list(cluster_matrix)
-            for entry in merge_list:
-                if not entry.status:
-                    if entry.outer_cluster not in last_merge_levels:
-                        last_merge_levels[entry.outer_cluster] = 0
-                    if entry.inner_cluster not in last_merge_levels:
-                        last_merge_levels[entry.inner_cluster] = 0
-                    if last_merge_levels[entry.outer_cluster] > entry.level:
-                        last_merge_levels[entry.outer_cluster] = entry.level - 1
-                else:
-                    if entry.outer_cluster in last_merge_levels:
-                        last_merge_levels[entry.outer_cluster] = max(
-                            last_merge_levels[entry.outer_cluster], entry.level
-                        )
-
-        return last_merge_levels
-
-    def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
-        """
-        Retrieves Outliers based on when the sample was added to the cluster
-        and how far it was from the cluster when it was added
-
-        Parameters
-        ----------
-        last_merge_levels : Dict[int, int]
-            A mapping of a cluster id to its last good merge level
-
-        Returns
-        -------
-        Tuple[List[int], List[int]]
-            The outliers and possible outliers as sorted lists of indices
-        """
-        outliers = set()
-        possible_outliers = set()
-        already_seen = set()
-        last_level = {}
-
-        for level, cluster_set in self.clusters.items():
-            for cluster_id, cluster in cluster_set.items():
-                if cluster_id in last_merge_levels:
-                    last_level[cluster_id] = level
-
-        for level, cluster_set in self.clusters.items():
-            for cluster_id, cluster in cluster_set.items():
-                if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
-                    if cluster_id in already_seen and cluster.samples[-1] not in outliers:
-                        outliers.add(cluster.samples[-1])
-                    elif cluster.out2:
-                        if len(cluster.samples) < self._min_num_samples_per_cluster:
-                            outliers.update(cluster.samples.tolist())
-                        elif cluster.samples[-1] not in outliers:
-                            outliers.add(cluster.samples[-1])
-                        if cluster_id not in already_seen:
-                            already_seen.add(cluster_id)
-                    elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
-                        possible_outliers.add(cluster.samples[-1])
-                    elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
-                        outliers.update(cluster.samples.tolist())
-
-        return sorted(outliers), sorted(possible_outliers)
-
-    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
-        """Merges and sorts groups of indices that share any common index"""
-        groups: list[list[int]] = []
-        for indices in zip(*index_groups):
-            indices = set(indices)
-            temp = []
-            for group in groups:
-                if not set(group).isdisjoint(indices):
-                    indices.update(group)
-                else:
-                    temp.append(group)
-            temp.append(sorted(indices))
-            groups = temp
-        return sorted(groups)
-
-    def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
-        """
-        Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
-
-        Parameters
-        ----------
-        last_merge_levels : Dict[int, int]
-            A mapping of a cluster id to its last good merge level
-
-        Returns
-        -------
-        Tuple[List[List[int]], List[List[int]]]
-            The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
-        """
-
-        duplicates_std = []
-        for cluster_id, level in last_merge_levels.items():
-            samples = self.clusters[level][cluster_id].samples
-            if len(samples) >= self._min_num_samples_per_cluster:
-                duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-        diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
-        np.fill_diagonal(diag_mask, 0)
-        diag_mask = np.triu(diag_mask)
-
-        exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
-        exact_indices = np.nonzero(exact_mask & diag_mask)
-        exact_dupes = self._sorted_union_find(exact_indices)
-
-        near_mask = self._sqdmat <= np.mean(duplicates_std)
-        near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
-        near_dupes = self._sorted_union_find(near_indices)
-
-        return exact_dupes, near_dupes
-
-    # TODO: Move data input to evaluate from class
-    @set_metadata(state=["data"])
-    def evaluate(self) -> ClustererOutput:
-        """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
-
-        Returns
-        -------
-        ClustererOutput
-            The Outliers and duplicate indices found in the data
-
-        Example
-        -------
-        >>> cluster = Clusterer(clusterer_images)
-        >>> cluster.evaluate()
-        ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
-        """  # noqa: E501
-
-        outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-        duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
-
-        return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)
dataeval/detectors/linters/merged_stats.py (removed)
@@ -1,49 +0,0 @@
-from __future__ import annotations
-
-__all__ = []
-
-from copy import deepcopy
-from typing import Sequence, TypeVar
-
-import numpy as np
-
-from dataeval.metrics.stats.base import BaseStatsOutput
-
-TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
-
-
-def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
-    if type(a) is not type(b):
-        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-
-    sum_dict = deepcopy(a.dict())
-
-    for k in sum_dict:
-        if isinstance(sum_dict[k], list):
-            sum_dict[k].extend(b.dict()[k])
-        else:
-            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
-
-    return type(a)(**sum_dict)
-
-
-def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
-    output = None
-    dataset_steps = []
-    cur_len = 0
-    for s in stats:
-        output = s if output is None else add_stats(output, s)
-        cur_len += len(s)
-        dataset_steps.append(cur_len)
-    if output is None:
-        raise TypeError("Cannot combine empty sequence of stats.")
-    return output, dataset_steps
-
-
-def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
-    last_step = 0
-    for i, step in enumerate(dataset_steps):
-        if idx < step:
-            return i, idx - last_step
-        last_step = step
-    return -1, idx
dataeval/detectors/ood/metadata_least_likely.py (removed)
@@ -1,119 +0,0 @@
-from __future__ import annotations
-
-__all__ = []
-
-import numbers
-import warnings
-from typing import Any
-
-import numpy as np
-from numpy.typing import NDArray
-
-
-def get_least_likely_features(
-    metadata: dict[str, list[Any] | NDArray[Any]],
-    new_metadata: dict[str, list[Any] | NDArray[Any]],
-    is_ood: NDArray[np.bool_],
-) -> list[tuple[str, float]]:
-    """Computes which metadata feature is most out-of-distribution (OOD) relative to a reference metadata set.
-
-    Given a reference metadata dictionary `metadata` (where each key maps to one scalar metadata feature), a second
-    metadata dictionary, and a corresponding boolean flag `is_ood` indicating whether each new example falls
-    out-of-distribution (OOD) relative to the reference, this function finds which metadata feature is the most OOD,
-    for each OOD example.
-
-    Parameters
-    ----------
-    metadata: dict[str, list[Any] | NDArray[Any]]
-        A reference set of arrays of values, indexed by metadata feature names, with one value per data example per
-        feature.
-    new_metadata: dict[str, list[Any] | NDArray[Any]]
-        A second metadata set, to be tested against the reference metadata. It is ok if the two meta data objects
-        hold different numbers of examples.
-    is_ood: NDArray[np.bool_]
-        A boolean array, with one value per new_metadata example, that indicates which examples are OOD.
-
-    Returns
-    -------
-    list[tuple[str, float]]
-        An array of names of the features of each OOD new_metadata example that were the most OOD.
-
-    Examples
-    --------
-    Imagine we have 3 data examples, and that the corresponding metadata contains 2 features called time and
-    altitude, as shown below.
-
-    >>> metadata = {"time": [1.2, 3.4, 5.6], "altitude": [235, 6789, 101112]}
-    >>> new_metadata = {"time": [7.8, 11.12], "altitude": [532, -211101]}
-    >>> is_ood = np.array([True, True])
-    >>> get_least_likely_features(metadata, new_metadata, is_ood)
-    [('time', 2.0), ('altitude', 33.245346)]
-    """
-    # Raise errors for bad inputs...
-
-    if metadata.keys() != new_metadata.keys():
-        raise ValueError(f"Reference and test metadata keys must be identical: {list(metadata)}, {list(new_metadata)}")
-
-    md_lengths = {len(np.atleast_1d(v)) for v in metadata.values()}
-    new_md_lengths = {len(np.atleast_1d(v)) for v in new_metadata.values()}
-    if len(md_lengths) > 1 or len(new_md_lengths) > 1:
-        raise ValueError(f"All features must have same length, got lengths {md_lengths}, {new_md_lengths}")
-
-    n_reference, n_new = md_lengths.pop(), new_md_lengths.pop()  # possibly different numbers of metadata examples
-
-    if n_new != len(is_ood):
-        raise ValueError(f"is_ood flag must have same length as new metadata {n_new} but has length {len(is_ood)}.")
-
-    if n_reference < 3:  # too hard to define "in-distribution" with this few reference samples.
-        warnings.warn(
-            "We need at least 3 reference metadata examples to determine which "
-            f"features are least likely, but only got {n_reference}",
-            UserWarning,
-        )
-        return []
-
-    if not any(is_ood):
-        return []
-
-    # ...inputs are good, look for most deviant standardized features.
-
-    # largest standardized absolute deviation from the median observed so far for each example
-    deviation = np.zeros_like(is_ood, dtype=np.float32)
-
-    # name of feature that corresponds to `deviation` for each example
-    kmax = np.empty(len(is_ood), dtype=object)
-
-    for k, v in metadata.items():
-        # exclude cases where random happens to be out on tails, not interesting.
-        if k == "random":
-            continue
-
-        # Skip non-numerical features
-        if not all(isinstance(vi, numbers.Number) for vi in v):  # NB: np.nan *is* a number in this context.
-            continue
-
-        # Get standardization parameters from metadata
-        loc = np.median(v)  # ok, because we checked all were numeric
-        dev = np.asarray(v) - loc  # need to make array from v since it could be a list here.
-        posdev, negdev = dev[dev > 0], dev[dev < 0]
-        pos_scale = np.median(posdev) if posdev.any() else 1.0
-        neg_scale = np.abs(np.median(negdev)) if negdev.any() else 1.0
-
-        x, x0, dxp, dxn = np.atleast_1d(new_metadata[k]), loc, pos_scale, neg_scale  # just abbreviations
-        dxp = dxp if dxp > 0 else 1.0  # avoids dividing by zero below
-        dxn = dxn if dxn > 0 else 1.0
-
-        # xdev must be floating-point to avoid getting zero in an integer division.
-        xdev = (x - x0).astype(np.float64)
-        pos = xdev >= 0
-
-        X = np.zeros_like(xdev)
-        X[pos], X[~pos] = xdev[pos] / dxp, xdev[~pos] / dxn  # keeping track of possible asymmetry of x, but...
-        # ...below here, only need to think about absolute deviation.
-
-        abig = np.abs(X) > deviation
-        kmax[abig] = k
-        deviation[abig] = np.abs(X[abig])
-
-    unlikely_features = list(zip(kmax[is_ood], deviation[is_ood]))  # feature names, along with how far out they are.
-    return unlikely_features