dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. The information is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (96)
  1. dataeval/__init__.py +3 -3
  2. dataeval/{output.py → _output.py} +14 -0
  3. dataeval/config.py +77 -0
  4. dataeval/detectors/__init__.py +1 -1
  5. dataeval/detectors/drift/__init__.py +6 -6
  6. dataeval/detectors/drift/{base.py → _base.py} +41 -30
  7. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  8. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  9. dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
  10. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  11. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
  12. dataeval/detectors/drift/updates.py +1 -1
  13. dataeval/detectors/linters/__init__.py +0 -3
  14. dataeval/detectors/linters/duplicates.py +17 -8
  15. dataeval/detectors/linters/outliers.py +52 -43
  16. dataeval/detectors/ood/ae.py +29 -8
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/metadata_ks_compare.py +1 -1
  19. dataeval/detectors/ood/mixin.py +20 -5
  20. dataeval/detectors/ood/output.py +1 -1
  21. dataeval/detectors/ood/vae.py +73 -0
  22. dataeval/metadata/__init__.py +5 -0
  23. dataeval/metadata/_ood.py +238 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +5 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
  27. dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
  29. dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
  30. dataeval/metrics/estimators/__init__.py +14 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
  32. dataeval/metrics/estimators/_clusterer.py +104 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
  35. dataeval/metrics/stats/__init__.py +7 -7
  36. dataeval/metrics/stats/{base.py → _base.py} +52 -16
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
  38. dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
  39. dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
  40. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
  41. dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
  44. dataeval/typing.py +54 -0
  45. dataeval/utils/__init__.py +2 -2
  46. dataeval/utils/_array.py +169 -0
  47. dataeval/utils/_bin.py +199 -0
  48. dataeval/utils/_clusterer.py +144 -0
  49. dataeval/utils/_fast_mst.py +189 -0
  50. dataeval/utils/{image.py → _image.py} +6 -4
  51. dataeval/utils/_method.py +18 -0
  52. dataeval/utils/{shared.py → _mst.py} +3 -65
  53. dataeval/utils/{plot.py → _plot.py} +4 -4
  54. dataeval/utils/data/__init__.py +22 -0
  55. dataeval/utils/data/_embeddings.py +105 -0
  56. dataeval/utils/data/_images.py +65 -0
  57. dataeval/utils/data/_metadata.py +352 -0
  58. dataeval/utils/data/_selection.py +119 -0
  59. dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
  60. dataeval/utils/data/_targets.py +73 -0
  61. dataeval/utils/data/_types.py +58 -0
  62. dataeval/utils/data/collate.py +103 -0
  63. dataeval/utils/data/datasets/__init__.py +17 -0
  64. dataeval/utils/data/datasets/_base.py +254 -0
  65. dataeval/utils/data/datasets/_cifar10.py +134 -0
  66. dataeval/utils/data/datasets/_fileio.py +168 -0
  67. dataeval/utils/data/datasets/_milco.py +153 -0
  68. dataeval/utils/data/datasets/_mixin.py +56 -0
  69. dataeval/utils/data/datasets/_mnist.py +183 -0
  70. dataeval/utils/data/datasets/_ships.py +123 -0
  71. dataeval/utils/data/datasets/_voc.py +352 -0
  72. dataeval/utils/data/selections/__init__.py +15 -0
  73. dataeval/utils/data/selections/_classfilter.py +60 -0
  74. dataeval/utils/data/selections/_indices.py +26 -0
  75. dataeval/utils/data/selections/_limit.py +26 -0
  76. dataeval/utils/data/selections/_reverse.py +18 -0
  77. dataeval/utils/data/selections/_shuffle.py +29 -0
  78. dataeval/utils/metadata.py +198 -376
  79. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  80. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  81. dataeval/utils/torch/models.py +43 -2
  82. dataeval/workflows/sufficiency.py +10 -9
  83. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
  84. dataeval-0.81.0.dist-info/RECORD +94 -0
  85. dataeval/detectors/linters/clusterer.py +0 -512
  86. dataeval/detectors/linters/merged_stats.py +0 -49
  87. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  88. dataeval/interop.py +0 -69
  89. dataeval/utils/dataset/__init__.py +0 -7
  90. dataeval/utils/dataset/datasets.py +0 -412
  91. dataeval/utils/dataset/read.py +0 -63
  92. dataeval-0.76.0.dist-info/RECORD +0 -67
  93. /dataeval/{log.py → _log.py} +0 -0
  94. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  95. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
  96. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
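A file-level listing like the one above (including the module renames such as output.py → _output.py and drift/mmd.py → drift/_mmd.py) can be reproduced locally by comparing the RECORD manifests of the two wheels. A minimal sketch, assuming both wheel files have been downloaded to the working directory; the filenames follow standard wheel naming and are not taken from this diff:

import zipfile

def record_paths(wheel_path: str) -> set[str]:
    """Return the set of file paths listed in a wheel's *.dist-info/RECORD manifest."""
    with zipfile.ZipFile(wheel_path) as zf:
        record_name = next(n for n in zf.namelist() if n.endswith(".dist-info/RECORD"))
        lines = zf.read(record_name).decode("utf-8").splitlines()
    # RECORD rows are "path,hash,size"; keep only the path column
    return {line.split(",", 1)[0] for line in lines if line.strip()}

old = record_paths("dataeval-0.76.0-py3-none-any.whl")
new = record_paths("dataeval-0.81.0-py3-none-any.whl")
print("added:", sorted(new - old))
print("removed:", sorted(old - new))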
dataeval/detectors/linters/clusterer.py (file removed)
@@ -1,512 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from dataclasses import dataclass
- from typing import Any, Iterable, NamedTuple, cast
-
- import numpy as np
- from numpy.typing import ArrayLike, NDArray
- from scipy.cluster.hierarchy import linkage
- from scipy.spatial.distance import pdist, squareform
-
- from dataeval.interop import to_numpy
- from dataeval.output import Output, set_metadata
- from dataeval.utils.shared import flatten
-
-
- @dataclass(frozen=True)
- class ClustererOutput(Output):
-     """
-     Output class for :class:`Clusterer` lint detector.
-
-     Attributes
-     ----------
-     outliers : List[int]
-         Indices that do not fall within a cluster
-     potential_outliers : List[int]
-         Indices which are near the border between belonging in the cluster and being an outlier
-     duplicates : List[List[int]]
-         Groups of indices that are exact :term:`duplicates<Duplicates>`
-     potential_duplicates : List[List[int]]
-         Groups of indices which are not exact but closely related data points
-     """
-
-     outliers: list[int]
-     potential_outliers: list[int]
-     duplicates: list[list[int]]
-     potential_duplicates: list[list[int]]
-
-
- def _extend_linkage(link_arr: NDArray) -> NDArray:
-     """
-     Adds a column to the linkage matrix link_arr that tracks the new id assigned
-     to each row
-
-     Parameters
-     ----------
-     link_arr : NDArray
-         linkage matrix
-
-     Returns
-     -------
-     NDArray
-         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
-     """
-     # Adjusting linkage matrix to accommodate renumbering
-     rows, cols = link_arr.shape
-     arr = np.zeros((rows, cols + 1))
-     arr[:, :-1] = link_arr
-     arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
-
-     return arr
-
-
- class _Cluster:
-     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
-
-     def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
-         self.merged = merged
-         self.samples = np.array(samples, dtype=np.int32)
-         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
-         self.is_copy = is_copy
-
-         dist = float(self.sample_dist[-1])
-
-         self.count = len(self.samples)
-         if is_copy:
-             self.dist_avg = 0.0
-             self.dist_std = 0.0
-             self.out1 = False
-             self.out2 = False
-         else:
-             self.dist_avg = float(np.mean(self.sample_dist))
-             self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
-             out1 = self.dist_avg + self.dist_std
-             out2 = out1 + self.dist_std
-             self.out1 = dist > out1
-             self.out2 = dist > out2
-
-     def copy(self) -> _Cluster:
-         return _Cluster(False, self.samples, self.sample_dist, True)
-
-     def __repr__(self) -> str:
-         _params = {
-             "merged": self.merged,
-             "samples": self.samples,
-             "sample_dist": self.sample_dist,
-             "is_copy": self.is_copy,
-         }
-         return f"{self.__class__.__name__}(**{repr(_params)})"
-
-
- class _Clusters(dict[int, dict[int, _Cluster]]):
-     def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
-         super().__init__(*args)
-         self.max_level: int = 1
-
-
- class _ClusterPosition(NamedTuple):
-     """Keeps track of a cluster's level and ID"""
-
-     level: int
-     cid: int
-
-
- class _ClusterMergeEntry:
-     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
-
-     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
-         self.level = level
-         self.outer_cluster = outer_cluster
-         self.inner_cluster = inner_cluster
-         self.status = status
-
-     def __lt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__lt__(value.level)
-
-     def __gt__(self, value: _ClusterMergeEntry) -> bool:
-         return self.level.__gt__(value.level)
-
-
- class Clusterer:
-     """
-     Uses hierarchical clustering to flag dataset properties of interest like outliers \
-     and :term:`duplicates<Duplicates>`.
-
-     Parameters
-     ----------
-     dataset : ArrayLike, shape - (N, P)
-         A dataset in an ArrayLike format.
-         Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
-
-     Warning
-     -------
-     The Clusterer class is heavily dependent on computational resources, and may fail due to insufficient memory.
-
-     Note
-     ----
-     The Clusterer works best when the length of the feature dimension, P, is less than 500.
-     If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
-     """
-
-     def __init__(self, dataset: ArrayLike) -> None:
-         # Allows an update to dataset to reset the state rather than instantiate a new class
-         self._on_init(dataset)
-
-     def _on_init(self, dataset: ArrayLike):
-         self._data: NDArray[Any] = flatten(to_numpy(dataset))
-         self._validate_data(self._data)
-         self._num_samples = len(self._data)
-
-         self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
-         self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
-         self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
-         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
-
-         min_num = int(self._num_samples * 0.05)
-         self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
-
-         self._clusters: _Clusters | None = None
-         self._last_good_merge_levels: dict[int, int] | None = None
-
-     @property
-     def data(self) -> NDArray[Any]:
-         return self._data
-
-     @data.setter
-     def data(self, x: ArrayLike) -> None:
-         self._on_init(x)
-
-     @property
-     def clusters(self) -> _Clusters:
-         if self._clusters is None:
-             self._clusters = self._create_clusters()
-         return self._clusters
-
-     @property
-     def last_good_merge_levels(self) -> dict[int, int]:
-         if self._last_good_merge_levels is None:
-             self._last_good_merge_levels = self._get_last_merge_levels()
-         return self._last_good_merge_levels
-
-     @classmethod
-     def _validate_data(cls, x: NDArray):
-         """Checks that the data has the correct size, shape, and format"""
-         if not isinstance(x, np.ndarray):
-             raise TypeError(f"Data should be of type NDArray; got {type(x)}")
-
-         if x.ndim != 2:
-             raise ValueError(
-                 f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
-             )
-         samples, features = x.shape  # Due to above check, we know shape has a length of 2
-         if samples < 2:
-             raise ValueError(f"Data should have at least 2 samples; got {samples}")
-         if features < 1:
-             raise ValueError(f"Samples should have at least 1 feature; got {features}")
-
-     def _create_clusters(self) -> _Clusters:
-         """Generates clusters based on linkage matrix"""
-         next_cluster_id = 0
-         cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-         clusters: _Clusters = _Clusters()
-
-         # Walking through the linkage array to generate clusters
-         for arr_i in self._larr:
-             left_id = int(arr_i[0])
-             right_id = int(arr_i[1])
-             sample_dist = np.array([arr_i[2]], dtype=np.float32)
-             merged = False
-
-             # Determine if the id is already associated with a cluster
-             left = cluster_map.get(left_id)
-             right = cluster_map.get(right_id)
-
-             if left and right:
-                 merged = max([left.cid, right.cid])
-                 lc = clusters[left.level][left.cid]
-                 rc = clusters[right.level][right.cid]
-                 left_first = len(lc.samples) >= len(rc.samples)
-                 samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
-                 sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
-                 level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
-
-                 # Only tracking the levels in which clusters merge for the cluster distance matrix
-                 clusters.max_level = max(clusters.max_level, left.level, right.level)
-                 # Update clusters to include previously skipped levels
-                 clusters = self._fill_levels(clusters, left, right)
-             elif left or right:
-                 child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
-                 cc = clusters[child.level][child.cid]
-                 samples = np.concatenate([cc.samples, [other_id]])
-                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
-                 level, cid = child.level + 1, child.cid
-             else:
-                 samples = np.array([left_id, right_id], dtype=np.int32)
-                 level, cid = 0, next_cluster_id
-                 next_cluster_id += 1
-
-             # Set the cluster and associate the linkage id with the cluster
-             if level not in clusters:
-                 clusters[level] = {}
-
-             clusters[level][cid] = _Cluster(merged, samples, sample_dist)
-             cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
-
-         return clusters
-
-     def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
-         # Sets each level's cluster info if it does not exist
-         if left.level != right.level:
-             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
-             cluster = clusters[level][cid].copy()
-             for level_id in range(max_level, level, -1):
-                 clusters[level_id].setdefault(cid, cluster)
-         return clusters
-
-     def _get_cluster_distances(self) -> NDArray:
-         """Calculates the minimum distances between clusters are each level"""
-         # Cluster distance matrix
-         max_level = self.clusters.max_level
-         cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
-
-         for level, cluster_set in self.clusters.items():
-             if level < max_level:
-                 cluster_ids = sorted(cluster_set.keys())
-                 for i, cluster_id in enumerate(cluster_ids):
-                     cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
-                     for int_id in range(i + 1, len(cluster_ids)):
-                         compare_id = cluster_ids[int_id]
-                         sample_a = self.clusters[level][cluster_id].samples
-                         sample_b = self.clusters[level][compare_id].samples
-                         min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
-                         cluster_matrix[level, cluster_id, compare_id] = min_mat
-                         cluster_matrix[level, compare_id, cluster_id] = min_mat
-
-         return cluster_matrix
-
-     def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
-         """
-         Determine what clusters should be merged and return their indices
-         """
-         intra_max_uniques = np.unique(intra_max)
-         intra_log_values = np.log(intra_max_uniques)
-         two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
-         merge_value = np.log(merge_mean)
-         # Mask of indices we know we want to merge
-         desired_merge = merge_value < two_std_all
-
-         # List[Values] for indices we might want to merge
-         check = merge_value[~desired_merge]
-         # Check distance from value to 2 stds of all values
-         check = np.abs((check - two_std_all) / two_std_all)
-         # Mask List[Values < 1]
-         mask = check < 1
-         one_std_check = check[mask].mean() + check[mask].std()
-         # Mask of indices that should also be merged
-         mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
-         mask2 = mask2_vals < one_std_check
-         return np.logical_or(desired_merge, mask2)
-
-     def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
-         """
-         Runs through the clusters dictionary determining when clusters merge,
-         and how close are those clusters when they merge.
-
-         Parameters
-         ----------
-         cluster_matrix:
-             The distance matrix for all clusters to all others
-
-         Returns
-         -------
-         List[ClusterMergeEntry]:
-             A list with each cluster's merge history
-         """
-         intra_max = []
-         merge_mean = []
-         merge_list: list[_ClusterMergeEntry] = []
-
-         for level, cluster_set in self.clusters.items():
-             for outer_cluster, cluster in cluster_set.items():
-                 inner_cluster = cluster.merged
-                 if not inner_cluster:
-                     continue
-                 # Extract necessary information
-                 num_samples = len(cluster.samples)
-                 out1 = cluster.out1
-                 out2 = cluster.out2
-
-                 # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
-                 aggregate_func = (
-                     np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
-                 )
-
-                 distances = cluster_matrix[:level, outer_cluster, inner_cluster]
-                 intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
-                 positive_mask = intra_distance >= 0
-                 intra_filtered = intra_distance[positive_mask]
-
-                 # TODO: Append now, take max over axis later?
-                 intra_max.append(np.max(intra_filtered))
-                 # Calculate the corresponding distance stats
-                 distance_stats_arr = aggregate_func(distances)
-                 merge_mean.append(distance_stats_arr)
-                 merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
-
-         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
-
-         for i, is_mergeable in enumerate(all_merge_indices):
-             merge_list[i].status = is_mergeable
-
-         merge_list = sorted(merge_list, reverse=True)
-
-         return merge_list
-
-     def _get_last_merge_levels(self) -> dict[int, int]:
-         """
-         Creates a dictionary for important cluster ids mapped to their last good merge level
-
-         Returns
-         -------
-         Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-         """
-         last_merge_levels: dict[int, int] = {}
-
-         if self._max_clusters <= 1:
-             last_merge_levels = {0: int(self._num_samples * 0.1)}
-         else:
-             cluster_matrix = self._get_cluster_distances()
-             merge_list = self._generate_merge_list(cluster_matrix)
-             for entry in merge_list:
-                 if not entry.status:
-                     if entry.outer_cluster not in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = 0
-                     if entry.inner_cluster not in last_merge_levels:
-                         last_merge_levels[entry.inner_cluster] = 0
-                     if last_merge_levels[entry.outer_cluster] > entry.level:
-                         last_merge_levels[entry.outer_cluster] = entry.level - 1
-                 else:
-                     if entry.outer_cluster in last_merge_levels:
-                         last_merge_levels[entry.outer_cluster] = max(
-                             last_merge_levels[entry.outer_cluster], entry.level
-                         )
-
-         return last_merge_levels
-
-     def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
-         """
-         Retrieves Outliers based on when the sample was added to the cluster
-         and how far it was from the cluster when it was added
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[int], List[int]]
-             The outliers and possible outliers as sorted lists of indices
-         """
-         outliers = set()
-         possible_outliers = set()
-         already_seen = set()
-         last_level = {}
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if cluster_id in last_merge_levels:
-                     last_level[cluster_id] = level
-
-         for level, cluster_set in self.clusters.items():
-             for cluster_id, cluster in cluster_set.items():
-                 if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
-                     if cluster_id in already_seen and cluster.samples[-1] not in outliers:
-                         outliers.add(cluster.samples[-1])
-                     elif cluster.out2:
-                         if len(cluster.samples) < self._min_num_samples_per_cluster:
-                             outliers.update(cluster.samples.tolist())
-                         elif cluster.samples[-1] not in outliers:
-                             outliers.add(cluster.samples[-1])
-                         if cluster_id not in already_seen:
-                             already_seen.add(cluster_id)
-                     elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
-                         possible_outliers.add(cluster.samples[-1])
-                     elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
-                         outliers.update(cluster.samples.tolist())
-
-         return sorted(outliers), sorted(possible_outliers)
-
-     def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
-         """Merges and sorts groups of indices that share any common index"""
-         groups: list[list[int]] = []
-         for indices in zip(*index_groups):
-             indices = set(indices)
-             temp = []
-             for group in groups:
-                 if not set(group).isdisjoint(indices):
-                     indices.update(group)
-                 else:
-                     temp.append(group)
-             temp.append(sorted(indices))
-             groups = temp
-         return sorted(groups)
-
-     def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
-         """
-         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
-
-         Parameters
-         ----------
-         last_merge_levels : Dict[int, int]
-             A mapping of a cluster id to its last good merge level
-
-         Returns
-         -------
-         Tuple[List[List[int]], List[List[int]]]
-             The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
-         """
-
-         duplicates_std = []
-         for cluster_id, level in last_merge_levels.items():
-             samples = self.clusters[level][cluster_id].samples
-             if len(samples) >= self._min_num_samples_per_cluster:
-                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-         diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
-         np.fill_diagonal(diag_mask, 0)
-         diag_mask = np.triu(diag_mask)
-
-         exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
-         exact_indices = np.nonzero(exact_mask & diag_mask)
-         exact_dupes = self._sorted_union_find(exact_indices)
-
-         near_mask = self._sqdmat <= np.mean(duplicates_std)
-         near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
-         near_dupes = self._sorted_union_find(near_indices)
-
-         return exact_dupes, near_dupes
-
-     # TODO: Move data input to evaluate from class
-     @set_metadata(state=["data"])
-     def evaluate(self) -> ClustererOutput:
-         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
-
-         Returns
-         -------
-         ClustererOutput
-             The Outliers and duplicate indices found in the data
-
-         Example
-         -------
-         >>> cluster = Clusterer(clusterer_images)
-         >>> cluster.evaluate()
-         ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
-         """  # noqa: E501
-
-         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
-         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
-
-         return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)
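For readers migrating off 0.76.x: the documented entry point of the removed class was evaluate(), as the docstring example above shows. A minimal usage sketch against 0.76.x, assuming Clusterer was re-exported from dataeval.detectors.linters (consistent with the three export lines dropped from that package's __init__.py above) and using random data as a stand-in for clusterer_images:

import numpy as np
from dataeval.detectors.linters import Clusterer  # 0.76.x import path (assumed re-export)

rng = np.random.default_rng(0)
images = rng.random((50, 1, 16, 16))  # N=50; flattened P = 1*16*16 = 256, under the ~500 guideline in the Note

clusterer = Clusterer(images)   # computes pdist, squareform and linkage up front
result = clusterer.evaluate()   # ClustererOutput dataclass
print(result.outliers)          # indices that never settle into a cluster
print(result.duplicates)        # groups of indices at near-zero pairwise distance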
dataeval/detectors/linters/merged_stats.py (file removed)
@@ -1,49 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- from copy import deepcopy
- from typing import Sequence, TypeVar
-
- import numpy as np
-
- from dataeval.metrics.stats.base import BaseStatsOutput
-
- TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
-
-
- def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
-     if type(a) is not type(b):
-         raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
-
-     sum_dict = deepcopy(a.dict())
-
-     for k in sum_dict:
-         if isinstance(sum_dict[k], list):
-             sum_dict[k].extend(b.dict()[k])
-         else:
-             sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
-
-     return type(a)(**sum_dict)
-
-
- def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
-     output = None
-     dataset_steps = []
-     cur_len = 0
-     for s in stats:
-         output = s if output is None else add_stats(output, s)
-         cur_len += len(s)
-         dataset_steps.append(cur_len)
-     if output is None:
-         raise TypeError("Cannot combine empty sequence of stats.")
-     return output, dataset_steps
-
-
- def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
-     last_step = 0
-     for i, step in enumerate(dataset_steps):
-         if idx < step:
-             return i, idx - last_step
-         last_step = step
-     return -1, idx
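The removed helpers handled the bookkeeping for concatenating several stats outputs: dataset_steps records the cumulative item count after each source, and get_dataset_step_from_idx maps a flat index in the combined output back to (source dataset, index within it). A small self-contained illustration of that mapping, copied from the function above with example values chosen for this note:

def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
    # dataset_steps is cumulative: datasets of length 10, 5 and 20 give [10, 15, 35]
    last_step = 0
    for i, step in enumerate(dataset_steps):
        if idx < step:
            return i, idx - last_step
        last_step = step
    return -1, idx  # sentinel for an index beyond the combined length

assert get_dataset_step_from_idx(3, [10, 15, 35]) == (0, 3)     # item 3 of the first dataset
assert get_dataset_step_from_idx(12, [10, 15, 35]) == (1, 2)    # item 2 of the second dataset
assert get_dataset_step_from_idx(99, [10, 15, 35]) == (-1, 99)  # out of range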
dataeval/detectors/ood/metadata_least_likely.py (file removed)
@@ -1,119 +0,0 @@
- from __future__ import annotations
-
- __all__ = []
-
- import numbers
- import warnings
- from typing import Any
-
- import numpy as np
- from numpy.typing import NDArray
-
-
- def get_least_likely_features(
-     metadata: dict[str, list[Any] | NDArray[Any]],
-     new_metadata: dict[str, list[Any] | NDArray[Any]],
-     is_ood: NDArray[np.bool_],
- ) -> list[tuple[str, float]]:
-     """Computes which metadata feature is most out-of-distribution (OOD) relative to a reference metadata set.
-
-     Given a reference metadata dictionary `metadata` (where each key maps to one scalar metadata feature), a second
-     metadata dictionary, and a corresponding boolean flag `is_ood` indicating whether each new example falls
-     out-of-distribution (OOD) relative to the reference, this function finds which metadata feature is the most OOD,
-     for each OOD example.
-
-     Parameters
-     ----------
-     metadata: dict[str, list[Any] | NDArray[Any]]
-         A reference set of arrays of values, indexed by metadata feature names, with one value per data example per
-         feature.
-     new_metadata: dict[str, list[Any] | NDArray[Any]]
-         A second metadata set, to be tested against the reference metadata. It is ok if the two meta data objects
-         hold different numbers of examples.
-     is_ood: NDArray[np.bool_]
-         A boolean array, with one value per new_metadata example, that indicates which examples are OOD.
-
-     Returns
-     -------
-     list[tuple[str, float]]
-         An array of names of the features of each OOD new_metadata example that were the most OOD.
-
-     Examples
-     --------
-     Imagine we have 3 data examples, and that the corresponding metadata contains 2 features called time and
-     altitude, as shown below.
-
-     >>> metadata = {"time": [1.2, 3.4, 5.6], "altitude": [235, 6789, 101112]}
-     >>> new_metadata = {"time": [7.8, 11.12], "altitude": [532, -211101]}
-     >>> is_ood = np.array([True, True])
-     >>> get_least_likely_features(metadata, new_metadata, is_ood)
-     [('time', 2.0), ('altitude', 33.245346)]
-     """
-     # Raise errors for bad inputs...
-
-     if metadata.keys() != new_metadata.keys():
-         raise ValueError(f"Reference and test metadata keys must be identical: {list(metadata)}, {list(new_metadata)}")
-
-     md_lengths = {len(np.atleast_1d(v)) for v in metadata.values()}
-     new_md_lengths = {len(np.atleast_1d(v)) for v in new_metadata.values()}
-     if len(md_lengths) > 1 or len(new_md_lengths) > 1:
-         raise ValueError(f"All features must have same length, got lengths {md_lengths}, {new_md_lengths}")
-
-     n_reference, n_new = md_lengths.pop(), new_md_lengths.pop()  # possibly different numbers of metadata examples
-
-     if n_new != len(is_ood):
-         raise ValueError(f"is_ood flag must have same length as new metadata {n_new} but has length {len(is_ood)}.")
-
-     if n_reference < 3:  # too hard to define "in-distribution" with this few reference samples.
-         warnings.warn(
-             "We need at least 3 reference metadata examples to determine which "
-             f"features are least likely, but only got {n_reference}",
-             UserWarning,
-         )
-         return []
-
-     if not any(is_ood):
-         return []
-
-     # ...inputs are good, look for most deviant standardized features.
-
-     # largest standardized absolute deviation from the median observed so far for each example
-     deviation = np.zeros_like(is_ood, dtype=np.float32)
-
-     # name of feature that corresponds to `deviation` for each example
-     kmax = np.empty(len(is_ood), dtype=object)
-
-     for k, v in metadata.items():
-         # exclude cases where random happens to be out on tails, not interesting.
-         if k == "random":
-             continue
-
-         # Skip non-numerical features
-         if not all(isinstance(vi, numbers.Number) for vi in v):  # NB: np.nan *is* a number in this context.
-             continue
-
-         # Get standardization parameters from metadata
-         loc = np.median(v)  # ok, because we checked all were numeric
-         dev = np.asarray(v) - loc  # need to make array from v since it could be a list here.
-         posdev, negdev = dev[dev > 0], dev[dev < 0]
-         pos_scale = np.median(posdev) if posdev.any() else 1.0
-         neg_scale = np.abs(np.median(negdev)) if negdev.any() else 1.0
-
-         x, x0, dxp, dxn = np.atleast_1d(new_metadata[k]), loc, pos_scale, neg_scale  # just abbreviations
-         dxp = dxp if dxp > 0 else 1.0  # avoids dividing by zero below
-         dxn = dxn if dxn > 0 else 1.0
-
-         # xdev must be floating-point to avoid getting zero in an integer division.
-         xdev = (x - x0).astype(np.float64)
-         pos = xdev >= 0
-
-         X = np.zeros_like(xdev)
-         X[pos], X[~pos] = xdev[pos] / dxp, xdev[~pos] / dxn  # keeping track of possible asymmetry of x, but...
-         # ...below here, only need to think about absolute deviation.
-
-         abig = np.abs(X) > deviation
-         kmax[abig] = k
-         deviation[abig] = np.abs(X[abig])
-
-     unlikely_features = list(zip(kmax[is_ood], deviation[is_ood]))  # feature names, along with how far out they are.
-     return unlikely_features
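The scoring above reduces to an asymmetric, median-based standardization per feature: deviations above the reference median are divided by the median positive deviation, deviations below by the magnitude of the median negative deviation, and the feature with the largest absolute score is reported. A standalone sketch of that per-feature transform (the helper name is illustrative, not a dataeval API), which reproduces the 2.0 score for the docstring's "time" feature:

import numpy as np

def asymmetric_median_score(reference, new_values):
    # Standardize new_values against reference using the removed module's scheme.
    loc = np.median(reference)
    dev = np.asarray(reference) - loc
    posdev, negdev = dev[dev > 0], dev[dev < 0]
    pos_scale = np.median(posdev) if posdev.any() else 1.0
    neg_scale = np.abs(np.median(negdev)) if negdev.any() else 1.0
    pos_scale = pos_scale if pos_scale > 0 else 1.0  # guard against dividing by zero
    neg_scale = neg_scale if neg_scale > 0 else 1.0
    xdev = (np.atleast_1d(new_values) - loc).astype(np.float64)
    return np.where(xdev >= 0, xdev / pos_scale, xdev / neg_scale)

# Reference median is 3.4 and the median positive deviation is 2.2, so 7.8 scores (7.8 - 3.4) / 2.2 = 2.0
print(asymmetric_median_score([1.2, 3.4, 5.6], [7.8, 11.12]))  # [2.0, 3.509...]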