dataeval-0.61.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +18 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/clusterer.py +469 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/drift/base.py +265 -0
- dataeval/_internal/detectors/drift/cvm.py +97 -0
- dataeval/_internal/detectors/drift/ks.py +100 -0
- dataeval/_internal/detectors/drift/mmd.py +166 -0
- dataeval/_internal/detectors/drift/torch.py +310 -0
- dataeval/_internal/detectors/drift/uncertainty.py +149 -0
- dataeval/_internal/detectors/duplicates.py +49 -0
- dataeval/_internal/detectors/linter.py +78 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/ae.py +77 -0
- dataeval/_internal/detectors/ood/aegmm.py +69 -0
- dataeval/_internal/detectors/ood/base.py +199 -0
- dataeval/_internal/detectors/ood/llr.py +284 -0
- dataeval/_internal/detectors/ood/vae.py +86 -0
- dataeval/_internal/detectors/ood/vaegmm.py +79 -0
- dataeval/_internal/flags.py +47 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/base.py +92 -0
- dataeval/_internal/metrics/ber.py +124 -0
- dataeval/_internal/metrics/coverage.py +80 -0
- dataeval/_internal/metrics/divergence.py +94 -0
- dataeval/_internal/metrics/hash.py +79 -0
- dataeval/_internal/metrics/parity.py +180 -0
- dataeval/_internal/metrics/stats.py +332 -0
- dataeval/_internal/metrics/uap.py +45 -0
- dataeval/_internal/metrics/utils.py +158 -0
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/autoencoder.py +202 -0
- dataeval/_internal/models/pytorch/blocks.py +46 -0
- dataeval/_internal/models/pytorch/utils.py +67 -0
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
- dataeval/_internal/models/tensorflow/gmm.py +115 -0
- dataeval/_internal/models/tensorflow/losses.py +107 -0
- dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
- dataeval/_internal/models/tensorflow/trainer.py +102 -0
- dataeval/_internal/models/tensorflow/utils.py +254 -0
- dataeval/_internal/workflows/sufficiency.py +555 -0
- dataeval/detectors/__init__.py +29 -0
- dataeval/flags/__init__.py +3 -0
- dataeval/metrics/__init__.py +7 -0
- dataeval/models/__init__.py +15 -0
- dataeval/models/tensorflow/__init__.py +6 -0
- dataeval/models/torch/__init__.py +8 -0
- dataeval/py.typed +0 -0
- dataeval/workflows/__init__.py +8 -0
- dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
- dataeval-0.61.0.dist-info/METADATA +114 -0
- dataeval-0.61.0.dist-info/RECORD +55 -0
- dataeval-0.61.0.dist-info/WHEEL +4 -0
dataeval/__init__.py
ADDED
@@ -0,0 +1,18 @@
+from importlib.util import find_spec
+
+from . import detectors, flags, metrics
+
+__version__ = "0.61.0"
+
+__all__ = ["detectors", "flags", "metrics"]
+
+if find_spec("torch") is not None:  # pragma: no cover
+    from . import models, workflows
+
+    __all__ += ["models", "workflows"]
+elif find_spec("tensorflow") is not None:  # pragma: no cover
+    from . import models
+
+    __all__ += ["models"]
+
+del find_spec
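The root `__init__.py` above gates optional exports on the available deep-learning backend: `detectors`, `flags`, and `metrics` always load, while `workflows` appears only when torch is importable and `models` appears when either torch or tensorflow is. A minimal sketch of what this means from the consumer side; illustrative only, not part of the wheel:

```python
import dataeval

print(dataeval.__version__)  # "0.61.0"

# Always present regardless of installed backends
assert {"detectors", "flags", "metrics"} <= set(dataeval.__all__)

# Optional submodules are only exported when a backend was found at import time
if "workflows" in dataeval.__all__:
    from dataeval import workflows  # torch is installed
if "models" in dataeval.__all__:
    from dataeval import models  # torch or tensorflow is installed
```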
dataeval/_internal/detectors/__init__.py
File without changes

dataeval/_internal/detectors/clusterer.py
ADDED
@@ -0,0 +1,469 @@
+from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast
+
+import numpy as np
+from scipy.cluster.hierarchy import linkage
+from scipy.spatial.distance import pdist, squareform
+
+
+def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
+    """
+    Adds a column to the linkage matrix link_arr that tracks the new id assigned
+    to each row
+
+    Parameters
+    ----------
+    link_arr : np.ndarray
+        linkage matrix
+
+    Returns
+    -------
+    np.ndarray
+        linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
+    """
+    # Adjusting linkage matrix to accommodate renumbering
+    rows, cols = link_arr.shape
+    arr = np.zeros((rows, cols + 1))
+    arr[:, :-1] = link_arr
+    arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
+
+    return arr
+
+
+class Cluster:
+    __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
+
+    def __init__(self, merged: int, samples: np.ndarray, sample_dist: Union[float, np.ndarray], is_copy: bool = False):
+        self.merged = merged
+        self.samples = np.array(samples, dtype=np.int32)
+        self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
+        self.is_copy = is_copy
+
+        dist = float(self.sample_dist[-1])
+
+        self.count = len(self.samples)
+        if is_copy:
+            self.dist_avg = 0.0
+            self.dist_std = 0.0
+            self.out1 = False
+            self.out2 = False
+        else:
+            self.dist_avg = float(np.mean(self.sample_dist))
+            self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
+            out1 = self.dist_avg + self.dist_std
+            out2 = out1 + self.dist_std
+            self.out1 = dist > out1
+            self.out2 = dist > out2
+
+    def copy(self) -> "Cluster":
+        return Cluster(False, self.samples, self.sample_dist, True)
+
+    def __repr__(self) -> str:
+        _params = {
+            "merged": self.merged,
+            "samples": self.samples,
+            "sample_dist": self.sample_dist,
+            "is_copy": self.is_copy,
+        }
+        return f"{self.__class__.__name__}(**{repr(_params)})"
+
+
+class Clusters(Dict[int, Dict[int, Cluster]]):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_level: int = 1
+
+
+class ClusterPosition(NamedTuple):
+    """Keeps track of a cluster's level and ID"""
+
+    level: int
+    cid: int
+
+
+class ClusterMergeEntry:
+    __slots__ = "level", "outer_cluster", "inner_cluster", "status"
+
+    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+        self.level = level
+        self.outer_cluster = outer_cluster
+        self.inner_cluster = inner_cluster
+        self.status = status
+
+    def __lt__(self, value: "ClusterMergeEntry") -> bool:
+        return self.level.__lt__(value.level)
+
+    def __gt__(self, value: "ClusterMergeEntry") -> bool:
+        return self.level.__gt__(value.level)
+
+
+class Clusterer:
+    """
+    Uses hierarchical clustering to flag dataset properties of interest like outliers and duplicates
+
+    Parameters
+    ----------
+    dataset : np.ndarray
+        An array of images or image embeddings to perform clustering
+    """
+
+    def __init__(self, dataset: np.ndarray):
+        # Allows an update to dataset to reset the state rather than instantiate a new class
+        self._on_init(dataset)
+
+    def _on_init(self, dataset: np.ndarray):
+        self._validate_data(dataset)
+        self._data: np.ndarray = dataset
+        self._num_samples = len(dataset)
+
+        self._darr: np.ndarray = pdist(dataset, metric="euclidean")
+        self._sqdmat: np.ndarray = squareform(self._darr)
+        self._larr: np.ndarray = extend_linkage(linkage(self._darr))
+        self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
+
+        min_num = int(self._num_samples * 0.05)
+        self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+
+        self._clusters = None
+        self._last_good_merge_levels = None
+
+    @property
+    def data(self) -> np.ndarray:
+        return self._data
+
+    @data.setter
+    def data(self, x: np.ndarray):
+        self._on_init(x)
+
+    @property
+    def clusters(self) -> Clusters:
+        if self._clusters is None:
+            self._clusters = self._create_clusters()
+        return self._clusters
+
+    @property
+    def last_good_merge_levels(self) -> Dict[int, int]:
+        if self._last_good_merge_levels is None:
+            self._last_good_merge_levels = self._get_last_merge_levels()
+        return self._last_good_merge_levels
+
+    @classmethod
+    def _validate_data(cls, x: np.ndarray):
+        """Checks that the data has the correct size, shape, and format"""
+        if not isinstance(x, np.ndarray):
+            raise TypeError(f"Data should be of type np.ndarray; got {type(x)}")
+
+        if x.ndim != 2:
+            raise ValueError(
+                f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
+            )
+        samples, features = x.shape  # Due to above check, we know shape has a length of 2
+        if samples < 2:
+            raise ValueError(f"Data should have at least 2 samples; got {samples}")
+        if features < 1:
+            raise ValueError(f"Samples should have at least 1 feature; got {features}")
+
+    def _create_clusters(self) -> Clusters:
+        """Generates clusters based on linkage matrix"""
+        next_cluster_id = 0
+        cluster_map: Dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        clusters: Clusters = Clusters()
+
+        # Walking through the linkage array to generate clusters
+        for arr_i in self._larr:
+            left_id = int(arr_i[0])
+            right_id = int(arr_i[1])
+            sample_dist = np.array([arr_i[2]], dtype=np.float32)
+            merged = False
+
+            # Determine if the id is already associated with a cluster
+            left = cluster_map.get(left_id)
+            right = cluster_map.get(right_id)
+
+            if left and right:
+                merged = max([left.cid, right.cid])
+                lc = clusters[left.level][left.cid]
+                rc = clusters[right.level][right.cid]
+                left_first = len(lc.samples) >= len(rc.samples)
+                samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
+                sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
+                level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
+
+                # Only tracking the levels in which clusters merge for the cluster distance matrix
+                clusters.max_level = max(clusters.max_level, left.level, right.level)
+                # Update clusters to include previously skipped levels
+                clusters = self._fill_levels(clusters, left, right)
+            elif left or right:
+                child, other_id = cast(Tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                cc = clusters[child.level][child.cid]
+                samples = np.concatenate([cc.samples, [other_id]])
+                sample_dist = np.concatenate([cc.sample_dist, sample_dist])
+                level, cid = child.level + 1, child.cid
+            else:
+                samples = np.array([left_id, right_id], dtype=np.int32)
+                level, cid = 0, next_cluster_id
+                next_cluster_id += 1
+
+            # Set the cluster and associate the linkage id with the cluster
+            if level not in clusters:
+                clusters[level] = {}
+
+            clusters[level][cid] = Cluster(merged, samples, sample_dist)
+            cluster_map[int(arr_i[-1])] = ClusterPosition(level, cid)
+
+        return clusters
+
+    def _fill_levels(self, clusters: Clusters, left: ClusterPosition, right: ClusterPosition) -> Clusters:
+        # Sets each level's cluster info if it does not exist
+        if left.level != right.level:
+            (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
+            cluster = clusters[level][cid].copy()
+            for level_id in range(max_level, level, -1):
+                clusters[level_id].setdefault(cid, cluster)
+        return clusters
+
+    def _get_cluster_distances(self) -> np.ndarray:
+        """Calculates the minimum distances between clusters at each level"""
+        # Cluster distance matrix
+        max_level = self.clusters.max_level
+        cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
+
+        for level, cluster_set in self.clusters.items():
+            if level < max_level:
+                cluster_ids = sorted(cluster_set.keys())
+                for i, cluster_id in enumerate(cluster_ids):
+                    cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
+                    for int_id in range(i + 1, len(cluster_ids)):
+                        compare_id = cluster_ids[int_id]
+                        sample_a = self.clusters[level][cluster_id].samples
+                        sample_b = self.clusters[level][compare_id].samples
+                        min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
+                        cluster_matrix[level, cluster_id, compare_id] = min_mat
+                        cluster_matrix[level, compare_id, cluster_id] = min_mat
+
+        return cluster_matrix
+
+    def _calc_merge_indices(self, merge_mean: List[np.ndarray], intra_max: List[float]) -> np.ndarray:
+        """
+        Determine what clusters should be merged and return their indices
+        """
+        intra_max_uniques = np.unique(intra_max)
+        intra_log_values = np.log(intra_max_uniques)
+        two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
+        merge_value = np.log(merge_mean)
+        # Mask of indices we know we want to merge
+        desired_merge = merge_value < two_std_all
+
+        # List[Values] for indices we might want to merge
+        check = merge_value[~desired_merge]
+        # Check distance from value to 2 stds of all values
+        check = np.abs((check - two_std_all) / two_std_all)
+        # Mask List[Values < 1]
+        mask = check < 1
+        one_std_check = check[mask].mean() + check[mask].std()
+        # Mask of indices that should also be merged
+        mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
+        mask2 = mask2_vals < one_std_check
+        return np.logical_or(desired_merge, mask2)
+
+    def _generate_merge_list(self, cluster_matrix: np.ndarray) -> List[ClusterMergeEntry]:
+        """
+        Runs through the clusters dictionary determining when clusters merge,
+        and how close those clusters are when they merge.
+
+        Parameters
+        ----------
+        cluster_matrix:
+            The distance matrix for all clusters to all others
+
+        Returns
+        -------
+        List[ClusterMergeEntry]:
+            A list with each cluster's merge history
+        """
+        intra_max = []
+        merge_mean = []
+        merge_list: List[ClusterMergeEntry] = []
+
+        for level, cluster_set in self.clusters.items():
+            for outer_cluster, cluster in cluster_set.items():
+                inner_cluster = cluster.merged
+                if not inner_cluster:
+                    continue
+                # Extract necessary information
+                num_samples = len(cluster.samples)
+                out1 = cluster.out1
+                out2 = cluster.out2
+
+                # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
+                aggregate_func = (
+                    np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
+                )
+
+                distances = cluster_matrix[:level, outer_cluster, inner_cluster]
+                intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
+                positive_mask = intra_distance >= 0
+                intra_filtered = intra_distance[positive_mask]
+
+                # TODO: Append now, take max over axis later?
+                intra_max.append(np.max(intra_filtered))
+                # Calculate the corresponding distance stats
+                distance_stats_arr = aggregate_func(distances)
+                merge_mean.append(distance_stats_arr)
+                merge_list.append(ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
+
+        all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
+
+        for i, is_mergeable in enumerate(all_merge_indices):
+            merge_list[i].status = is_mergeable
+
+        merge_list = sorted(merge_list, reverse=True)
+
+        return merge_list
+
+    def _get_last_merge_levels(self) -> Dict[int, int]:
+        """
+        Creates a dictionary for important cluster ids mapped to their last good merge level
+
+        Returns
+        -------
+        Dict[int, int]
+            A mapping of a cluster id to its last good merge level
+        """
+        last_merge_levels: Dict[int, int] = {}
+
+        if self._max_clusters <= 1:
+            last_merge_levels = {0: int(self._num_samples * 0.1)}
+        else:
+            cluster_matrix = self._get_cluster_distances()
+            merge_list = self._generate_merge_list(cluster_matrix)
+            for entry in merge_list:
+                if not entry.status:
+                    if entry.outer_cluster not in last_merge_levels:
+                        last_merge_levels[entry.outer_cluster] = 0
+                    if entry.inner_cluster not in last_merge_levels:
+                        last_merge_levels[entry.inner_cluster] = 0
+                    if last_merge_levels[entry.outer_cluster] > entry.level:
+                        last_merge_levels[entry.outer_cluster] = entry.level - 1
+                else:
+                    if entry.outer_cluster in last_merge_levels:
+                        last_merge_levels[entry.outer_cluster] = max(
+                            last_merge_levels[entry.outer_cluster], entry.level
+                        )
+
+        return last_merge_levels
+
+    def find_outliers(self, last_merge_levels: Dict[int, int]) -> Tuple[List[int], List[int]]:
+        """
+        Retrieves outliers based on when the sample was added to the cluster
+        and how far it was from the cluster when it was added
+
+        Parameters
+        ----------
+        last_merge_levels : Dict[int, int]
+            A mapping of a cluster id to its last good merge level
+
+        Returns
+        -------
+        Tuple[List[int], List[int]]
+            The outliers and possible outliers as sorted lists of indices
+        """
+        outliers = set()
+        possible_outliers = set()
+        already_seen = set()
+        last_level = {}
+
+        for level, cluster_set in self.clusters.items():
+            for cluster_id, cluster in cluster_set.items():
+                if cluster_id in last_merge_levels:
+                    last_level[cluster_id] = level
+
+        for level, cluster_set in self.clusters.items():
+            for cluster_id, cluster in cluster_set.items():
+                if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
+                    if cluster_id in already_seen and cluster.samples[-1] not in outliers:
+                        outliers.add(cluster.samples[-1])
+                    elif cluster.out2:
+                        if len(cluster.samples) < self._min_num_samples_per_cluster:
+                            outliers.update(cluster.samples.tolist())
+                        elif cluster.samples[-1] not in outliers:
+                            outliers.add(cluster.samples[-1])
+                        if cluster_id not in already_seen:
+                            already_seen.add(cluster_id)
+                    elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
+                        possible_outliers.add(cluster.samples[-1])
+                    elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
+                        outliers.update(cluster.samples.tolist())
+
+        return sorted(outliers), sorted(possible_outliers)
+
+    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> List[List[int]]:
+        """Merges and sorts groups of indices that share any common index"""
+        groups: List[List[int]] = []
+        for indices in zip(*index_groups):
+            indices = set(indices)
+            temp = []
+            for group in groups:
+                if not set(group).isdisjoint(indices):
+                    indices.update(group)
+                else:
+                    temp.append(group)
+            temp.append(sorted(indices))
+            groups = temp
+        return sorted(groups)
+
+    def find_duplicates(self, last_merge_levels: Dict[int, int]) -> Tuple[List[List[int]], List[List[int]]]:
+        """
+        Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
+
+        Parameters
+        ----------
+        last_merge_levels : Dict[int, int]
+            A mapping of a cluster id to its last good merge level
+
+        Returns
+        -------
+        Tuple[List[List[int]], List[List[int]]]
+            The exact duplicates and near duplicates as lists of related indices
+        """
+
+        duplicates_std = []
+        for cluster_id, level in last_merge_levels.items():
+            samples = self.clusters[level][cluster_id].samples
+            if len(samples) >= self._min_num_samples_per_cluster:
+                duplicates_std.append(self.clusters[level][cluster_id].dist_std)
+        diag_mask = np.ones_like(self._sqdmat, dtype=bool)
+        np.fill_diagonal(diag_mask, 0)
+        diag_mask = np.triu(diag_mask)
+
+        exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
+        exact_indices = np.nonzero(exact_mask & diag_mask)
+        exact_dupes = self._sorted_union_find(exact_indices)
+
+        near_mask = self._sqdmat <= np.mean(duplicates_std)
+        near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
+        near_dupes = self._sorted_union_find(near_indices)
+
+        return exact_dupes, near_dupes
+
+    def evaluate(self):
+        """Finds and flags indices of the data for outliers and duplicates
+
+        Returns
+        -------
+
+        Dict[str, Union[List[int], List[List[int]]]]
+            Dictionary containing list of outliers, potential outliers, duplicates, and near duplicates in keys
+            "outliers", "potential_outliers", "duplicates", "near_duplicates" respectively
+        """
+
+        outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
+        duplicates, near_duplicates = self.find_duplicates(self.last_good_merge_levels)
+
+        ret = {
+            "outliers": outliers,
+            "potential_outliers": potential_outliers,
+            "duplicates": duplicates,
+            "near_duplicates": near_duplicates,
+        }
+
+        return ret
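`Clusterer` expects a flattened 2-D array (samples x features), computes a full hierarchical linkage once per dataset, and caches `clusters` and `last_good_merge_levels` until `data` is reassigned. A minimal usage sketch, assuming `Clusterer` is re-exported through `dataeval.detectors` as the public package layout above suggests; the random embeddings and planted duplicate are illustrative:

```python
import numpy as np
from dataeval.detectors import Clusterer  # assumed public re-export

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(128, 16)).astype(np.float32)  # one row per flattened image/embedding
embeddings[1] = embeddings[0]  # plant an exact duplicate pair

clusterer = Clusterer(embeddings)
results = clusterer.evaluate()
print(results["outliers"])           # sorted sample indices
print(results["potential_outliers"])
print(results["duplicates"])         # groups of related indices, e.g. [[0, 1]]
print(results["near_duplicates"])

# Reassigning data re-runs _on_init and clears all cached cluster state
clusterer.data = rng.normal(size=(64, 16)).astype(np.float32)
```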
dataeval/_internal/detectors/drift/__init__.py
File without changes