dataeval-0.61.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
dataeval/__init__.py ADDED
@@ -0,0 +1,18 @@
+ from importlib.util import find_spec
+
+ from . import detectors, flags, metrics
+
+ __version__ = "0.61.0"
+
+ __all__ = ["detectors", "flags", "metrics"]
+
+ if find_spec("torch") is not None:  # pragma: no cover
+     from . import models, workflows
+
+     __all__ += ["models", "workflows"]
+ elif find_spec("tensorflow") is not None:  # pragma: no cover
+     from . import models
+
+     __all__ += ["models"]
+
+ del find_spec
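For context, these conditional imports mean the package's public surface depends on which deep-learning backend is installed: `models` and `workflows` appear only when torch is present, and `models` alone when only tensorflow is. A quick illustrative check (output depends on your environment):

```python
import dataeval

# With torch installed:    ['detectors', 'flags', 'metrics', 'models', 'workflows']
# With only tensorflow:    ['detectors', 'flags', 'metrics', 'models']
# With neither backend:    ['detectors', 'flags', 'metrics']
print(dataeval.__all__)
```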
dataeval/_internal/detectors/__init__.py ADDED
File without changes
dataeval/_internal/detectors/clusterer.py ADDED
@@ -0,0 +1,469 @@
+ from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast
+
+ import numpy as np
+ from scipy.cluster.hierarchy import linkage
+ from scipy.spatial.distance import pdist, squareform
+
+
+ def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
+     """
+     Adds a column to the linkage matrix link_arr that tracks the new id assigned
+     to each row
+
+     Parameters
+     ----------
+     link_arr : np.ndarray
+         linkage matrix
+
+     Returns
+     -------
+     np.ndarray
+         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
+     """
+     # Adjusting linkage matrix to accommodate renumbering
+     rows, cols = link_arr.shape
+     arr = np.zeros((rows, cols + 1))
+     arr[:, :-1] = link_arr
+     arr[:, -1] = np.arange(rows + 1, 2 * rows + 1)
+
+     return arr
+
+
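For reference, `scipy.cluster.hierarchy.linkage` on `n` samples returns an `(n-1, 4)` array whose rows describe successive merges, and SciPy implicitly numbers the cluster formed by row `i` as `n + i`; `extend_linkage` simply makes that implicit id explicit as a fifth column. A minimal sketch, assuming the module is importable at the path shown in this diff:

```python
import numpy as np
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist
from dataeval._internal.detectors.clusterer import extend_linkage

# Four points: two tight pairs far apart
data = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
link_arr = linkage(pdist(data))  # shape (3, 4): left id, right id, distance, count

extended = extend_linkage(link_arr)
print(extended.shape)    # (3, 5)
print(extended[:, -1])   # [4. 5. 6.] -- the id SciPy assigns to each newly formed cluster
```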
+ class Cluster:
+     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
+
+     def __init__(self, merged: int, samples: np.ndarray, sample_dist: Union[float, np.ndarray], is_copy: bool = False):
+         self.merged = merged
+         self.samples = np.array(samples, dtype=np.int32)
+         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
+         self.is_copy = is_copy
+
+         dist = float(self.sample_dist[-1])
+
+         self.count = len(self.samples)
+         if is_copy:
+             self.dist_avg = 0.0
+             self.dist_std = 0.0
+             self.out1 = False
+             self.out2 = False
+         else:
+             self.dist_avg = float(np.mean(self.sample_dist))
+             self.dist_std = float(np.std(self.sample_dist)) if len(self.sample_dist) > 1 else 1e-5
+             out1 = self.dist_avg + self.dist_std
+             out2 = out1 + self.dist_std
+             self.out1 = dist > out1
+             self.out2 = dist > out2
+
+     def copy(self) -> "Cluster":
+         return Cluster(False, self.samples, self.sample_dist, True)
+
+     def __repr__(self) -> str:
+         _params = {
+             "merged": self.merged,
+             "samples": self.samples,
+             "sample_dist": self.sample_dist,
+             "is_copy": self.is_copy,
+         }
+         return f"{self.__class__.__name__}(**{repr(_params)})"
+
+
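The `out1`/`out2` flags mark a cluster whose most recent merge distance sits more than one or two standard deviations above its own average merge distance; `find_outliers` later uses these flags to single out the last-added sample. A worked sketch mirroring the constructor's statistics (values are illustrative):

```python
import numpy as np

# Nine tight merges followed by one unusually distant merge
sample_dist = np.array([0.5] * 9 + [2.0])

dist_avg = sample_dist.mean()   # 0.65
dist_std = sample_dist.std()    # 0.45
last = sample_dist[-1]          # 2.0

out1 = last > dist_avg + dist_std       # 2.0 > 1.10 -> True
out2 = last > dist_avg + 2 * dist_std   # 2.0 > 1.55 -> True
```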
+ class Clusters(Dict[int, Dict[int, Cluster]]):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.max_level: int = 1
+
+
+ class ClusterPosition(NamedTuple):
+     """Keeps track of a cluster's level and ID"""
+
+     level: int
+     cid: int
+
+
+ class ClusterMergeEntry:
+     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
+
+     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+         self.level = level
+         self.outer_cluster = outer_cluster
+         self.inner_cluster = inner_cluster
+         self.status = status
+
+     def __lt__(self, value: "ClusterMergeEntry") -> bool:
+         return self.level.__lt__(value.level)
+
+     def __gt__(self, value: "ClusterMergeEntry") -> bool:
+         return self.level.__gt__(value.level)
+
+
+ class Clusterer:
+     """
+     Uses hierarchical clustering to flag dataset properties of interest like outliers and duplicates
+
+     Parameters
+     ----------
+     dataset : np.ndarray
+         An array of images or image embeddings to perform clustering on
+     """
+
+     def __init__(self, dataset: np.ndarray):
+         # Allows an update to dataset to reset the state rather than instantiate a new class
+         self._on_init(dataset)
+
+     def _on_init(self, dataset: np.ndarray):
+         self._validate_data(dataset)
+         self._data: np.ndarray = dataset
+         self._num_samples = len(dataset)
+
+         self._darr: np.ndarray = pdist(dataset, metric="euclidean")
+         self._sqdmat: np.ndarray = squareform(self._darr)
+         self._larr: np.ndarray = extend_linkage(linkage(self._darr))
+         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
+
+         min_num = int(self._num_samples * 0.05)
+         self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+
+         self._clusters = None
+         self._last_good_merge_levels = None
+
+     @property
+     def data(self) -> np.ndarray:
+         return self._data
+
+     @data.setter
+     def data(self, x: np.ndarray):
+         self._on_init(x)
+
+     @property
+     def clusters(self) -> Clusters:
+         if self._clusters is None:
+             self._clusters = self._create_clusters()
+         return self._clusters
+
+     @property
+     def last_good_merge_levels(self) -> Dict[int, int]:
+         if self._last_good_merge_levels is None:
+             self._last_good_merge_levels = self._get_last_merge_levels()
+         return self._last_good_merge_levels
+
+     @classmethod
+     def _validate_data(cls, x: np.ndarray):
+         """Checks that the data has the correct size, shape, and format"""
+         if not isinstance(x, np.ndarray):
+             raise TypeError(f"Data should be of type np.ndarray; got {type(x)}")
+
+         if x.ndim != 2:
+             raise ValueError(
+                 f"Data should only have 2 dimensions; got {x.ndim}. Data should be flattened before being input"
+             )
+         samples, features = x.shape  # Due to above check, we know shape has a length of 2
+         if samples < 2:
+             raise ValueError(f"Data should have at least 2 samples; got {samples}")
+         if features < 1:
+             raise ValueError(f"Samples should have at least 1 feature; got {features}")
+
+     def _create_clusters(self) -> Clusters:
+         """Generates clusters based on linkage matrix"""
+         next_cluster_id = 0
+         cluster_map: Dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+         clusters: Clusters = Clusters()
+
+         # Walking through the linkage array to generate clusters
+         for arr_i in self._larr:
+             left_id = int(arr_i[0])
+             right_id = int(arr_i[1])
+             sample_dist = np.array([arr_i[2]], dtype=np.float32)
+             merged = False
+
+             # Determine if the id is already associated with a cluster
+             left = cluster_map.get(left_id)
+             right = cluster_map.get(right_id)
+
+             if left and right:
+                 merged = max([left.cid, right.cid])
+                 lc = clusters[left.level][left.cid]
+                 rc = clusters[right.level][right.cid]
+                 left_first = len(lc.samples) >= len(rc.samples)
+                 samples = np.concatenate([lc.samples, rc.samples] if left_first else [rc.samples, lc.samples])
+                 sample_dist = np.concatenate([rc.sample_dist, lc.sample_dist, sample_dist])
+                 level, cid = max(left.level, right.level) + 1, min(left.cid, right.cid)
+
+                 # Only tracking the levels in which clusters merge for the cluster distance matrix
+                 clusters.max_level = max(clusters.max_level, left.level, right.level)
+                 # Update clusters to include previously skipped levels
+                 clusters = self._fill_levels(clusters, left, right)
+             elif left or right:
+                 child, other_id = cast(Tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                 cc = clusters[child.level][child.cid]
+                 samples = np.concatenate([cc.samples, [other_id]])
+                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
+                 level, cid = child.level + 1, child.cid
+             else:
+                 samples = np.array([left_id, right_id], dtype=np.int32)
+                 level, cid = 0, next_cluster_id
+                 next_cluster_id += 1
+
+             # Set the cluster and associate the linkage id with the cluster
+             if level not in clusters:
+                 clusters[level] = {}
+
+             clusters[level][cid] = Cluster(merged, samples, sample_dist)
+             cluster_map[int(arr_i[-1])] = ClusterPosition(level, cid)
+
+         return clusters
+
+     def _fill_levels(self, clusters: Clusters, left: ClusterPosition, right: ClusterPosition) -> Clusters:
+         # Sets each level's cluster info if it does not exist
+         if left.level != right.level:
+             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
+             cluster = clusters[level][cid].copy()
+             for level_id in range(max_level, level, -1):
+                 clusters[level_id].setdefault(cid, cluster)
+         return clusters
+
+     def _get_cluster_distances(self) -> np.ndarray:
+         """Calculates the minimum distances between clusters at each level"""
+         # Cluster distance matrix
+         max_level = self.clusters.max_level
+         cluster_matrix = np.full((max_level, self._max_clusters, self._max_clusters), -1.0, dtype=np.float32)
+
+         for level, cluster_set in self.clusters.items():
+             if level < max_level:
+                 cluster_ids = sorted(cluster_set.keys())
+                 for i, cluster_id in enumerate(cluster_ids):
+                     cluster_matrix[level, cluster_id, cluster_id] = self.clusters[level][cluster_id].dist_avg
+                     for int_id in range(i + 1, len(cluster_ids)):
+                         compare_id = cluster_ids[int_id]
+                         sample_a = self.clusters[level][cluster_id].samples
+                         sample_b = self.clusters[level][compare_id].samples
+                         min_mat = self._sqdmat[np.ix_(sample_a, sample_b)].min()
+                         cluster_matrix[level, cluster_id, compare_id] = min_mat
+                         cluster_matrix[level, compare_id, cluster_id] = min_mat
+
+         return cluster_matrix
+
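The `np.ix_` indexing pulls the rectangular block of pairwise distances between two clusters' samples out of the full square distance matrix, so `.min()` gives the single-linkage distance between the clusters. A small standalone sketch (the data and indices are illustrative):

```python
import numpy as np
from scipy.spatial.distance import pdist, squareform

data = np.array([[0.0], [0.2], [3.0], [3.3]])
sqdmat = squareform(pdist(data))

sample_a = np.array([0, 1])  # indices of one cluster's samples
sample_b = np.array([2, 3])  # indices of another cluster's samples

block = sqdmat[np.ix_(sample_a, sample_b)]  # 2x2 block of cross-cluster distances
print(block.min())                          # 2.8 -- closest pair across the two clusters
```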
+     def _calc_merge_indices(self, merge_mean: List[np.ndarray], intra_max: List[float]) -> np.ndarray:
+         """
+         Determine what clusters should be merged and return their indices
+         """
+         intra_max_uniques = np.unique(intra_max)
+         intra_log_values = np.log(intra_max_uniques)
+         two_std_all = intra_log_values.mean() + 2 * intra_log_values.std()
+         merge_value = np.log(merge_mean)
+         # Mask of indices we know we want to merge
+         desired_merge = merge_value < two_std_all
+
+         # List[Values] for indices we might want to merge
+         check = merge_value[~desired_merge]
+         # Check distance from value to 2 stds of all values
+         check = np.abs((check - two_std_all) / two_std_all)
+         # Mask List[Values < 1]
+         mask = check < 1
+         one_std_check = check[mask].mean() + check[mask].std()
+         # Mask of indices that should also be merged
+         mask2_vals = np.abs((merge_value - two_std_all) / two_std_all)
+         mask2 = mask2_vals < one_std_check
+         return np.logical_or(desired_merge, mask2)
+
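The heuristic works in log space: a merge whose mean distance falls below two standard deviations above the mean of the unique per-merge intra-cluster maxima is accepted outright, and a second pass accepts borderline merges whose relative distance from that threshold is small. A rough numeric sketch of the first pass (all values are illustrative):

```python
import numpy as np

intra_max = [1.0, 1.2, 1.1, 5.0]        # max intra-cluster distance seen per merge
merge_mean = np.array([0.9, 1.3, 7.0])  # mean distance of each candidate merge

logs = np.log(np.unique(intra_max))
two_std_all = logs.mean() + 2 * logs.std()   # ~1.79

print(np.log(merge_mean) < two_std_all)      # first-pass accept mask: [ True  True False]
```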
+     def _generate_merge_list(self, cluster_matrix: np.ndarray) -> List[ClusterMergeEntry]:
+         """
+         Runs through the clusters dictionary determining when clusters merge,
+         and how close those clusters are when they merge.
+
+         Parameters
+         ----------
+         cluster_matrix:
+             The distance matrix for all clusters to all others
+
+         Returns
+         -------
+         List[ClusterMergeEntry]:
+             A list with each cluster's merge history
+         """
+         intra_max = []
+         merge_mean = []
+         merge_list: List[ClusterMergeEntry] = []
+
+         for level, cluster_set in self.clusters.items():
+             for outer_cluster, cluster in cluster_set.items():
+                 inner_cluster = cluster.merged
+                 if not inner_cluster:
+                     continue
+                 # Extract necessary information
+                 num_samples = len(cluster.samples)
+                 out1 = cluster.out1
+                 out2 = cluster.out2
+
+                 # If outside 2-std or 1-std and larger than a minimum sized cluster, take the mean distance, else max
+                 aggregate_func = (
+                     np.mean if out2 or (out1 and num_samples >= self._min_num_samples_per_cluster) else np.max
+                 )
+
+                 distances = cluster_matrix[:level, outer_cluster, inner_cluster]
+                 intra_distance = cluster_matrix[:, outer_cluster, outer_cluster]
+                 positive_mask = intra_distance >= 0
+                 intra_filtered = intra_distance[positive_mask]
+
+                 # TODO: Append now, take max over axis later?
+                 intra_max.append(np.max(intra_filtered))
+                 # Calculate the corresponding distance stats
+                 distance_stats_arr = aggregate_func(distances)
+                 merge_mean.append(distance_stats_arr)
+                 merge_list.append(ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
+
+         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
+
+         for i, is_mergeable in enumerate(all_merge_indices):
+             merge_list[i].status = is_mergeable
+
+         merge_list = sorted(merge_list, reverse=True)
+
+         return merge_list
+
+     def _get_last_merge_levels(self) -> Dict[int, int]:
+         """
+         Creates a dictionary for important cluster ids mapped to their last good merge level
+
+         Returns
+         -------
+         Dict[int, int]
+             A mapping of a cluster id to its last good merge level
+         """
+         last_merge_levels: Dict[int, int] = {}
+
+         if self._max_clusters <= 1:
+             last_merge_levels = {0: int(self._num_samples * 0.1)}
+         else:
+             cluster_matrix = self._get_cluster_distances()
+             merge_list = self._generate_merge_list(cluster_matrix)
+             for entry in merge_list:
+                 if not entry.status:
+                     if entry.outer_cluster not in last_merge_levels:
+                         last_merge_levels[entry.outer_cluster] = 0
+                     if entry.inner_cluster not in last_merge_levels:
+                         last_merge_levels[entry.inner_cluster] = 0
+                     if last_merge_levels[entry.outer_cluster] > entry.level:
+                         last_merge_levels[entry.outer_cluster] = entry.level - 1
+                 else:
+                     if entry.outer_cluster in last_merge_levels:
+                         last_merge_levels[entry.outer_cluster] = max(
+                             last_merge_levels[entry.outer_cluster], entry.level
+                         )
+
+         return last_merge_levels
+
+     def find_outliers(self, last_merge_levels: Dict[int, int]) -> Tuple[List[int], List[int]]:
+         """
+         Retrieves outliers based on when the sample was added to the cluster
+         and how far it was from the cluster when it was added
+
+         Parameters
+         ----------
+         last_merge_levels : Dict[int, int]
+             A mapping of a cluster id to its last good merge level
+
+         Returns
+         -------
+         Tuple[List[int], List[int]]
+             The outliers and possible outliers as sorted lists of indices
+         """
+         outliers = set()
+         possible_outliers = set()
+         already_seen = set()
+         last_level = {}
+
+         for level, cluster_set in self.clusters.items():
+             for cluster_id, cluster in cluster_set.items():
+                 if cluster_id in last_merge_levels:
+                     last_level[cluster_id] = level
+
+         for level, cluster_set in self.clusters.items():
+             for cluster_id, cluster in cluster_set.items():
+                 if not cluster.merged and cluster_id in last_merge_levels and level > last_merge_levels[cluster_id]:
+                     if cluster_id in already_seen and cluster.samples[-1] not in outliers:
+                         outliers.add(cluster.samples[-1])
+                     elif cluster.out2:
+                         if len(cluster.samples) < self._min_num_samples_per_cluster:
+                             outliers.update(cluster.samples.tolist())
+                         elif cluster.samples[-1] not in outliers:
+                             outliers.add(cluster.samples[-1])
+                         if cluster_id not in already_seen:
+                             already_seen.add(cluster_id)
+                     elif cluster.out1 and len(cluster.samples) >= self._min_num_samples_per_cluster:
+                         possible_outliers.add(cluster.samples[-1])
+                     elif level == last_level[cluster_id] and len(cluster.samples) < self._min_num_samples_per_cluster:
+                         outliers.update(cluster.samples.tolist())
+
+         return sorted(outliers), sorted(possible_outliers)
+
+     def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> List[List[int]]:
+         """Merges and sorts groups of indices that share any common index"""
+         groups: List[List[int]] = []
+         for indices in zip(*index_groups):
+             indices = set(indices)
+             temp = []
+             for group in groups:
+                 if not set(group).isdisjoint(indices):
+                     indices.update(group)
+                 else:
+                     temp.append(group)
+             temp.append(sorted(indices))
+             groups = temp
+         return sorted(groups)
+
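`np.nonzero` returns the row and column index arrays of the matching pairs, so `zip(*index_groups)` walks them as `(row, col)` pairs; any pairs sharing an index are merged into one group. A quick illustration (the helper reads no instance state, so any valid instance will do):

```python
import numpy as np
from dataeval._internal.detectors.clusterer import Clusterer

# np.nonzero-style output: pairs (0, 1), (1, 2) and (4, 5); the first two share index 1
rows = np.array([0, 1, 4])
cols = np.array([1, 2, 5])

c = Clusterer(np.random.rand(10, 2))       # throwaway instance for the private helper
print(c._sorted_union_find((rows, cols)))  # [[0, 1, 2], [4, 5]]
```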
+     def find_duplicates(self, last_merge_levels: Dict[int, int]) -> Tuple[List[List[int]], List[List[int]]]:
+         """
+         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
+
+         Parameters
+         ----------
+         last_merge_levels : Dict[int, int]
+             A mapping of a cluster id to its last good merge level
+
+         Returns
+         -------
+         Tuple[List[List[int]], List[List[int]]]
+             The exact duplicates and near duplicates as lists of related indices
+         """
+
+         duplicates_std = []
+         for cluster_id, level in last_merge_levels.items():
+             samples = self.clusters[level][cluster_id].samples
+             if len(samples) >= self._min_num_samples_per_cluster:
+                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
+         diag_mask = np.ones_like(self._sqdmat, dtype=bool)
+         np.fill_diagonal(diag_mask, 0)
+         diag_mask = np.triu(diag_mask)
+
+         exact_mask = self._sqdmat <= (np.mean(duplicates_std) / 100)
+         exact_indices = np.nonzero(exact_mask & diag_mask)
+         exact_dupes = self._sorted_union_find(exact_indices)
+
+         near_mask = self._sqdmat <= np.mean(duplicates_std)
+         near_indices = np.nonzero(near_mask & diag_mask & ~exact_mask)
+         near_dupes = self._sorted_union_find(near_indices)
+
+         return exact_dupes, near_dupes
+
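Both thresholds come from the average within-cluster distance spread of the sufficiently large clusters; with illustrative numbers:

```python
import numpy as np

duplicates_std = [0.04, 0.06]       # dist_std of each sufficiently large cluster
mean_std = np.mean(duplicates_std)  # 0.05

exact_threshold = mean_std / 100    # 0.0005: pairs no farther apart than this are exact duplicates
near_threshold = mean_std           # 0.05: pairs within this (but not exact) are near duplicates
```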
+     def evaluate(self):
+         """Finds and flags indices of the data for outliers and duplicates
+
+         Returns
+         -------
+         Dict[str, Union[List[int], List[List[int]]]]
+             Dictionary containing lists of outliers, potential outliers, duplicates, and near duplicates
+             under the keys "outliers", "potential_outliers", "duplicates", "near_duplicates" respectively
+         """
+
+         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
+         duplicates, near_duplicates = self.find_duplicates(self.last_good_merge_levels)
+
+         ret = {
+             "outliers": outliers,
+             "potential_outliers": potential_outliers,
+             "duplicates": duplicates,
+             "near_duplicates": near_duplicates,
+         }
+
+         return ret
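End to end, the usable surface is the constructor plus `evaluate`. A minimal usage sketch with synthetic data, importing from the internal module path shown in this diff (a public re-export may also exist in dataeval/detectors/__init__.py, whose contents are not shown here):

```python
import numpy as np
from dataeval._internal.detectors.clusterer import Clusterer

rng = np.random.default_rng(0)
data = rng.normal(size=(100, 16)).astype(np.float32)  # flattened images or embeddings
data[1] = data[0]                                     # plant an exact duplicate

results = Clusterer(data).evaluate()
print(results["duplicates"])   # expect the planted pair, e.g. [[0, 1]]
print(results["outliers"])     # indices flagged as outliers, if any
```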
dataeval/_internal/detectors/drift/__init__.py ADDED
File without changes