dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. dataeval/__init__.py +13 -9
  2. dataeval/_internal/detectors/clusterer.py +63 -49
  3. dataeval/_internal/detectors/drift/base.py +248 -51
  4. dataeval/_internal/detectors/drift/cvm.py +28 -26
  5. dataeval/_internal/detectors/drift/ks.py +31 -28
  6. dataeval/_internal/detectors/drift/mmd.py +62 -42
  7. dataeval/_internal/detectors/drift/torch.py +69 -60
  8. dataeval/_internal/detectors/drift/uncertainty.py +32 -32
  9. dataeval/_internal/detectors/duplicates.py +67 -31
  10. dataeval/_internal/detectors/ood/ae.py +15 -29
  11. dataeval/_internal/detectors/ood/aegmm.py +33 -27
  12. dataeval/_internal/detectors/ood/base.py +86 -47
  13. dataeval/_internal/detectors/ood/llr.py +34 -31
  14. dataeval/_internal/detectors/ood/vae.py +32 -31
  15. dataeval/_internal/detectors/ood/vaegmm.py +34 -28
  16. dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
  17. dataeval/_internal/flags.py +44 -21
  18. dataeval/_internal/interop.py +5 -3
  19. dataeval/_internal/metrics/balance.py +42 -5
  20. dataeval/_internal/metrics/ber.py +11 -8
  21. dataeval/_internal/metrics/coverage.py +15 -8
  22. dataeval/_internal/metrics/divergence.py +41 -7
  23. dataeval/_internal/metrics/diversity.py +57 -19
  24. dataeval/_internal/metrics/parity.py +141 -66
  25. dataeval/_internal/metrics/stats.py +330 -313
  26. dataeval/_internal/metrics/uap.py +33 -4
  27. dataeval/_internal/metrics/utils.py +79 -40
  28. dataeval/_internal/models/pytorch/autoencoder.py +127 -22
  29. dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
  30. dataeval/_internal/models/tensorflow/gmm.py +4 -2
  31. dataeval/_internal/models/tensorflow/losses.py +17 -13
  32. dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
  33. dataeval/_internal/models/tensorflow/trainer.py +10 -7
  34. dataeval/_internal/models/tensorflow/utils.py +23 -20
  35. dataeval/_internal/output.py +85 -0
  36. dataeval/_internal/utils.py +5 -3
  37. dataeval/_internal/workflows/sufficiency.py +122 -121
  38. dataeval/detectors/__init__.py +6 -25
  39. dataeval/detectors/drift/__init__.py +16 -0
  40. dataeval/detectors/drift/kernels/__init__.py +6 -0
  41. dataeval/detectors/drift/updates/__init__.py +3 -0
  42. dataeval/detectors/linters/__init__.py +5 -0
  43. dataeval/detectors/ood/__init__.py +11 -0
  44. dataeval/flags/__init__.py +2 -2
  45. dataeval/metrics/__init__.py +2 -26
  46. dataeval/metrics/bias/__init__.py +14 -0
  47. dataeval/metrics/estimators/__init__.py +9 -0
  48. dataeval/metrics/stats/__init__.py +6 -0
  49. dataeval/tensorflow/__init__.py +3 -0
  50. dataeval/tensorflow/loss/__init__.py +3 -0
  51. dataeval/tensorflow/models/__init__.py +5 -0
  52. dataeval/tensorflow/recon/__init__.py +3 -0
  53. dataeval/torch/__init__.py +3 -0
  54. dataeval/{models/torch → torch/models}/__init__.py +1 -2
  55. dataeval/torch/trainer/__init__.py +3 -0
  56. dataeval/utils/__init__.py +3 -6
  57. dataeval/workflows/__init__.py +2 -4
  58. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
  59. dataeval-0.66.0.dist-info/RECORD +72 -0
  60. dataeval/_internal/metrics/base.py +0 -10
  61. dataeval/models/__init__.py +0 -15
  62. dataeval/models/tensorflow/__init__.py +0 -6
  63. dataeval-0.64.0.dist-info/RECORD +0 -60
  64. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
  65. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED
@@ -1,18 +1,22 @@
+__version__ = "0.66.0"
+
 from importlib.util import find_spec
 
-from . import detectors, flags, metrics
+_IS_TORCH_AVAILABLE = find_spec("torch") is not None
+_IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("tensorflow_probability") is not None
+
+del find_spec
 
-__version__ = "0.64.0"
+from . import detectors, flags, metrics  # noqa: E402
 
 __all__ = ["detectors", "flags", "metrics"]
 
-if find_spec("torch") is not None:  # pragma: no cover
-    from . import models, utils, workflows
+if _IS_TORCH_AVAILABLE:  # pragma: no cover
+    from . import torch, utils, workflows
 
-    __all__ += ["models", "utils", "workflows"]
-elif find_spec("tensorflow") is not None:  # pragma: no cover
-    from . import models
+    __all__ += ["torch", "utils", "workflows"]
 
-    __all__ += ["models"]
+if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+    from . import tensorflow
 
-del find_spec
+    __all__ += ["tensorflow"]
dataeval/_internal/detectors/clusterer.py CHANGED
@@ -1,26 +1,52 @@
-from typing import Dict, Iterable, List, NamedTuple, Tuple, Union, cast
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, NamedTuple, cast
 
 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
 from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import flatten
+from dataeval._internal.output import OutputMetadata, set_metadata
 
 
-def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
+@dataclass(frozen=True)
+class ClustererOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    outliers : List[int]
+        Indices that do not fall within a cluster
+    potential_outliers : List[int]
+        Indices which are near the border between belonging in the cluster and being an outlier
+    duplicates : List[List[int]]
+        Groups of indices that are exact duplicates
+    potential_duplicates : List[List[int]]
+        Groups of indices which are not exact but closely related data points
+    """
+
+    outliers: list[int]
+    potential_outliers: list[int]
+    duplicates: list[list[int]]
+    potential_duplicates: list[list[int]]
+
+
+def extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
 
     Parameters
     ----------
-    link_arr : np.ndarray
+    link_arr : NDArray
         linkage matrix
 
     Returns
     -------
-    np.ndarray
+    NDArray
         linkage matrix with adjusted shape, new shape (link_arr.shape[0], link_arr.shape[1]+1)
     """
     # Adjusting linkage matrix to accommodate renumbering
@@ -35,7 +61,7 @@ def extend_linkage(link_arr: np.ndarray) -> np.ndarray:
 class Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples: np.ndarray, sample_dist: Union[float, np.ndarray], is_copy: bool = False):
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -57,7 +83,7 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) -> "Cluster":
+    def copy(self) -> Cluster:
         return Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
@@ -70,7 +96,7 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class Clusters(Dict[int, Dict[int, Cluster]]):
+class Clusters(dict[int, dict[int, Cluster]]):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.max_level: int = 1
@@ -92,10 +118,10 @@ class ClusterMergeEntry:
         self.inner_cluster = inner_cluster
         self.status = status
 
-    def __lt__(self, value: "ClusterMergeEntry") -> bool:
+    def __lt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value: "ClusterMergeEntry") -> bool:
+    def __gt__(self, value: ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
@@ -130,13 +156,13 @@ class Clusterer:
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data: np.ndarray = to_numpy(dataset)
+        self._data: NDArray = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr: np.ndarray = pdist(self._data, metric="euclidean")
-        self._sqdmat: np.ndarray = squareform(self._darr)
-        self._larr: np.ndarray = extend_linkage(linkage(self._darr))
+        self._darr: NDArray = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray = squareform(self._darr)
+        self._larr: NDArray = extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
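`_on_init` now passes the input through `flatten` before computing pairwise distances, so non-tabular datasets (e.g. image stacks) reduce to the 2-D (n_samples, n_features) matrix that `pdist` requires. The helper's actual implementation in `dataeval._internal.metrics.utils` is not shown in this diff; a minimal stand-in with the presumed behavior:

import numpy as np
from numpy.typing import NDArray

def flatten(x: NDArray) -> NDArray:
    # Collapse every per-sample axis: (N, H, W, C) -> (N, H*W*C);
    # 2-D input passes through unchanged.
    return x.reshape(x.shape[0], -1)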
@@ -146,7 +172,7 @@ class Clusterer:
         self._last_good_merge_levels = None
 
     @property
-    def data(self) -> np.ndarray:
+    def data(self) -> NDArray:
         return self._data
 
     @data.setter
@@ -160,16 +186,16 @@
         return self._clusters
 
     @property
-    def last_good_merge_levels(self) -> Dict[int, int]:
+    def last_good_merge_levels(self) -> dict[int, int]:
         if self._last_good_merge_levels is None:
             self._last_good_merge_levels = self._get_last_merge_levels()
         return self._last_good_merge_levels
 
     @classmethod
-    def _validate_data(cls, x: np.ndarray):
+    def _validate_data(cls, x: NDArray):
         """Checks that the data has the correct size, shape, and format"""
         if not isinstance(x, np.ndarray):
-            raise TypeError(f"Data should be of type np.ndarray; got {type(x)}")
+            raise TypeError(f"Data should be of type NDArray; got {type(x)}")
 
         if x.ndim != 2:
             raise ValueError(
@@ -184,7 +210,7 @@ class Clusterer:
     def _create_clusters(self) -> Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map: Dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
         clusters: Clusters = Clusters()
 
         # Walking through the linkage array to generate clusters
@@ -212,7 +238,7 @@
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
-                child, other_id = cast(Tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -240,7 +266,7 @@
                 clusters[level_id].setdefault(cid, cluster)
         return clusters
 
-    def _get_cluster_distances(self) -> np.ndarray:
+    def _get_cluster_distances(self) -> NDArray:
         """Calculates the minimum distances between clusters are each level"""
         # Cluster distance matrix
         max_level = self.clusters.max_level
@@ -261,7 +287,7 @@
 
         return cluster_matrix
 
-    def _calc_merge_indices(self, merge_mean: List[np.ndarray], intra_max: List[float]) -> np.ndarray:
+    def _calc_merge_indices(self, merge_mean: list[NDArray], intra_max: list[float]) -> NDArray:
         """
         Determine what clusters should be merged and return their indices
         """
@@ -284,7 +310,7 @@
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix: np.ndarray) -> List[ClusterMergeEntry]:
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -301,7 +327,7 @@
         """
         intra_max = []
         merge_mean = []
-        merge_list: List[ClusterMergeEntry] = []
+        merge_list: list[ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -339,7 +365,7 @@
 
         return merge_list
 
-    def _get_last_merge_levels(self) -> Dict[int, int]:
+    def _get_last_merge_levels(self) -> dict[int, int]:
         """
         Creates a dictionary for important cluster ids mapped to their last good merge level
 
@@ -348,7 +374,7 @@
         Dict[int, int]
             A mapping of a cluster id to its last good merge level
         """
-        last_merge_levels: Dict[int, int] = {}
+        last_merge_levels: dict[int, int] = {}
 
         if self._max_clusters <= 1:
             last_merge_levels = {0: int(self._num_samples * 0.1)}
@@ -371,7 +397,7 @@
 
         return last_merge_levels
 
-    def find_outliers(self, last_merge_levels: Dict[int, int]) -> Tuple[List[int], List[int]]:
+    def find_outliers(self, last_merge_levels: dict[int, int]) -> tuple[list[int], list[int]]:
         """
         Retrieves outliers based on when the sample was added to the cluster
         and how far it was from the cluster when it was added
@@ -415,9 +441,9 @@
 
         return sorted(outliers), sorted(possible_outliers)
 
-    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> List[List[int]]:
+    def _sorted_union_find(self, index_groups: Iterable[Iterable[int]]) -> list[list[int]]:
         """Merges and sorts groups of indices that share any common index"""
-        groups: List[List[int]] = []
+        groups: list[list[int]] = []
         for indices in zip(*index_groups):
             indices = set(indices)
             temp = []
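Only part of `_sorted_union_find` is visible in this hunk; per its docstring it merges any index groups that share a member and returns them sorted. An independent illustration of that behavior (not dataeval's implementation):

def merge_groups(groups: list[list[int]]) -> list[list[int]]:
    merged: list[set[int]] = []
    for g in map(set, groups):
        # Fold every existing group that overlaps g into g
        for s in [s for s in merged if s & g]:
            g |= s
            merged.remove(s)
        merged.append(g)
    return sorted(sorted(s) for s in merged)

assert merge_groups([[0, 1], [1, 2], [5, 6]]) == [[0, 1, 2], [5, 6]]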
@@ -430,7 +456,7 @@
             groups = temp
         return sorted(groups)
 
-    def find_duplicates(self, last_merge_levels: Dict[int, int]) -> Tuple[List[List[int]], List[List[int]]]:
+    def find_duplicates(self, last_merge_levels: dict[int, int]) -> tuple[list[list[int]], list[list[int]]]:
         """
         Finds duplicate and near duplicate data based on the last good merge levels when building the cluster
 
@@ -464,35 +490,23 @@
 
         return exact_dupes, near_dupes
 
-    def evaluate(self):
+    # TODO: Move data input to evaluate from class
+    @set_metadata("dataeval.detectors", ["data"])
+    def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for outliers and duplicates
 
         Returns
         -------
-        Dict[str, List[int]]
-            outliers :
-                List of indices that do not fall within a cluster
-            potential_outliers :
-                List of indices which are near the border between belonging in the cluster and being an outlier
-            duplicates :
-                List of groups of indices that are exact duplicates
-            potential_duplicates :
-                List of groups of indices which are not exact but closely related data points
+        ClustererOutput
+            The outliers and duplicate indices found in the data
 
         Example
         -------
         >>> cluster.evaluate()
-        {'outliers': [18, 21, 34, 35, 45], 'potential_outliers': [13, 15, 42], 'duplicates': [[9, 24], [23, 48]], 'potential_duplicates': [[1, 11]]}
+        ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
         """  # noqa: E501
 
         outliers, potential_outliers = self.find_outliers(self.last_good_merge_levels)
         duplicates, potential_duplicates = self.find_duplicates(self.last_good_merge_levels)
 
-        ret = {
-            "outliers": outliers,
-            "potential_outliers": potential_outliers,
-            "duplicates": duplicates,
-            "potential_duplicates": potential_duplicates,
-        }
-
-        return ret
+        return ClustererOutput(outliers, potential_outliers, duplicates, potential_duplicates)
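`evaluate` now returns a frozen `ClustererOutput` dataclass instead of a plain dict, with call metadata captured by the new `set_metadata` decorator. A hedged usage sketch against the internal path shown in this diff (the public re-export location may differ; the data is illustrative):

import numpy as np
from dataeval._internal.detectors.clusterer import Clusterer

data = np.random.default_rng(0).random((50, 4))  # any 2-D ArrayLike
result = Clusterer(data).evaluate()
print(result.outliers, result.potential_outliers)
# dict-style access from 0.64.0, e.g. result["outliers"], no longer applies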