dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +9 -10
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
  16. dataeval/detectors/ood/__init__.py +6 -6
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
  18. dataeval/detectors/ood/aegmm.py +66 -0
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
  25. dataeval/detectors/ood/vaegmm.py +75 -0
  26. dataeval/interop.py +56 -0
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
  32. dataeval/metrics/bias/metadata.py +358 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +8 -3
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/lazy.py +26 -0
  51. dataeval/utils/metadata.py +258 -0
  52. dataeval/utils/shared.py +151 -0
  53. dataeval/{_internal → utils}/split_dataset.py +98 -33
  54. dataeval/utils/tensorflow/__init__.py +7 -6
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
  56. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
  57. dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
  58. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
  59. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
  60. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  61. dataeval/utils/torch/__init__.py +7 -3
  62. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  63. dataeval/{_internal → utils/torch}/datasets.py +48 -42
  64. dataeval/utils/torch/models.py +138 -0
  65. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
  66. dataeval/{_internal → utils/torch}/utils.py +3 -1
  67. dataeval/workflows/__init__.py +1 -1
  68. dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
  69. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
  70. dataeval-0.73.0.dist-info/RECORD +73 -0
  71. dataeval/_internal/detectors/__init__.py +0 -0
  72. dataeval/_internal/detectors/drift/__init__.py +0 -0
  73. dataeval/_internal/detectors/ood/__init__.py +0 -0
  74. dataeval/_internal/detectors/ood/aegmm.py +0 -78
  75. dataeval/_internal/detectors/ood/vaegmm.py +0 -89
  76. dataeval/_internal/interop.py +0 -49
  77. dataeval/_internal/metrics/__init__.py +0 -0
  78. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  79. dataeval/_internal/metrics/utils.py +0 -447
  80. dataeval/_internal/models/__init__.py +0 -0
  81. dataeval/_internal/models/pytorch/__init__.py +0 -0
  82. dataeval/_internal/models/pytorch/utils.py +0 -67
  83. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  84. dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
  85. dataeval/_internal/workflows/__init__.py +0 -0
  86. dataeval/detectors/drift/kernels/__init__.py +0 -10
  87. dataeval/detectors/drift/updates/__init__.py +0 -8
  88. dataeval/utils/tensorflow/models/__init__.py +0 -9
  89. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  90. dataeval/utils/torch/datasets/__init__.py +0 -12
  91. dataeval/utils/torch/models/__init__.py +0 -11
  92. dataeval/utils/torch/trainer/__init__.py +0 -7
  93. dataeval-0.72.1.dist-info/RECORD +0 -81
  94. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
  95. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
dataeval/{_internal/detectors → detectors/linters}/clusterer.py
@@ -1,16 +1,18 @@
  from __future__ import annotations

+ __all__ = ["ClustererOutput", "Clusterer"]
+
  from dataclasses import dataclass
- from typing import Iterable, NamedTuple, cast
+ from typing import Any, Iterable, NamedTuple, cast

  import numpy as np
  from numpy.typing import ArrayLike, NDArray
  from scipy.cluster.hierarchy import linkage
  from scipy.spatial.distance import pdist, squareform

- from dataeval._internal.interop import to_numpy
- from dataeval._internal.metrics.utils import flatten
- from dataeval._internal.output import OutputMetadata, set_metadata
+ from dataeval.interop import to_numpy
+ from dataeval.output import OutputMetadata, set_metadata
+ from dataeval.utils.shared import flatten


  @dataclass(frozen=True)
@@ -36,7 +38,7 @@ class ClustererOutput(OutputMetadata):
      potential_duplicates: list[list[int]]


- def extend_linkage(link_arr: NDArray) -> NDArray:
+ def _extend_linkage(link_arr: NDArray) -> NDArray:
      """
      Adds a column to the linkage matrix link_arr that tracks the new id assigned
      to each row
@@ -60,10 +62,10 @@ def extend_linkage(link_arr: NDArray) -> NDArray:
      return arr


- class Cluster:
+ class _Cluster:
      __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"

-     def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
+     def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
          self.merged = merged
          self.samples = np.array(samples, dtype=np.int32)
          self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -85,8 +87,8 @@ class Cluster:
          self.out1 = dist > out1
          self.out2 = dist > out2

-     def copy(self) -> Cluster:
-         return Cluster(False, self.samples, self.sample_dist, True)
+     def copy(self) -> _Cluster:
+         return _Cluster(False, self.samples, self.sample_dist, True)

      def __repr__(self) -> str:
          _params = {
@@ -98,32 +100,32 @@ class Cluster:
          return f"{self.__class__.__name__}(**{repr(_params)})"


- class Clusters(dict[int, dict[int, Cluster]]):
-     def __init__(self, *args, **kwargs):
-         super().__init__(*args, **kwargs)
+ class _Clusters(dict[int, dict[int, _Cluster]]):
+     def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
+         super().__init__(*args)
          self.max_level: int = 1


- class ClusterPosition(NamedTuple):
+ class _ClusterPosition(NamedTuple):
      """Keeps track of a cluster's level and ID"""

      level: int
      cid: int


- class ClusterMergeEntry:
+ class _ClusterMergeEntry:
      __slots__ = "level", "outer_cluster", "inner_cluster", "status"

-     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+     def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
          self.level = level
          self.outer_cluster = outer_cluster
          self.inner_cluster = inner_cluster
          self.status = status

-     def __lt__(self, value: ClusterMergeEntry) -> bool:
+     def __lt__(self, value: _ClusterMergeEntry) -> bool:
          return self.level.__lt__(value.level)

-     def __gt__(self, value: ClusterMergeEntry) -> bool:
+     def __gt__(self, value: _ClusterMergeEntry) -> bool:
          return self.level.__gt__(value.level)


@@ -153,36 +155,36 @@ class Clusterer:
      >>> cluster = Clusterer(dataset)
      """

-     def __init__(self, dataset: ArrayLike):
+     def __init__(self, dataset: ArrayLike) -> None:
          # Allows an update to dataset to reset the state rather than instantiate a new class
          self._on_init(dataset)

      def _on_init(self, dataset: ArrayLike):
-         self._data: NDArray = flatten(to_numpy(dataset))
+         self._data: NDArray[Any] = flatten(to_numpy(dataset))
          self._validate_data(self._data)
          self._num_samples = len(self._data)

-         self._darr: NDArray = pdist(self._data, metric="euclidean")
-         self._sqdmat: NDArray = squareform(self._darr)
-         self._larr: NDArray = extend_linkage(linkage(self._darr))
+         self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
+         self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
+         self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
          self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)

          min_num = int(self._num_samples * 0.05)
-         self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+         self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)

-         self._clusters = None
-         self._last_good_merge_levels = None
+         self._clusters: _Clusters | None = None
+         self._last_good_merge_levels: dict[int, int] | None = None

      @property
-     def data(self) -> NDArray:
+     def data(self) -> NDArray[Any]:
          return self._data

      @data.setter
-     def data(self, x: ArrayLike):
+     def data(self, x: ArrayLike) -> None:
          self._on_init(x)

      @property
-     def clusters(self) -> Clusters:
+     def clusters(self) -> _Clusters:
          if self._clusters is None:
              self._clusters = self._create_clusters()
          return self._clusters
@@ -209,11 +211,11 @@ class Clusterer:
          if features < 1:
              raise ValueError(f"Samples should have at least 1 feature; got {features}")

-     def _create_clusters(self) -> Clusters:
+     def _create_clusters(self) -> _Clusters:
          """Generates clusters based on linkage matrix"""
          next_cluster_id = 0
-         cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-         clusters: Clusters = Clusters()
+         cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+         clusters: _Clusters = _Clusters()

          # Walking through the linkage array to generate clusters
          for arr_i in self._larr:
@@ -240,7 +242,7 @@
                  # Update clusters to include previously skipped levels
                  clusters = self._fill_levels(clusters, left, right)
              elif left or right:
-                 child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                 child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
                  cc = clusters[child.level][child.cid]
                  samples = np.concatenate([cc.samples, [other_id]])
                  sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -254,12 +256,12 @@
              if level not in clusters:
                  clusters[level] = {}

-             clusters[level][cid] = Cluster(merged, samples, sample_dist)
-             cluster_map[int(arr_i[-1])] = ClusterPosition(level, cid)
+             clusters[level][cid] = _Cluster(merged, samples, sample_dist)
+             cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)

          return clusters

-     def _fill_levels(self, clusters: Clusters, left: ClusterPosition, right: ClusterPosition) -> Clusters:
+     def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
          # Sets each level's cluster info if it does not exist
          if left.level != right.level:
              (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
@@ -312,7 +314,7 @@
          mask2 = mask2_vals < one_std_check
          return np.logical_or(desired_merge, mask2)

-     def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
+     def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
          """
          Runs through the clusters dictionary determining when clusters merge,
          and how close are those clusters when they merge.
@@ -329,7 +331,7 @@
          """
          intra_max = []
          merge_mean = []
-         merge_list: list[ClusterMergeEntry] = []
+         merge_list: list[_ClusterMergeEntry] = []

          for level, cluster_set in self.clusters.items():
              for outer_cluster, cluster in cluster_set.items():
@@ -356,7 +358,7 @@
                  # Calculate the corresponding distance stats
                  distance_stats_arr = aggregate_func(distances)
                  merge_mean.append(distance_stats_arr)
-                 merge_list.append(ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
+                 merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))

          all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)

@@ -493,7 +495,7 @@
          return exact_dupes, near_dupes

      # TODO: Move data input to evaluate from class
-     @set_metadata("dataeval.detectors", ["data"])
+     @set_metadata(["data"])
      def evaluate(self) -> ClustererOutput:
          """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
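The net change to this file: the public surface shrinks to ClustererOutput and Clusterer (every helper gains a leading underscore), imports move off the retired _internal package, and annotations are tightened. A minimal usage sketch against the 0.73.0 layout, assuming Clusterer is re-exported from dataeval.detectors.linters as the relocation suggests (the random embeddings array is illustrative, not from the package):

    import numpy as np
    from dataeval.detectors.linters import Clusterer

    # 100 flattened samples with 16 features each
    embeddings = np.random.default_rng(0).normal(size=(100, 16))

    clusterer = Clusterer(embeddings)
    result = clusterer.evaluate()  # returns a ClustererOutput dataclass
    print(result.potential_duplicates)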
 
dataeval/{_internal/detectors → detectors/linters}/duplicates.py
@@ -1,13 +1,15 @@
  from __future__ import annotations

+ __all__ = ["DuplicatesOutput", "Duplicates"]
+
  from dataclasses import dataclass
- from typing import Generic, Iterable, Sequence, TypeVar
+ from typing import Generic, Iterable, Sequence, TypeVar, overload

  from numpy.typing import ArrayLike

- from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
- from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
- from dataeval._internal.output import OutputMetadata, set_metadata
+ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+ from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+ from dataeval.output import OutputMetadata, set_metadata

  DuplicateGroup = list[int]
  DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -58,7 +60,7 @@ class Duplicates:
      >>> exact_dupes = Duplicates(only_exact=True)
      """

-     def __init__(self, only_exact: bool = False):
+     def __init__(self, only_exact: bool = False) -> None:
          self.stats: HashStatsOutput
          self.only_exact = only_exact

@@ -81,8 +83,16 @@
              "near": sorted(near),
          }

-     @set_metadata("dataeval.detectors", ["only_exact"])
-     def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
+     @overload
+     def from_stats(self, hashes: HashStatsOutput) -> DuplicatesOutput[DuplicateGroup]: ...
+
+     @overload
+     def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...
+
+     @set_metadata(["only_exact"])
+     def from_stats(
+         self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
+     ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
          """
          Returns duplicate image indices for both exact matches and near matches

@@ -128,8 +138,8 @@

          return DuplicatesOutput(**duplicates)

-     @set_metadata("dataeval.detectors", ["only_exact"])
-     def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+     @set_metadata(["only_exact"])
+     def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
          """
          Returns duplicate image indices for both exact matches and near matches
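The overloads added here let type checkers narrow the result of from_stats: a single HashStatsOutput yields DuplicatesOutput[DuplicateGroup], while a sequence yields DuplicatesOutput[DatasetDuplicateGroupMap] with groups keyed back to their source dataset. A sketch under the assumption that hashstats is re-exported from dataeval.metrics.stats (the dataset variables are illustrative):

    from dataeval.detectors.linters import Duplicates
    from dataeval.metrics.stats import hashstats

    dupes = Duplicates()

    # One stats output in: flat duplicate groups out
    single = dupes.from_stats(hashstats(dataset_a))

    # A sequence in: duplicate groups mapped back to each source dataset
    multi = dupes.from_stats([hashstats(dataset_a), hashstats(dataset_b)])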
 
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py
@@ -1,11 +1,13 @@
  from __future__ import annotations

+ __all__ = []
+
  from copy import deepcopy
  from typing import Sequence, TypeVar

  import numpy as np

- from dataeval._internal.metrics.stats.base import BaseStatsOutput
+ from dataeval.metrics.stats.base import BaseStatsOutput

  TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
dataeval/{_internal/detectors → detectors/linters}/outliers.py
@@ -1,18 +1,20 @@
  from __future__ import annotations

+ __all__ = ["OutliersOutput", "Outliers"]
+
  from dataclasses import dataclass
  from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload

  import numpy as np
  from numpy.typing import ArrayLike, NDArray

- from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
- from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
- from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
- from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
- from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
- from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
- from dataeval._internal.output import OutputMetadata, set_metadata
+ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+ from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+ from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+ from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+ from dataeval.metrics.stats.pixelstats import PixelStatsOutput
+ from dataeval.metrics.stats.visualstats import VisualStatsOutput
+ from dataeval.output import OutputMetadata, set_metadata

  IndexIssueMap = dict[int, dict[str, float]]
  OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
@@ -37,7 +39,7 @@ class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):

      issues: TIndexIssueMap

-     def __len__(self):
+     def __len__(self) -> int:
          if isinstance(self.issues, dict):
              return len(self.issues)
          else:
@@ -157,10 +159,10 @@ class Outliers:
      @overload
      def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...

-     @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
+     @set_metadata(["outlier_method", "outlier_threshold"])
      def from_stats(
          self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
-     ) -> OutliersOutput:
+     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
          """
          Returns indices of Outliers with the issues identified for each

@@ -195,7 +197,7 @@
          {}
          """  # noqa: E501
          if isinstance(stats, DatasetStatsOutput):
-             outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
+             outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
              return OutliersOutput(outliers)

          if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
@@ -226,16 +228,7 @@

          return OutliersOutput(output_list)

-     @set_metadata(
-         "dataeval.detectors",
-         [
-             "use_dimension",
-             "use_pixel",
-             "use_visual",
-             "outlier_method",
-             "outlier_threshold",
-         ],
-     )
+     @set_metadata(["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
      def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
          """
          Returns indices of Outliers with the issues identified for each
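Outliers.from_stats gets the same overload-based narrowing as Duplicates.from_stats: one stats output in, a single index-to-issues map out; a sequence in, a list of maps out, one per source. A sketch assuming pixelstats is re-exported from dataeval.metrics.stats and takes an iterable of images (exact stats-function signatures are not shown in this diff; the image arrays are illustrative):

    from dataeval.detectors.linters import Outliers
    from dataeval.metrics.stats import pixelstats

    outliers = Outliers()

    # Single stats output: OutliersOutput[IndexIssueMap]
    result = outliers.from_stats(pixelstats(images))

    # Sequence of stats outputs: OutliersOutput[list[IndexIssueMap]]
    per_dataset = outliers.from_stats([pixelstats(images_a), pixelstats(images_b)])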
dataeval/detectors/ood/__init__.py
@@ -5,11 +5,11 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
  from dataeval import _IS_TENSORFLOW_AVAILABLE

  if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-     from dataeval._internal.detectors.ood.ae import OOD_AE
-     from dataeval._internal.detectors.ood.aegmm import OOD_AEGMM
-     from dataeval._internal.detectors.ood.base import OODOutput, OODScoreOutput
-     from dataeval._internal.detectors.ood.llr import OOD_LLR
-     from dataeval._internal.detectors.ood.vae import OOD_VAE
-     from dataeval._internal.detectors.ood.vaegmm import OOD_VAEGMM
+     from dataeval.detectors.ood.ae import OOD_AE
+     from dataeval.detectors.ood.aegmm import OOD_AEGMM
+     from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+     from dataeval.detectors.ood.llr import OOD_LLR
+     from dataeval.detectors.ood.vae import OOD_VAE
+     from dataeval.detectors.ood.vaegmm import OOD_VAEGMM

  __all__ = ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM", "OODOutput", "OODScoreOutput"]
dataeval/{_internal/detectors → detectors}/ood/ae.py
@@ -8,18 +8,27 @@ Licensed under Apache Software License (Apache 2.0)

  from __future__ import annotations

- from typing import Callable
+ __all__ = ["OOD_AE"]
+
+ from typing import TYPE_CHECKING, Callable

  import numpy as np
- import tensorflow as tf
- import tf_keras as keras
  from numpy.typing import ArrayLike

- from dataeval._internal.detectors.ood.base import OODBase, OODScoreOutput
- from dataeval._internal.interop import as_numpy
- from dataeval._internal.models.tensorflow.autoencoder import AE
- from dataeval._internal.models.tensorflow.utils import predict_batch
- from dataeval._internal.output import set_metadata
+ from dataeval.detectors.ood.base import OODBase, OODScoreOutput
+ from dataeval.interop import as_numpy
+ from dataeval.utils.lazy import lazyload
+ from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+     import tf_keras as keras
+
+     import dataeval.utils.tensorflow._internal.models as tf_models
+ else:
+     tf = lazyload("tensorflow")
+     keras = lazyload("tf_keras")
+     tf_models = lazyload("dataeval.utils.tensorflow._internal.models")


  class OOD_AE(OODBase):
@@ -32,7 +41,7 @@ class OOD_AE(OODBase):
          An :term:`autoencoder<Autoencoder>` model.
      """

-     def __init__(self, model: AE) -> None:
+     def __init__(self, model: tf_models.AE) -> None:
          super().__init__(model)

      def fit(
@@ -40,7 +49,7 @@ class OOD_AE(OODBase):
          x_ref: ArrayLike,
          threshold_perc: float = 100.0,
          loss_fn: Callable[..., tf.Tensor] | None = None,
-         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+         optimizer: keras.optimizers.Optimizer | None = None,
          epochs: int = 20,
          batch_size: int = 64,
          verbose: bool = True,
@@ -49,8 +58,7 @@
              loss_fn = keras.losses.MeanSquaredError()
          super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

-     @set_metadata("dataeval.detectors")
-     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+     def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
          self._validate(X := as_numpy(X))

          # reconstruct instances
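The TYPE_CHECKING/lazyload split removes tensorflow and tf_keras from import time: type checkers still see the real modules, while at runtime each name is a proxy that performs the import on first attribute access. dataeval/utils/lazy.py is new in this release and its body does not appear in this diff; the following is only a sketch of how such a helper is commonly written, not the package's actual code:

    import importlib
    from typing import Any

    class _LazyModule:
        # Defers the real import until an attribute is first accessed
        def __init__(self, name: str) -> None:
            self._name = name
            self._module: Any = None

        def __getattr__(self, attr: str) -> Any:
            if self._module is None:
                self._module = importlib.import_module(self._name)
            return getattr(self._module, attr)

    def lazyload(name: str) -> Any:
        return _LazyModule(name)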
dataeval/detectors/ood/aegmm.py (new file)
@@ -0,0 +1,66 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from __future__ import annotations
+
+ __all__ = ["OOD_AEGMM"]
+
+ from typing import TYPE_CHECKING, Callable
+
+ from numpy.typing import ArrayLike
+
+ from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
+ from dataeval.interop import to_numpy
+ from dataeval.utils.lazy import lazyload
+ from dataeval.utils.tensorflow._internal.gmm import gmm_energy
+ from dataeval.utils.tensorflow._internal.loss import LossGMM
+ from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+     import tf_keras as keras
+
+     import dataeval.utils.tensorflow._internal.models as tf_models
+ else:
+     tf = lazyload("tensorflow")
+     keras = lazyload("tf_keras")
+     tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
+
+ class OOD_AEGMM(OODGMMBase):
+     """
+     AE with Gaussian Mixture Model based outlier detector.
+
+     Parameters
+     ----------
+     model : AEGMM
+         An AEGMM model.
+     """
+
+     def __init__(self, model: tf_models.AEGMM) -> None:
+         super().__init__(model)
+
+     def fit(
+         self,
+         x_ref: ArrayLike,
+         threshold_perc: float = 100.0,
+         loss_fn: Callable[..., tf.Tensor] | None = None,
+         optimizer: keras.optimizers.Optimizer | None = None,
+         epochs: int = 20,
+         batch_size: int = 64,
+         verbose: bool = True,
+     ) -> None:
+         if loss_fn is None:
+             loss_fn = LossGMM()
+         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+     def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+         self._validate(X := to_numpy(X))
+         _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
+         energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+         return OODScoreOutput(energy.numpy())  # type: ignore
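The detector workflow itself is unchanged by the move to a public module: fit on reference data, then score or predict. A usage sketch (the AEGMM model construction and image arrays are illustrative; the defaults shown come from the fit signature above):

    from dataeval.detectors.ood import OOD_AEGMM

    detector = OOD_AEGMM(aegmm_model)      # aegmm_model: a tf_models.AEGMM instance
    detector.fit(train_images)             # defaults: threshold_perc=100.0, 20 epochs
    scores = detector.score(test_images)   # OODScoreOutput, via the new _score hook
    preds = detector.predict(test_images)  # thresholded OOD flags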
dataeval/{_internal/detectors → detectors}/ood/base.py
@@ -8,19 +8,27 @@ Licensed under Apache Software License (Apache 2.0)

  from __future__ import annotations

+ __all__ = ["OODOutput", "OODScoreOutput"]
+
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
- from typing import Callable, Literal, cast
+ from typing import TYPE_CHECKING, Callable, Literal, cast

  import numpy as np
- import tensorflow as tf
- import tf_keras as keras
  from numpy.typing import ArrayLike, NDArray

- from dataeval._internal.interop import to_numpy
- from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
- from dataeval._internal.models.tensorflow.trainer import trainer
- from dataeval._internal.output import OutputMetadata, set_metadata
+ from dataeval.interop import to_numpy
+ from dataeval.output import OutputMetadata, set_metadata
+ from dataeval.utils.lazy import lazyload
+ from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
+ from dataeval.utils.tensorflow._internal.trainer import trainer
+
+ if TYPE_CHECKING:
+     import tensorflow as tf
+     import tf_keras as keras
+ else:
+     tf = lazyload("tensorflow")
+     keras = lazyload("tf_keras")


  @dataclass(frozen=True)
@@ -61,7 +69,7 @@ class OODScoreOutput(OutputMetadata):
      instance_score: NDArray[np.float32]
      feature_score: NDArray[np.float32] | None = None

-     def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
+     def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
          """
          Returns either the instance or feature score

@@ -107,6 +115,9 @@ class OODBase(ABC):
          self._validate(X)

      @abstractmethod
+     def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput: ...
+
+     @set_metadata()
      def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
          """
          Compute the :term:`out of distribution<Out-of-distribution (OOD)>` scores for a given dataset.
@@ -124,6 +135,7 @@
          OODScoreOutput
              An object containing the instance-level and feature-level OOD scores.
          """
+         return self._score(X, batch_size)

      def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
          return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
@@ -131,12 +143,12 @@
      def fit(
          self,
          x_ref: ArrayLike,
-         threshold_perc: float = 100.0,
-         loss_fn: Callable[..., tf.Tensor] | None = None,
-         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
-         epochs: int = 20,
-         batch_size: int = 64,
-         verbose: bool = True,
+         threshold_perc: float,
+         loss_fn: Callable[..., tf.Tensor],
+         optimizer: keras.optimizers.Optimizer,
+         epochs: int,
+         batch_size: int,
+         verbose: bool,
      ) -> None:
          """
          Train the model and infer the threshold value.
@@ -174,7 +186,7 @@
          self._ref_score = self.score(x_ref, batch_size)
          self._threshold_perc = threshold_perc

-     @set_metadata("dataeval.detectors")
+     @set_metadata()
      def predict(
          self,
          X: ArrayLike,
@@ -218,12 +230,12 @@ class OODGMMBase(OODBase):
      def fit(
          self,
          x_ref: ArrayLike,
-         threshold_perc: float = 100.0,
-         loss_fn: Callable[..., tf.Tensor] | None = None,
-         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
-         epochs: int = 20,
-         batch_size: int = 64,
-         verbose: bool = True,
+         threshold_perc: float,
+         loss_fn: Callable[..., tf.Tensor],
+         optimizer: keras.optimizers.Optimizer,
+         epochs: int,
+         batch_size: int,
+         verbose: bool,
      ) -> None:
          # Train the model
          trainer(
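Two patterns are worth noting in this file. First, score() changes from an abstract method to a concrete wrapper around a new abstract _score(): subclasses implement only the math, and the @set_metadata() capture happens once in the base class, which is why OOD_AE and OOD_AEGMM above now define _score instead of score. Second, the base-class fit() loses its defaults because the public subclasses now own them, and set_metadata() everywhere drops its module-path string argument, keeping only the optional list of instance attributes to record. A minimal sketch of the template-method shape, with simplified names rather than the package's code:

    from abc import ABC, abstractmethod

    class Detector(ABC):
        @abstractmethod
        def _score(self, x: list[float]) -> float: ...  # subclasses implement the math

        def score(self, x: list[float]) -> float:
            # single place to wrap every implementation (metadata, validation, ...)
            return self._score(x)

    class L2Detector(Detector):
        def _score(self, x: list[float]) -> float:
            return sum(v * v for v in x) ** 0.5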