dataeval-0.72.1-py3-none-any.whl → dataeval-0.72.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +9 -10
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
  16. dataeval/detectors/ood/__init__.py +6 -6
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
  18. dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
  25. dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
  26. dataeval/{_internal/interop.py → interop.py} +12 -7
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
  32. dataeval/metrics/bias/metadata.py +275 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +7 -3
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/shared.py +151 -0
  51. dataeval/{_internal → utils}/split_dataset.py +98 -33
  52. dataeval/utils/tensorflow/__init__.py +7 -6
  53. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
  54. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
  56. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
  57. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
  58. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  59. dataeval/utils/torch/__init__.py +7 -3
  60. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  61. dataeval/{_internal → utils/torch}/datasets.py +48 -42
  62. dataeval/utils/torch/models.py +138 -0
  63. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
  64. dataeval/{_internal → utils/torch}/utils.py +3 -1
  65. dataeval/workflows/__init__.py +1 -1
  66. dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
  67. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
  68. dataeval-0.72.2.dist-info/RECORD +72 -0
  69. dataeval/_internal/detectors/__init__.py +0 -0
  70. dataeval/_internal/detectors/drift/__init__.py +0 -0
  71. dataeval/_internal/detectors/ood/__init__.py +0 -0
  72. dataeval/_internal/metrics/__init__.py +0 -0
  73. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  74. dataeval/_internal/metrics/utils.py +0 -447
  75. dataeval/_internal/models/__init__.py +0 -0
  76. dataeval/_internal/models/pytorch/__init__.py +0 -0
  77. dataeval/_internal/models/pytorch/utils.py +0 -67
  78. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  79. dataeval/_internal/workflows/__init__.py +0 -0
  80. dataeval/detectors/drift/kernels/__init__.py +0 -10
  81. dataeval/detectors/drift/updates/__init__.py +0 -8
  82. dataeval/utils/tensorflow/models/__init__.py +0 -9
  83. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  84. dataeval/utils/torch/datasets/__init__.py +0 -12
  85. dataeval/utils/torch/models/__init__.py +0 -11
  86. dataeval/utils/torch/trainer/__init__.py +0 -7
  87. dataeval-0.72.1.dist-info/RECORD +0 -81
  88. /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
  89. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
  90. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
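
The dominant change in 0.72.2 is the promotion of `dataeval._internal` modules to public paths: detectors move under `dataeval.detectors.*` (with the linters grouped under `dataeval.detectors.linters`), metrics under `dataeval.metrics.*`, and framework-specific model code under `dataeval.utils.tensorflow` and `dataeval.utils.torch`. A minimal migration sketch based on the renames listed above; whether the same names are also re-exported from higher-level packages is not visible in this diff:

    # dataeval 0.72.1 (paths removed in 0.72.2)
    from dataeval._internal.detectors.clusterer import Clusterer
    from dataeval._internal.detectors.duplicates import Duplicates
    from dataeval._internal.metrics.stats.hashstats import hashstats

    # dataeval 0.72.2
    from dataeval.detectors.linters.clusterer import Clusterer
    from dataeval.detectors.linters.duplicates import Duplicates
    from dataeval.metrics.stats.hashstats import hashstats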
dataeval/{_internal/detectors → detectors/linters}/clusterer.py
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+__all__ = ["ClustererOutput", "Clusterer"]
+
 from dataclasses import dataclass
-from typing import Iterable, NamedTuple, cast
+from typing import Any, Iterable, NamedTuple, cast
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
-from dataeval._internal.interop import to_numpy
-from dataeval._internal.metrics.utils import flatten
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import flatten
 
 
 @dataclass(frozen=True)
@@ -36,7 +38,7 @@ class ClustererOutput(OutputMetadata):
     potential_duplicates: list[list[int]]
 
 
-def extend_linkage(link_arr: NDArray) -> NDArray:
+def _extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
@@ -60,10 +62,10 @@ def extend_linkage(link_arr: NDArray) -> NDArray:
     return arr
 
 
-class Cluster:
+class _Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -85,8 +87,8 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) -> Cluster:
-        return Cluster(False, self.samples, self.sample_dist, True)
+    def copy(self) -> _Cluster:
+        return _Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
         _params = {
@@ -98,32 +100,32 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class Clusters(dict[int, dict[int, Cluster]]):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
+class _Clusters(dict[int, dict[int, _Cluster]]):
+    def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
+        super().__init__(*args)
         self.max_level: int = 1
 
 
-class ClusterPosition(NamedTuple):
+class _ClusterPosition(NamedTuple):
     """Keeps track of a cluster's level and ID"""
 
     level: int
     cid: int
 
 
-class ClusterMergeEntry:
+class _ClusterMergeEntry:
     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
 
-    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
        self.level = level
        self.outer_cluster = outer_cluster
        self.inner_cluster = inner_cluster
        self.status = status
 
-    def __lt__(self, value: ClusterMergeEntry) -> bool:
+    def __lt__(self, value: _ClusterMergeEntry) -> bool:
        return self.level.__lt__(value.level)
 
-    def __gt__(self, value: ClusterMergeEntry) -> bool:
+    def __gt__(self, value: _ClusterMergeEntry) -> bool:
        return self.level.__gt__(value.level)
 
 
@@ -153,36 +155,36 @@ class Clusterer:
     >>> cluster = Clusterer(dataset)
     """
 
-    def __init__(self, dataset: ArrayLike):
+    def __init__(self, dataset: ArrayLike) -> None:
         # Allows an update to dataset to reset the state rather than instantiate a new class
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data: NDArray = flatten(to_numpy(dataset))
+        self._data: NDArray[Any] = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr: NDArray = pdist(self._data, metric="euclidean")
-        self._sqdmat: NDArray = squareform(self._darr)
-        self._larr: NDArray = extend_linkage(linkage(self._darr))
+        self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
+        self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
-        self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+        self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
 
-        self._clusters = None
-        self._last_good_merge_levels = None
+        self._clusters: _Clusters | None = None
+        self._last_good_merge_levels: dict[int, int] | None = None
 
     @property
-    def data(self) -> NDArray:
+    def data(self) -> NDArray[Any]:
        return self._data
 
    @data.setter
-    def data(self, x: ArrayLike):
+    def data(self, x: ArrayLike) -> None:
        self._on_init(x)
 
    @property
-    def clusters(self) -> Clusters:
+    def clusters(self) -> _Clusters:
        if self._clusters is None:
            self._clusters = self._create_clusters()
        return self._clusters
@@ -209,11 +211,11 @@
         if features < 1:
             raise ValueError(f"Samples should have at least 1 feature; got {features}")
 
-    def _create_clusters(self) -> Clusters:
+    def _create_clusters(self) -> _Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map: dict[int, ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
-        clusters: Clusters = Clusters()
+        cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        clusters: _Clusters = _Clusters()
 
         # Walking through the linkage array to generate clusters
         for arr_i in self._larr:
@@ -240,7 +242,7 @@
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(tuple[ClusterPosition, int], (left, right_id) if left else (right, left_id))
+                child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -254,12 +256,12 @@
             if level not in clusters:
                 clusters[level] = {}
 
-            clusters[level][cid] = Cluster(merged, samples, sample_dist)
-            cluster_map[int(arr_i[-1])] = ClusterPosition(level, cid)
+            clusters[level][cid] = _Cluster(merged, samples, sample_dist)
+            cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
 
         return clusters
 
-    def _fill_levels(self, clusters: Clusters, left: ClusterPosition, right: ClusterPosition) -> Clusters:
+    def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
         # Sets each level's cluster info if it does not exist
         if left.level != right.level:
             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
@@ -312,7 +314,7 @@
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[ClusterMergeEntry]:
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -329,7 +331,7 @@
         """
         intra_max = []
         merge_mean = []
-        merge_list: list[ClusterMergeEntry] = []
+        merge_list: list[_ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -356,7 +358,7 @@
                 # Calculate the corresponding distance stats
                 distance_stats_arr = aggregate_func(distances)
                 merge_mean.append(distance_stats_arr)
-                merge_list.append(ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
+                merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
 
         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
 
@@ -493,7 +495,7 @@
         return exact_dupes, near_dupes
 
     # TODO: Move data input to evaluate from class
-    @set_metadata("dataeval.detectors", ["data"])
+    @set_metadata(["data"])
    def evaluate(self) -> ClustererOutput:
        """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
 
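A pattern repeated throughout these hunks: `set_metadata` no longer takes a leading module-path string, only the optional list of instance attributes to capture. How the decorator now derives the module is not shown here (output.py changed by +26/-6 in the file list above); a before/after sketch of the decoration itself:

    # 0.72.1
    @set_metadata("dataeval.detectors", ["data"])
    def evaluate(self) -> ClustererOutput: ...

    # 0.72.2
    @set_metadata(["data"])
    def evaluate(self) -> ClustererOutput: ...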
dataeval/{_internal/detectors → detectors/linters}/duplicates.py
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
+__all__ = ["DuplicatesOutput", "Duplicates"]
+
 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar
+from typing import Generic, Iterable, Sequence, TypeVar, overload
 
 from numpy.typing import ArrayLike
 
-from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+from dataeval.output import OutputMetadata, set_metadata
 
 DuplicateGroup = list[int]
 DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -58,7 +60,7 @@ class Duplicates:
     >>> exact_dupes = Duplicates(only_exact=True)
     """
 
-    def __init__(self, only_exact: bool = False):
+    def __init__(self, only_exact: bool = False) -> None:
         self.stats: HashStatsOutput
         self.only_exact = only_exact
 
@@ -81,8 +83,16 @@
             "near": sorted(near),
         }
 
-    @set_metadata("dataeval.detectors", ["only_exact"])
-    def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
+    @overload
+    def from_stats(self, hashes: HashStatsOutput) -> DuplicatesOutput[DuplicateGroup]: ...
+
+    @overload
+    def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...
+
+    @set_metadata(["only_exact"])
+    def from_stats(
+        self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
+    ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
        """
        Returns duplicate image indices for both exact matches and near matches
 
@@ -128,8 +138,8 @@
 
         return DuplicatesOutput(**duplicates)
 
-    @set_metadata("dataeval.detectors", ["only_exact"])
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+    @set_metadata(["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
        """
        Returns duplicate image indices for both exact matches and near matches
 
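The new `@overload` declarations let type checkers narrow `from_stats`: a single `HashStatsOutput` yields `DuplicatesOutput[DuplicateGroup]`, while a sequence yields `DuplicatesOutput[DatasetDuplicateGroupMap]`. A usage sketch, assuming `hashstats` accepts an iterable of images; `images_a` and `images_b` are placeholder datasets:

    from dataeval.detectors.linters.duplicates import Duplicates
    from dataeval.metrics.stats.hashstats import hashstats

    dupes = Duplicates()

    # one dataset: duplicate groups are flat lists of indices
    single = dupes.from_stats(hashstats(images_a))  # DuplicatesOutput[DuplicateGroup]

    # several datasets: groups map dataset index -> indices
    multi = dupes.from_stats([hashstats(images_a), hashstats(images_b)])  # DuplicatesOutput[DatasetDuplicateGroupMap]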
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+__all__ = []
+
 from copy import deepcopy
 from typing import Sequence, TypeVar
 
 import numpy as np
 
-from dataeval._internal.metrics.stats.base import BaseStatsOutput
+from dataeval.metrics.stats.base import BaseStatsOutput
 
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
 
dataeval/{_internal/detectors → detectors/linters}/outliers.py
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
+__all__ = ["OutliersOutput", "Outliers"]
+
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
-from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
-from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
-from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
-from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval.metrics.stats.visualstats import VisualStatsOutput
+from dataeval.output import OutputMetadata, set_metadata
 
 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
@@ -37,7 +39,7 @@ class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
 
     issues: TIndexIssueMap
 
-    def __len__(self):
+    def __len__(self) -> int:
         if isinstance(self.issues, dict):
             return len(self.issues)
         else:
@@ -157,10 +159,10 @@
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
 
-    @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
+    @set_metadata(["outlier_method", "outlier_threshold"])
     def from_stats(
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
-    ) -> OutliersOutput:
+    ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
        """
        Returns indices of Outliers with the issues identified for each
 
@@ -195,7 +197,7 @@
         {}
         """  # noqa: E501
         if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
+            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
             return OutliersOutput(outliers)
 
         if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
@@ -226,16 +228,7 @@
 
         return OutliersOutput(output_list)
 
-    @set_metadata(
-        "dataeval.detectors",
-        [
-            "use_dimension",
-            "use_pixel",
-            "use_visual",
-            "outlier_method",
-            "outlier_threshold",
-        ],
-    )
+    @set_metadata(["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
        """
        Returns indices of Outliers with the issues identified for each
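
`Outliers.from_stats` gets the same narrowing treatment as `Duplicates.from_stats`: the implementation signature now spells out both concrete return types. A sketch under the same assumptions; `stats_a` and `stats_b` are placeholder `OutlierStatsOutput` values produced elsewhere:

    from dataeval.detectors.linters.outliers import Outliers

    outliers = Outliers()
    per_image = outliers.from_stats(stats_a)                # OutliersOutput[IndexIssueMap]
    per_dataset = outliers.from_stats([stats_a, stats_b])   # OutliersOutput[list[IndexIssueMap]]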
dataeval/detectors/ood/__init__.py
@@ -5,11 +5,11 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from dataeval._internal.detectors.ood.ae import OOD_AE
-    from dataeval._internal.detectors.ood.aegmm import OOD_AEGMM
-    from dataeval._internal.detectors.ood.base import OODOutput, OODScoreOutput
-    from dataeval._internal.detectors.ood.llr import OOD_LLR
-    from dataeval._internal.detectors.ood.vae import OOD_VAE
-    from dataeval._internal.detectors.ood.vaegmm import OOD_VAEGMM
+    from dataeval.detectors.ood.ae import OOD_AE
+    from dataeval.detectors.ood.aegmm import OOD_AEGMM
+    from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+    from dataeval.detectors.ood.llr import OOD_LLR
+    from dataeval.detectors.ood.vae import OOD_VAE
+    from dataeval.detectors.ood.vaegmm import OOD_VAEGMM
 
 __all__ = ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM", "OODOutput", "OODScoreOutput"]
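
The OOD package keeps its import guard: the detector classes are only defined when TensorFlow is installed, while `__all__` is declared unconditionally. Downstream code that treats TensorFlow as optional can mirror this with a plain import check (a sketch; `_IS_TENSORFLOW_AVAILABLE` is a private flag, so the try/except form is the safer pattern for consumers):

    try:
        from dataeval.detectors.ood import OOD_AE
    except ImportError:  # TensorFlow extras not installed
        OOD_AE = None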
dataeval/{_internal/detectors → detectors}/ood/ae.py
@@ -8,6 +8,8 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["OOD_AE"]
+
 from typing import Callable
 
 import numpy as np
@@ -15,11 +17,10 @@ import tensorflow as tf
 import tf_keras as keras
 from numpy.typing import ArrayLike
 
-from dataeval._internal.detectors.ood.base import OODBase, OODScoreOutput
-from dataeval._internal.interop import as_numpy
-from dataeval._internal.models.tensorflow.autoencoder import AE
-from dataeval._internal.models.tensorflow.utils import predict_batch
-from dataeval._internal.output import set_metadata
+from dataeval.detectors.ood.base import OODBase, OODScoreOutput
+from dataeval.interop import as_numpy
+from dataeval.utils.tensorflow._internal.autoencoder import AE
+from dataeval.utils.tensorflow._internal.utils import predict_batch
 
 
 class OOD_AE(OODBase):
@@ -49,8 +50,7 @@ class OOD_AE(OODBase):
             loss_fn = keras.losses.MeanSquaredError()
         super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    @set_metadata("dataeval.detectors")
-    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := as_numpy(X))
 
         # reconstruct instances
dataeval/{_internal/detectors → detectors}/ood/aegmm.py
@@ -8,19 +8,20 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["OOD_AEGMM"]
+
 from typing import Callable
 
 import tensorflow as tf
 import tf_keras as keras
 from numpy.typing import ArrayLike
 
-from dataeval._internal.detectors.ood.base import OODGMMBase, OODScoreOutput
-from dataeval._internal.interop import to_numpy
-from dataeval._internal.models.tensorflow.autoencoder import AEGMM
-from dataeval._internal.models.tensorflow.gmm import gmm_energy
-from dataeval._internal.models.tensorflow.losses import LossGMM
-from dataeval._internal.models.tensorflow.utils import predict_batch
-from dataeval._internal.output import set_metadata
+from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
+from dataeval.interop import to_numpy
+from dataeval.utils.tensorflow._internal.autoencoder import AEGMM
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy
+from dataeval.utils.tensorflow._internal.loss import LossGMM
+from dataeval.utils.tensorflow._internal.utils import predict_batch
 
 
 class OOD_AEGMM(OODGMMBase):
@@ -50,28 +51,7 @@ class OOD_AEGMM(OODGMMBase):
             loss_fn = LossGMM()
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-    @set_metadata("dataeval.detectors")
-    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
-        """
-        Compute the :term:`out of distribution<Out-of-distribution (OOD)>` score for a given dataset.
-
-        Parameters
-        ----------
-        X : ArrayLike
-            Input data to score.
-        batch_size : int, default 1e10
-            Number of instances to process in each batch.
-            Use a smaller batch size if your dataset is large or if you encounter memory issues.
-
-        Returns
-        -------
-        OODScoreOutput
-            An object containing the instance-level OOD score.
-
-        Note
-        ----
-        This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
-        """
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := to_numpy(X))
         _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
         energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
dataeval/{_internal/detectors → detectors}/ood/base.py
@@ -8,6 +8,8 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["OODOutput", "OODScoreOutput"]
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Callable, Literal, cast
@@ -17,10 +19,10 @@ import tensorflow as tf
 import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval._internal.interop import to_numpy
-from dataeval._internal.models.tensorflow.gmm import GaussianMixtureModelParams, gmm_params
-from dataeval._internal.models.tensorflow.trainer import trainer
-from dataeval._internal.output import OutputMetadata, set_metadata
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
+from dataeval.utils.tensorflow._internal.trainer import trainer
 
 
 @dataclass(frozen=True)
@@ -61,7 +63,7 @@ class OODScoreOutput(OutputMetadata):
     instance_score: NDArray[np.float32]
     feature_score: NDArray[np.float32] | None = None
 
-    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
+    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
        """
        Returns either the instance or feature score
 
@@ -107,6 +109,9 @@ class OODBase(ABC):
         self._validate(X)
 
     @abstractmethod
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput: ...
+
+    @set_metadata()
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         """
         Compute the :term:`out of distribution<Out-of-distribution (OOD)>` scores for a given dataset.
@@ -124,6 +129,7 @@
         OODScoreOutput
             An object containing the instance-level and feature-level OOD scores.
         """
+        return self._score(X, batch_size)
 
     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
         return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
@@ -131,12 +137,12 @@
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float = 100.0,
-        loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
-        epochs: int = 20,
-        batch_size: int = 64,
-        verbose: bool = True,
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
     ) -> None:
        """
        Train the model and infer the threshold value.
@@ -174,7 +180,7 @@
         self._ref_score = self.score(x_ref, batch_size)
         self._threshold_perc = threshold_perc
 
-    @set_metadata("dataeval.detectors")
+    @set_metadata()
    def predict(
        self,
        X: ArrayLike,
@@ -218,12 +224,12 @@ class OODGMMBase(OODBase):
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float = 100.0,
-        loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
-        epochs: int = 20,
-        batch_size: int = 64,
-        verbose: bool = True,
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
    ) -> None:
        # Train the model
        trainer(
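
Taken together with the ae.py and aegmm.py hunks above, this is a template-method refactor: concrete detectors now implement `_score`, while `OODBase.score` owns the public docstring and the `@set_metadata()` decoration and simply delegates. The base `fit` also loses its defaults, pushing default handling into the concrete detectors (as `OOD_AE.fit` does with `keras.losses.MeanSquaredError()`). A minimal sketch of the new subclass contract; the class name and scoring logic are invented for illustration:

    import numpy as np
    from numpy.typing import ArrayLike

    from dataeval.detectors.ood.base import OODBase, OODScoreOutput
    from dataeval.interop import as_numpy

    class OOD_Example(OODBase):
        def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
            # toy instance-level score: distance from the feature-wise mean
            flat = as_numpy(X).reshape(len(X), -1).astype(np.float32)
            instance_score = np.linalg.norm(flat - flat.mean(axis=0), axis=1)
            return OODScoreOutput(instance_score.astype(np.float32))

    # the inherited score() wraps _score() and records metadata via @set_metadata()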