dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
- dataeval/detectors/ood/aegmm.py +66 -0
- dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
- dataeval/detectors/ood/vaegmm.py +75 -0
- dataeval/interop.py +56 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
- dataeval/metrics/bias/metadata.py +358 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
- dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
- dataeval-0.73.0.dist-info/RECORD +73 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/aegmm.py +0 -78
- dataeval/_internal/detectors/ood/vaegmm.py +0 -89
- dataeval/_internal/interop.py +0 -49
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
dataeval/{_internal/detectors → detectors/linters}/clusterer.py:

```diff
@@ -1,16 +1,18 @@
 from __future__ import annotations
 
+__all__ = ["ClustererOutput", "Clusterer"]
+
 from dataclasses import dataclass
-from typing import Iterable, NamedTuple, cast
+from typing import Any, Iterable, NamedTuple, cast
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.cluster.hierarchy import linkage
 from scipy.spatial.distance import pdist, squareform
 
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.shared import flatten
 
 
 @dataclass(frozen=True)
@@ -36,7 +38,7 @@ class ClustererOutput(OutputMetadata):
     potential_duplicates: list[list[int]]
 
 
-def extend_linkage(link_arr: NDArray) -> NDArray:
+def _extend_linkage(link_arr: NDArray) -> NDArray:
     """
     Adds a column to the linkage matrix link_arr that tracks the new id assigned
     to each row
@@ -60,10 +62,10 @@ def extend_linkage(link_arr: NDArray) -> NDArray:
     return arr
 
 
-class Cluster:
+class _Cluster:
     __slots__ = "merged", "samples", "sample_dist", "is_copy", "count", "dist_avg", "dist_std", "out1", "out2"
 
-    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False):
+    def __init__(self, merged: int, samples: NDArray, sample_dist: float | NDArray, is_copy: bool = False) -> None:
         self.merged = merged
         self.samples = np.array(samples, dtype=np.int32)
         self.sample_dist = np.array([sample_dist] if np.isscalar(sample_dist) else sample_dist)
@@ -85,8 +87,8 @@ class Cluster:
         self.out1 = dist > out1
         self.out2 = dist > out2
 
-    def copy(self) ->
-        return
+    def copy(self) -> _Cluster:
+        return _Cluster(False, self.samples, self.sample_dist, True)
 
     def __repr__(self) -> str:
         _params = {
@@ -98,32 +100,32 @@ class Cluster:
         return f"{self.__class__.__name__}(**{repr(_params)})"
 
 
-class
-    def __init__(self, *args,
-        super().__init__(*args
+class _Clusters(dict[int, dict[int, _Cluster]]):
+    def __init__(self, *args: dict[int, dict[int, _Cluster]]) -> None:
+        super().__init__(*args)
         self.max_level: int = 1
 
 
-class
+class _ClusterPosition(NamedTuple):
     """Keeps track of a cluster's level and ID"""
 
     level: int
     cid: int
 
 
-class
+class _ClusterMergeEntry:
     __slots__ = "level", "outer_cluster", "inner_cluster", "status"
 
-    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int):
+    def __init__(self, level: int, outer_cluster: int, inner_cluster: int, status: int) -> None:
        self.level = level
        self.outer_cluster = outer_cluster
        self.inner_cluster = inner_cluster
        self.status = status
 
-    def __lt__(self, value:
+    def __lt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__lt__(value.level)
 
-    def __gt__(self, value:
+    def __gt__(self, value: _ClusterMergeEntry) -> bool:
         return self.level.__gt__(value.level)
 
 
@@ -153,36 +155,36 @@ class Clusterer:
     >>> cluster = Clusterer(dataset)
     """
 
-    def __init__(self, dataset: ArrayLike):
+    def __init__(self, dataset: ArrayLike) -> None:
         # Allows an update to dataset to reset the state rather than instantiate a new class
         self._on_init(dataset)
 
     def _on_init(self, dataset: ArrayLike):
-        self._data: NDArray = flatten(to_numpy(dataset))
+        self._data: NDArray[Any] = flatten(to_numpy(dataset))
         self._validate_data(self._data)
         self._num_samples = len(self._data)
 
-        self._darr: NDArray = pdist(self._data, metric="euclidean")
-        self._sqdmat: NDArray = squareform(self._darr)
-        self._larr: NDArray =
+        self._darr: NDArray[np.floating[Any]] = pdist(self._data, metric="euclidean")
+        self._sqdmat: NDArray[np.floating[Any]] = squareform(self._darr)
+        self._larr: NDArray[np.floating[Any]] = _extend_linkage(linkage(self._darr))
         self._max_clusters: int = np.count_nonzero(self._larr[:, 3] == 2)
 
         min_num = int(self._num_samples * 0.05)
-        self._min_num_samples_per_cluster = min(max(2, min_num), 100)
+        self._min_num_samples_per_cluster: int = min(max(2, min_num), 100)
 
-        self._clusters = None
-        self._last_good_merge_levels = None
+        self._clusters: _Clusters | None = None
+        self._last_good_merge_levels: dict[int, int] | None = None
 
     @property
-    def data(self) -> NDArray:
+    def data(self) -> NDArray[Any]:
         return self._data
 
     @data.setter
-    def data(self, x: ArrayLike):
+    def data(self, x: ArrayLike) -> None:
         self._on_init(x)
 
     @property
-    def clusters(self) ->
+    def clusters(self) -> _Clusters:
         if self._clusters is None:
             self._clusters = self._create_clusters()
         return self._clusters
@@ -209,11 +211,11 @@ class Clusterer:
         if features < 1:
             raise ValueError(f"Samples should have at least 1 feature; got {features}")
 
-    def _create_clusters(self) ->
+    def _create_clusters(self) -> _Clusters:
         """Generates clusters based on linkage matrix"""
         next_cluster_id = 0
-        cluster_map: dict[int,
-        clusters:
+        cluster_map: dict[int, _ClusterPosition] = {}  # Dictionary to associate new cluster ids with actual clusters
+        clusters: _Clusters = _Clusters()
 
         # Walking through the linkage array to generate clusters
         for arr_i in self._larr:
@@ -240,7 +242,7 @@ class Clusterer:
                 # Update clusters to include previously skipped levels
                 clusters = self._fill_levels(clusters, left, right)
             elif left or right:
-                child, other_id = cast(tuple[
+                child, other_id = cast(tuple[_ClusterPosition, int], (left, right_id) if left else (right, left_id))
                 cc = clusters[child.level][child.cid]
                 samples = np.concatenate([cc.samples, [other_id]])
                 sample_dist = np.concatenate([cc.sample_dist, sample_dist])
@@ -254,12 +256,12 @@ class Clusterer:
             if level not in clusters:
                 clusters[level] = {}
 
-            clusters[level][cid] =
-            cluster_map[int(arr_i[-1])] =
+            clusters[level][cid] = _Cluster(merged, samples, sample_dist)
+            cluster_map[int(arr_i[-1])] = _ClusterPosition(level, cid)
 
         return clusters
 
-    def _fill_levels(self, clusters:
+    def _fill_levels(self, clusters: _Clusters, left: _ClusterPosition, right: _ClusterPosition) -> _Clusters:
         # Sets each level's cluster info if it does not exist
         if left.level != right.level:
             (level, cid), max_level = (left, right[0]) if left[0] < right[0] else (right, left[0])
@@ -312,7 +314,7 @@ class Clusterer:
         mask2 = mask2_vals < one_std_check
         return np.logical_or(desired_merge, mask2)
 
-    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[
+    def _generate_merge_list(self, cluster_matrix: NDArray) -> list[_ClusterMergeEntry]:
         """
         Runs through the clusters dictionary determining when clusters merge,
         and how close are those clusters when they merge.
@@ -329,7 +331,7 @@ class Clusterer:
         """
         intra_max = []
         merge_mean = []
-        merge_list: list[
+        merge_list: list[_ClusterMergeEntry] = []
 
         for level, cluster_set in self.clusters.items():
             for outer_cluster, cluster in cluster_set.items():
@@ -356,7 +358,7 @@ class Clusterer:
                 # Calculate the corresponding distance stats
                 distance_stats_arr = aggregate_func(distances)
                 merge_mean.append(distance_stats_arr)
-                merge_list.append(
+                merge_list.append(_ClusterMergeEntry(level, outer_cluster, inner_cluster, 0))
 
         all_merge_indices = self._calc_merge_indices(merge_mean=merge_mean, intra_max=intra_max)
 
@@ -493,7 +495,7 @@ class Clusterer:
         return exact_dupes, near_dupes
 
     # TODO: Move data input to evaluate from class
-    @set_metadata(
+    @set_metadata(["data"])
     def evaluate(self) -> ClustererOutput:
         """Finds and flags indices of the data for Outliers and :term:`duplicates<Duplicates>`
 
```
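The renamed `_extend_linkage` keeps its documented behavior: it appends a column to the SciPy linkage matrix recording the id of the cluster each merge row creates. As a rough sketch of the idea, not the package's exact implementation: `linkage()` over n samples produces n - 1 merge rows, and SciPy numbers the cluster created by row i as n + i, so the extra column follows directly from the row index.

```python
import numpy as np
from numpy.typing import NDArray
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist


def extend_linkage_sketch(link_arr: NDArray) -> NDArray:
    # linkage() on n samples yields n - 1 merge rows; SciPy assigns the
    # cluster created by row i the id n + i, so the new column is just
    # the shifted row index.
    n = len(link_arr) + 1
    ids = np.arange(n, n + len(link_arr)).reshape(-1, 1)
    return np.hstack([link_arr, ids])


data = np.random.default_rng(0).normal(size=(10, 3))
larr = extend_linkage_sketch(linkage(pdist(data)))
print(larr.shape)  # (9, 5): the four standard linkage columns plus the id column
```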
dataeval/{_internal/detectors → detectors/linters}/duplicates.py:

```diff
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
+__all__ = ["DuplicatesOutput", "Duplicates"]
+
 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar
+from typing import Generic, Iterable, Sequence, TypeVar, overload
 
 from numpy.typing import ArrayLike
 
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
+from dataeval.output import OutputMetadata, set_metadata
 
 DuplicateGroup = list[int]
 DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
@@ -58,7 +60,7 @@ class Duplicates:
     >>> exact_dupes = Duplicates(only_exact=True)
     """
 
-    def __init__(self, only_exact: bool = False):
+    def __init__(self, only_exact: bool = False) -> None:
         self.stats: HashStatsOutput
         self.only_exact = only_exact
 
@@ -81,8 +83,16 @@ class Duplicates:
             "near": sorted(near),
         }
 
-    @
-    def from_stats(self, hashes: HashStatsOutput
+    @overload
+    def from_stats(self, hashes: HashStatsOutput) -> DuplicatesOutput[DuplicateGroup]: ...
+
+    @overload
+    def from_stats(self, hashes: Sequence[HashStatsOutput]) -> DuplicatesOutput[DatasetDuplicateGroupMap]: ...
+
+    @set_metadata(["only_exact"])
+    def from_stats(
+        self, hashes: HashStatsOutput | Sequence[HashStatsOutput]
+    ) -> DuplicatesOutput[DuplicateGroup] | DuplicatesOutput[DatasetDuplicateGroupMap]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
@@ -128,8 +138,8 @@ class Duplicates:
 
         return DuplicatesOutput(**duplicates)
 
-    @set_metadata(
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+    @set_metadata(["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
```
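The `from_stats` rewrite is the standard `typing.overload` pattern: two stub signatures give the type checker precise return types (a single `HashStatsOutput` maps to `DuplicatesOutput[DuplicateGroup]`, a sequence to `DuplicatesOutput[DatasetDuplicateGroupMap]`), while one runtime implementation handles both. A minimal, self-contained sketch of the pattern with illustrative names, not dataeval's:

```python
from __future__ import annotations

from typing import Sequence, overload


@overload
def summarize(x: int) -> str: ...
@overload
def summarize(x: Sequence[int]) -> list[str]: ...


def summarize(x: int | Sequence[int]) -> str | list[str]:
    # Only this implementation exists at runtime; the stubs above let a
    # type checker narrow the return type per call site.
    if isinstance(x, int):
        return f"value={x}"
    return [f"value={v}" for v in x]


print(summarize(3))       # a checker sees str here
print(summarize([1, 2]))  # and list[str] here
```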
dataeval/{_internal/detectors → detectors/linters}/merged_stats.py:

```diff
@@ -1,11 +1,13 @@
 from __future__ import annotations
 
+__all__ = []
+
 from copy import deepcopy
 from typing import Sequence, TypeVar
 
 import numpy as np
 
-from dataeval.
+from dataeval.metrics.stats.base import BaseStatsOutput
 
 TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
 
```
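The `TypeVar(..., bound=BaseStatsOutput)` is what lets `combine_stats` report that combining, say, pixel-stats outputs yields the same concrete output type rather than the base class. A generic sketch of why the bound matters, with illustrative classes rather than dataeval's:

```python
from typing import TypeVar


class BaseOutput: ...
class PixelOutput(BaseOutput): ...


T = TypeVar("T", bound=BaseOutput)


def combine(parts: list[T]) -> T:
    # With a bound TypeVar the checker infers combine(list[PixelOutput])
    # as PixelOutput, while still restricting T to BaseOutput subclasses.
    return parts[0]


first: PixelOutput = combine([PixelOutput(), PixelOutput()])
```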
dataeval/{_internal/detectors → detectors/linters}/outliers.py:

```diff
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
+__all__ = ["OutliersOutput", "Outliers"]
+
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
+from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval.metrics.stats.visualstats import VisualStatsOutput
+from dataeval.output import OutputMetadata, set_metadata
 
 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
@@ -37,7 +39,7 @@ class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
 
     issues: TIndexIssueMap
 
-    def __len__(self):
+    def __len__(self) -> int:
         if isinstance(self.issues, dict):
             return len(self.issues)
         else:
@@ -157,10 +159,10 @@ class Outliers:
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
 
-    @set_metadata(
+    @set_metadata(["outlier_method", "outlier_threshold"])
     def from_stats(
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
-    ) -> OutliersOutput:
+    ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
         Returns indices of Outliers with the issues identified for each
 
@@ -195,7 +197,7 @@ class Outliers:
         {}
         """  # noqa: E501
         if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats.
+            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
             return OutliersOutput(outliers)
 
         if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
@@ -226,16 +228,7 @@ class Outliers:
 
         return OutliersOutput(output_list)
 
-    @set_metadata(
-        "dataeval.detectors",
-        [
-            "use_dimension",
-            "use_pixel",
-            "use_visual",
-            "outlier_method",
-            "outlier_threshold",
-        ],
-    )
+    @set_metadata(["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
     def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each
```
dataeval/detectors/ood/__init__.py:

```diff
@@ -5,11 +5,11 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da
 from dataeval import _IS_TENSORFLOW_AVAILABLE
 
 if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
-    from dataeval.
+    from dataeval.detectors.ood.ae import OOD_AE
+    from dataeval.detectors.ood.aegmm import OOD_AEGMM
+    from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
+    from dataeval.detectors.ood.llr import OOD_LLR
+    from dataeval.detectors.ood.vae import OOD_VAE
+    from dataeval.detectors.ood.vaegmm import OOD_VAEGMM
 
 __all__ = ["OOD_AE", "OOD_AEGMM", "OOD_LLR", "OOD_VAE", "OOD_VAEGMM", "OODOutput", "OODScoreOutput"]
```
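The guarded block keys off `dataeval._IS_TENSORFLOW_AVAILABLE` so the OOD detectors only import when the optional TensorFlow extra is installed. How the flag is computed is not part of this diff; a common approach, shown purely as an assumption:

```python
# Hypothetical sketch: probe for the optional dependency without importing it.
from importlib.util import find_spec

_IS_TENSORFLOW_AVAILABLE: bool = find_spec("tensorflow") is not None
```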
dataeval/{_internal/detectors → detectors}/ood/ae.py:

```diff
@@ -8,18 +8,27 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
-
+__all__ = ["OOD_AE"]
+
+from typing import TYPE_CHECKING, Callable
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval.
-
+from dataeval.detectors.ood.base import OODBase, OODScoreOutput
+from dataeval.interop import as_numpy
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
 
 
 class OOD_AE(OODBase):
@@ -32,7 +41,7 @@ class OOD_AE(OODBase):
         An :term:`autoencoder<Autoencoder>` model.
     """
 
-    def __init__(self, model: AE) -> None:
+    def __init__(self, model: tf_models.AE) -> None:
         super().__init__(model)
 
     def fit(
@@ -40,7 +49,7 @@ class OOD_AE(OODBase):
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
         loss_fn: Callable[..., tf.Tensor] | None = None,
-        optimizer: keras.optimizers.Optimizer =
+        optimizer: keras.optimizers.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
@@ -49,8 +58,7 @@ class OOD_AE(OODBase):
             loss_fn = keras.losses.MeanSquaredError()
         super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
-
-    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         self._validate(X := as_numpy(X))
 
         # reconstruct instances
```
dataeval/detectors/ood/aegmm.py (new file):

```diff
@@ -0,0 +1,66 @@
+"""
+Source code derived from Alibi-Detect 0.11.4
+https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+Original code Copyright (c) 2023 Seldon Technologies Ltd
+Licensed under Apache Software License (Apache 2.0)
+"""
+
+from __future__ import annotations
+
+__all__ = ["OOD_AEGMM"]
+
+from typing import TYPE_CHECKING, Callable
+
+from numpy.typing import ArrayLike
+
+from dataeval.detectors.ood.base import OODGMMBase, OODScoreOutput
+from dataeval.interop import to_numpy
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.gmm import gmm_energy
+from dataeval.utils.tensorflow._internal.loss import LossGMM
+from dataeval.utils.tensorflow._internal.utils import predict_batch
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+
+    import dataeval.utils.tensorflow._internal.models as tf_models
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
+    tf_models = lazyload("dataeval.utils.tensorflow._internal.models")
+
+
+class OOD_AEGMM(OODGMMBase):
+    """
+    AE with Gaussian Mixture Model based outlier detector.
+
+    Parameters
+    ----------
+    model : AEGMM
+        An AEGMM model.
+    """
+
+    def __init__(self, model: tf_models.AEGMM) -> None:
+        super().__init__(model)
+
+    def fit(
+        self,
+        x_ref: ArrayLike,
+        threshold_perc: float = 100.0,
+        loss_fn: Callable[..., tf.Tensor] | None = None,
+        optimizer: keras.optimizers.Optimizer | None = None,
+        epochs: int = 20,
+        batch_size: int = 64,
+        verbose: bool = True,
+    ) -> None:
+        if loss_fn is None:
+            loss_fn = LossGMM()
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
+        self._validate(X := to_numpy(X))
+        _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
+        energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+        return OODScoreOutput(energy.numpy())  # type: ignore
```
dataeval/{_internal/detectors → detectors}/ood/base.py:

```diff
@@ -8,19 +8,27 @@ Licensed under Apache Software License (Apache 2.0)
 
 from __future__ import annotations
 
+__all__ = ["OODOutput", "OODScoreOutput"]
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Callable, Literal, cast
+from typing import TYPE_CHECKING, Callable, Literal, cast
 
 import numpy as np
-import tensorflow as tf
-import tf_keras as keras
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.
-from dataeval.
-from dataeval.
-from dataeval._internal.
+from dataeval.interop import to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.lazy import lazyload
+from dataeval.utils.tensorflow._internal.gmm import GaussianMixtureModelParams, gmm_params
+from dataeval.utils.tensorflow._internal.trainer import trainer
+
+if TYPE_CHECKING:
+    import tensorflow as tf
+    import tf_keras as keras
+else:
+    tf = lazyload("tensorflow")
+    keras = lazyload("tf_keras")
 
 
 @dataclass(frozen=True)
@@ -61,7 +69,7 @@ class OODScoreOutput(OutputMetadata):
     instance_score: NDArray[np.float32]
     feature_score: NDArray[np.float32] | None = None
 
-    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray:
+    def get(self, ood_type: Literal["instance", "feature"]) -> NDArray[np.float32]:
         """
         Returns either the instance or feature score
 
@@ -107,6 +115,9 @@ class OODBase(ABC):
         self._validate(X)
 
     @abstractmethod
+    def _score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput: ...
+
+    @set_metadata()
     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
         """
         Compute the :term:`out of distribution<Out-of-distribution (OOD)>` scores for a given dataset.
@@ -124,6 +135,7 @@ class OODBase(ABC):
         OODScoreOutput
             An object containing the instance-level and feature-level OOD scores.
         """
+        return self._score(X, batch_size)
 
     def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
         return np.percentile(self._ref_score.get(ood_type), self._threshold_perc)
@@ -131,12 +143,12 @@ class OODBase(ABC):
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float
-        loss_fn: Callable[..., tf.Tensor]
-        optimizer: keras.optimizers.Optimizer
-        epochs: int
-        batch_size: int
-        verbose: bool
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
     ) -> None:
         """
         Train the model and infer the threshold value.
@@ -174,7 +186,7 @@ class OODBase(ABC):
         self._ref_score = self.score(x_ref, batch_size)
         self._threshold_perc = threshold_perc
 
-    @set_metadata(
+    @set_metadata()
     def predict(
         self,
         X: ArrayLike,
@@ -218,12 +230,12 @@ class OODGMMBase(OODBase):
     def fit(
         self,
         x_ref: ArrayLike,
-        threshold_perc: float
-        loss_fn: Callable[..., tf.Tensor]
-        optimizer: keras.optimizers.Optimizer
-        epochs: int
-        batch_size: int
-        verbose: bool
+        threshold_perc: float,
+        loss_fn: Callable[..., tf.Tensor],
+        optimizer: keras.optimizers.Optimizer,
+        epochs: int,
+        batch_size: int,
+        verbose: bool,
     ) -> None:
         # Train the model
         trainer(
```