dataeval 0.69.4__py3-none-any.whl → 0.70.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/_internal/detectors/drift/base.py +5 -6
- dataeval/_internal/detectors/drift/mmd.py +3 -3
- dataeval/_internal/detectors/duplicates.py +62 -45
- dataeval/_internal/detectors/merged_stats.py +23 -54
- dataeval/_internal/detectors/ood/ae.py +3 -3
- dataeval/_internal/detectors/outliers.py +133 -61
- dataeval/_internal/interop.py +11 -7
- dataeval/_internal/metrics/balance.py +9 -9
- dataeval/_internal/metrics/ber.py +3 -3
- dataeval/_internal/metrics/divergence.py +3 -3
- dataeval/_internal/metrics/diversity.py +6 -6
- dataeval/_internal/metrics/parity.py +24 -16
- dataeval/_internal/metrics/stats/base.py +231 -0
- dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
- dataeval/_internal/metrics/stats/datasetstats.py +97 -0
- dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
- dataeval/_internal/metrics/stats/hashstats.py +73 -0
- dataeval/_internal/metrics/stats/labelstats.py +125 -0
- dataeval/_internal/metrics/stats/pixelstats.py +117 -0
- dataeval/_internal/metrics/stats/visualstats.py +122 -0
- dataeval/_internal/metrics/uap.py +2 -2
- dataeval/_internal/metrics/utils.py +28 -13
- dataeval/_internal/output.py +3 -18
- dataeval/_internal/workflows/sufficiency.py +123 -133
- dataeval/metrics/stats/__init__.py +14 -3
- dataeval/workflows/__init__.py +2 -2
- {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/METADATA +3 -3
- {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/RECORD +31 -26
- {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/WHEEL +1 -1
- dataeval/_internal/flags.py +0 -77
- dataeval/_internal/metrics/stats.py +0 -397
- dataeval/flags/__init__.py +0 -3
- {dataeval-0.69.4.dist-info → dataeval-0.70.0.dist-info}/LICENSE.txt +0 -0
dataeval/__init__.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
__version__ = "0.
|
1
|
+
__version__ = "0.70.0"
|
2
2
|
|
3
3
|
from importlib.util import find_spec
|
4
4
|
|
@@ -7,9 +7,9 @@ _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("te
|
|
7
7
|
|
8
8
|
del find_spec
|
9
9
|
|
10
|
-
from . import detectors,
|
10
|
+
from . import detectors, metrics # noqa: E402
|
11
11
|
|
12
|
-
__all__ = ["detectors", "
|
12
|
+
__all__ = ["detectors", "metrics"]
|
13
13
|
|
14
14
|
if _IS_TORCH_AVAILABLE: # pragma: no cover
|
15
15
|
from . import torch, utils, workflows
|
@@ -16,7 +16,7 @@ from typing import Callable, Literal
|
|
16
16
|
import numpy as np
|
17
17
|
from numpy.typing import ArrayLike, NDArray
|
18
18
|
|
19
|
-
from dataeval._internal.interop import to_numpy
|
19
|
+
from dataeval._internal.interop import as_numpy, to_numpy
|
20
20
|
from dataeval._internal.output import OutputMetadata, set_metadata
|
21
21
|
|
22
22
|
|
@@ -234,7 +234,7 @@ class BaseDrift:
|
|
234
234
|
if correction not in ["bonferroni", "fdr"]:
|
235
235
|
raise ValueError("`correction` must be `bonferroni` or `fdr`.")
|
236
236
|
|
237
|
-
self._x_ref = x_ref
|
237
|
+
self._x_ref = to_numpy(x_ref)
|
238
238
|
self.x_ref_preprocessed = x_ref_preprocessed
|
239
239
|
|
240
240
|
# Other attributes
|
@@ -242,7 +242,7 @@ class BaseDrift:
|
|
242
242
|
self.update_x_ref = update_x_ref
|
243
243
|
self.preprocess_fn = preprocess_fn
|
244
244
|
self.correction = correction
|
245
|
-
self.n = len(self._x_ref)
|
245
|
+
self.n = len(self._x_ref)
|
246
246
|
|
247
247
|
# Ref counter for preprocessed x
|
248
248
|
self._x_refcount = 0
|
@@ -260,9 +260,8 @@ class BaseDrift:
|
|
260
260
|
if not self.x_ref_preprocessed:
|
261
261
|
self.x_ref_preprocessed = True
|
262
262
|
if self.preprocess_fn is not None:
|
263
|
-
self._x_ref = self.preprocess_fn(self._x_ref)
|
263
|
+
self._x_ref = as_numpy(self.preprocess_fn(self._x_ref))
|
264
264
|
|
265
|
-
self._x_ref = to_numpy(self._x_ref)
|
266
265
|
return self._x_ref
|
267
266
|
|
268
267
|
def _preprocess(self, x: ArrayLike) -> ArrayLike:
|
@@ -380,7 +379,7 @@ class BaseDriftUnivariate(BaseDrift):
|
|
380
379
|
self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
|
381
380
|
else:
|
382
381
|
# infer number of features after applying preprocessing step
|
383
|
-
x =
|
382
|
+
x = as_numpy(self.preprocess_fn(self._x_ref[0:1])) # type: ignore
|
384
383
|
self._n_features = x.reshape(x.shape[0], -1).shape[-1]
|
385
384
|
|
386
385
|
return self._n_features
|
@@ -14,7 +14,7 @@ from typing import Callable
|
|
14
14
|
import torch
|
15
15
|
from numpy.typing import ArrayLike
|
16
16
|
|
17
|
-
from dataeval._internal.interop import
|
17
|
+
from dataeval._internal.interop import as_numpy
|
18
18
|
from dataeval._internal.output import set_metadata
|
19
19
|
|
20
20
|
from .base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
|
@@ -110,7 +110,7 @@ class DriftMMD(BaseDrift):
|
|
110
110
|
self.device = get_device(device)
|
111
111
|
|
112
112
|
# initialize kernel
|
113
|
-
sigma_tensor = torch.from_numpy(
|
113
|
+
sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
|
114
114
|
self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel
|
115
115
|
|
116
116
|
# compute kernel matrix for the reference data
|
@@ -147,7 +147,7 @@ class DriftMMD(BaseDrift):
|
|
147
147
|
p-value obtained from the permutation test, MMD^2 between the reference and test set,
|
148
148
|
and MMD^2 threshold above which drift is flagged
|
149
149
|
"""
|
150
|
-
x =
|
150
|
+
x = as_numpy(x)
|
151
151
|
x_ref = torch.from_numpy(self.x_ref).to(self.device)
|
152
152
|
n = x.shape[0]
|
153
153
|
kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
|
@@ -1,13 +1,12 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from dataclasses import dataclass
|
4
|
-
from typing import Generic, Iterable, Sequence, TypeVar
|
4
|
+
from typing import Generic, Iterable, Sequence, TypeVar
|
5
5
|
|
6
6
|
from numpy.typing import ArrayLike
|
7
7
|
|
8
8
|
from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
|
9
|
-
from dataeval._internal.
|
10
|
-
from dataeval._internal.metrics.stats import StatsOutput, imagestats
|
9
|
+
from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
|
11
10
|
from dataeval._internal.output import OutputMetadata, set_metadata
|
12
11
|
|
13
12
|
DuplicateGroup = list[int]
|
@@ -53,26 +52,23 @@ class Duplicates:
|
|
53
52
|
-------
|
54
53
|
Initialize the Duplicates class:
|
55
54
|
|
56
|
-
>>>
|
55
|
+
>>> all_dupes = Duplicates()
|
56
|
+
>>> exact_dupes = Duplicates(only_exact=True)
|
57
57
|
"""
|
58
58
|
|
59
59
|
def __init__(self, only_exact: bool = False):
|
60
|
-
self.stats:
|
60
|
+
self.stats: HashStatsOutput
|
61
61
|
self.only_exact = only_exact
|
62
62
|
|
63
|
-
def _get_duplicates(self) -> dict[str, list[list[int]]]:
|
64
|
-
|
65
|
-
|
66
|
-
exact_dict
|
67
|
-
|
68
|
-
exact_dict.setdefault(value, []).append(i)
|
69
|
-
exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
|
70
|
-
else:
|
71
|
-
exact = []
|
63
|
+
def _get_duplicates(self, stats: dict) -> dict[str, list[list[int]]]:
|
64
|
+
exact_dict: dict[int, list] = {}
|
65
|
+
for i, value in enumerate(stats["xxhash"]):
|
66
|
+
exact_dict.setdefault(value, []).append(i)
|
67
|
+
exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
|
72
68
|
|
73
|
-
if
|
69
|
+
if not self.only_exact:
|
74
70
|
near_dict: dict[int, list] = {}
|
75
|
-
for i, value in enumerate(
|
71
|
+
for i, value in enumerate(stats["pchash"]):
|
76
72
|
near_dict.setdefault(value, []).append(i)
|
77
73
|
near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
|
78
74
|
else:
|
@@ -84,14 +80,14 @@ class Duplicates:
|
|
84
80
|
}
|
85
81
|
|
86
82
|
@set_metadata("dataeval.detectors", ["only_exact"])
|
87
|
-
def
|
83
|
+
def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
|
88
84
|
"""
|
89
85
|
Returns duplicate image indices for both exact matches and near matches
|
90
86
|
|
91
87
|
Parameters
|
92
88
|
----------
|
93
|
-
data :
|
94
|
-
|
89
|
+
data : HashStatsOutput | Sequence[HashStatsOutput]
|
90
|
+
The output(s) from a hashstats analysis
|
95
91
|
|
96
92
|
Returns
|
97
93
|
-------
|
@@ -100,39 +96,60 @@ class Duplicates:
|
|
100
96
|
|
101
97
|
See Also
|
102
98
|
--------
|
103
|
-
|
99
|
+
hashstats
|
104
100
|
|
105
101
|
Example
|
106
102
|
-------
|
107
|
-
>>>
|
108
|
-
DuplicatesOutput(exact=[[3, 20], [16
|
109
|
-
"""
|
103
|
+
>>> exact_dupes.from_stats([hashes1, hashes2])
|
104
|
+
DuplicatesOutput(exact=[{0: [3, 20]}, {0: [16], 1: [12]}], near=[])
|
105
|
+
"""
|
110
106
|
|
111
|
-
|
107
|
+
if isinstance(hashes, HashStatsOutput):
|
108
|
+
return DuplicatesOutput(**self._get_duplicates(hashes.dict()))
|
112
109
|
|
113
|
-
if isinstance(
|
114
|
-
|
115
|
-
raise ValueError("StatsOutput must include xxhash information of the images.")
|
116
|
-
if not self.only_exact and not stats.pchash:
|
117
|
-
raise ValueError("StatsOutput must include pchash information of the images for near matches.")
|
118
|
-
self.stats = stats
|
119
|
-
else:
|
120
|
-
flags = ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH)
|
121
|
-
self.stats = imagestats(cast(Iterable[ArrayLike], data), flags)
|
110
|
+
if not isinstance(hashes, Sequence):
|
111
|
+
raise TypeError("Invalid stats output type; only use output from hashstats.")
|
122
112
|
|
123
|
-
|
113
|
+
combined, dataset_steps = combine_stats(hashes)
|
114
|
+
duplicates = self._get_duplicates(combined.dict())
|
124
115
|
|
125
116
|
# split up results from combined dataset into individual dataset buckets
|
126
|
-
|
127
|
-
|
128
|
-
for
|
129
|
-
|
130
|
-
for
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
117
|
+
for dup_type, dup_list in duplicates.items():
|
118
|
+
dup_list_dict = []
|
119
|
+
for idxs in dup_list:
|
120
|
+
dup_dict = {}
|
121
|
+
for idx in idxs:
|
122
|
+
k, v = get_dataset_step_from_idx(idx, dataset_steps)
|
123
|
+
dup_dict.setdefault(k, []).append(v)
|
124
|
+
dup_list_dict.append(dup_dict)
|
125
|
+
duplicates[dup_type] = dup_list_dict
|
126
|
+
|
127
|
+
return DuplicatesOutput(**duplicates)
|
128
|
+
|
129
|
+
@set_metadata("dataeval.detectors", ["only_exact"])
|
130
|
+
def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
|
131
|
+
"""
|
132
|
+
Returns duplicate image indices for both exact matches and near matches
|
133
|
+
|
134
|
+
Parameters
|
135
|
+
----------
|
136
|
+
data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
|
137
|
+
A dataset of images in an ArrayLike format or the output(s) from a hashstats analysis
|
138
|
+
|
139
|
+
Returns
|
140
|
+
-------
|
141
|
+
DuplicatesOutput
|
142
|
+
List of groups of indices that are exact and near matches
|
137
143
|
|
144
|
+
See Also
|
145
|
+
--------
|
146
|
+
hashstats
|
147
|
+
|
148
|
+
Example
|
149
|
+
-------
|
150
|
+
>>> all_dupes.evaluate(images)
|
151
|
+
DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
|
152
|
+
""" # noqa: E501
|
153
|
+
self.stats = hashstats(data)
|
154
|
+
duplicates = self._get_duplicates(self.stats.dict())
|
138
155
|
return DuplicatesOutput(**duplicates)
|
@@ -1,71 +1,40 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from
|
4
|
-
from
|
3
|
+
from copy import deepcopy
|
4
|
+
from typing import Sequence, TypeVar
|
5
5
|
|
6
6
|
import numpy as np
|
7
7
|
|
8
|
-
from dataeval._internal.metrics.stats import
|
9
|
-
from dataeval._internal.output import populate_defaults
|
8
|
+
from dataeval._internal.metrics.stats.base import BaseStatsOutput
|
10
9
|
|
10
|
+
TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
|
11
11
|
|
12
|
-
def add_stats(a: StatsOutput, b: StatsOutput) -> StatsOutput:
|
13
|
-
if not isinstance(a, StatsOutput) or not isinstance(b, StatsOutput):
|
14
|
-
raise TypeError(f"Cannot add object of type {type(a)} and type {type(b)}.")
|
15
12
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
b_keys = set(b_dict)
|
13
|
+
def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
|
14
|
+
if type(a) is not type(b):
|
15
|
+
raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
|
20
16
|
|
21
|
-
|
22
|
-
if missing_keys:
|
23
|
-
raise ValueError(f"Required keys are missing: {missing_keys}.")
|
17
|
+
sum_dict = deepcopy(a.dict())
|
24
18
|
|
25
|
-
|
26
|
-
|
27
|
-
|
19
|
+
for k in sum_dict:
|
20
|
+
if isinstance(sum_dict[k], list):
|
21
|
+
sum_dict[k].extend(b.dict()[k])
|
22
|
+
else:
|
23
|
+
sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
|
28
24
|
|
29
|
-
|
30
|
-
if "ch_idx_map" in a_dict:
|
31
|
-
for k, v in a_dict.items():
|
32
|
-
if k == "ch_idx_map":
|
33
|
-
offset = sum([len(idxs) for idxs in v.values()])
|
34
|
-
for ch_k, ch_v in b_dict[k].items():
|
35
|
-
if ch_k not in v:
|
36
|
-
v[ch_k] = []
|
37
|
-
a_dict[k][ch_k].extend([idx + offset for idx in ch_v])
|
38
|
-
else:
|
39
|
-
for ch_k in b_dict[k]:
|
40
|
-
if ch_k not in v:
|
41
|
-
v[ch_k] = b_dict[k][ch_k]
|
42
|
-
else:
|
43
|
-
v[ch_k] = np.concatenate((v[ch_k], b_dict[k][ch_k]), axis=1)
|
44
|
-
else:
|
45
|
-
for k in a_dict:
|
46
|
-
if isinstance(a_dict[k], list):
|
47
|
-
a_dict[k].extend(b_dict[k])
|
48
|
-
else:
|
49
|
-
a_dict[k] = np.concatenate((a_dict[k], b_dict[k]))
|
25
|
+
return type(a)(**sum_dict)
|
50
26
|
|
51
|
-
return StatsOutput(**populate_defaults(a_dict, StatsOutput))
|
52
|
-
|
53
|
-
|
54
|
-
def combine_stats(stats) -> tuple[StatsOutput | None, list[int]]:
|
55
|
-
dataset_steps = []
|
56
|
-
|
57
|
-
if isinstance(stats, StatsOutput):
|
58
|
-
return stats, dataset_steps
|
59
27
|
|
28
|
+
def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
|
60
29
|
output = None
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
30
|
+
dataset_steps = []
|
31
|
+
cur_len = 0
|
32
|
+
for s in stats:
|
33
|
+
output = s if output is None else add_stats(output, s)
|
34
|
+
cur_len += len(s)
|
35
|
+
dataset_steps.append(cur_len)
|
36
|
+
if output is None:
|
37
|
+
raise TypeError("Cannot combine empty sequence of stats.")
|
69
38
|
return output, dataset_steps
|
70
39
|
|
71
40
|
|
@@ -16,7 +16,7 @@ import tensorflow as tf
|
|
16
16
|
from numpy.typing import ArrayLike
|
17
17
|
|
18
18
|
from dataeval._internal.detectors.ood.base import OODBase, OODScore
|
19
|
-
from dataeval._internal.interop import
|
19
|
+
from dataeval._internal.interop import as_numpy
|
20
20
|
from dataeval._internal.models.tensorflow.autoencoder import AE
|
21
21
|
from dataeval._internal.models.tensorflow.utils import predict_batch
|
22
22
|
|
@@ -46,10 +46,10 @@ class OOD_AE(OODBase):
|
|
46
46
|
) -> None:
|
47
47
|
if loss_fn is None:
|
48
48
|
loss_fn = keras.losses.MeanSquaredError()
|
49
|
-
super().fit(
|
49
|
+
super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
|
50
50
|
|
51
51
|
def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
|
52
|
-
self._validate(X :=
|
52
|
+
self._validate(X := as_numpy(X))
|
53
53
|
|
54
54
|
# reconstruct instances
|
55
55
|
X_recon = predict_batch(X, self.model, batch_size=batch_size)
|
@@ -1,39 +1,45 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from dataclasses import dataclass
|
4
|
-
from typing import Iterable, Literal, Sequence,
|
5
|
-
from warnings import warn
|
4
|
+
from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
|
6
5
|
|
7
6
|
import numpy as np
|
8
7
|
from numpy.typing import ArrayLike, NDArray
|
9
8
|
|
10
9
|
from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
|
11
|
-
from dataeval._internal.
|
12
|
-
from dataeval._internal.metrics.stats import
|
10
|
+
from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
|
11
|
+
from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
|
12
|
+
from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
|
13
|
+
from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
|
14
|
+
from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
|
13
15
|
from dataeval._internal.output import OutputMetadata, set_metadata
|
14
16
|
|
15
17
|
IndexIssueMap = dict[int, dict[str, float]]
|
16
|
-
|
17
|
-
""
|
18
|
-
Mapping of image indices to a dictionary of issue types and calculated values
|
19
|
-
"""
|
18
|
+
OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
19
|
+
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
|
20
20
|
|
21
21
|
|
22
22
|
@dataclass(frozen=True)
|
23
|
-
class OutliersOutput(OutputMetadata):
|
23
|
+
class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
|
24
24
|
"""
|
25
25
|
Attributes
|
26
26
|
----------
|
27
|
-
issues : dict[int, dict[str, float]] |
|
27
|
+
issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
|
28
28
|
Indices of image outliers with their associated issue type and calculated values.
|
29
29
|
|
30
30
|
- For a single dataset, a dictionary containing the indices of outliers and
|
31
31
|
a dictionary showing the issues and calculated values for the given index.
|
32
|
-
- For multiple
|
33
|
-
and their associated issues and calculated values.
|
32
|
+
- For multiple stats outputs, a list of dictionaries containing the indices of
|
33
|
+
outliers and their associated issues and calculated values.
|
34
34
|
"""
|
35
35
|
|
36
|
-
issues:
|
36
|
+
issues: TIndexIssueMap
|
37
|
+
|
38
|
+
def __len__(self):
|
39
|
+
if isinstance(self.issues, dict):
|
40
|
+
return len(self.issues)
|
41
|
+
else:
|
42
|
+
return sum(len(d) for d in self.issues)
|
37
43
|
|
38
44
|
|
39
45
|
def _get_outlier_mask(
|
@@ -43,7 +49,7 @@ def _get_outlier_mask(
|
|
43
49
|
threshold = threshold if threshold else 3.0
|
44
50
|
std = np.std(values)
|
45
51
|
abs_diff = np.abs(values - np.mean(values))
|
46
|
-
return (abs_diff / std) > threshold
|
52
|
+
return std != 0 and (abs_diff / std) > threshold
|
47
53
|
elif method == "modzscore":
|
48
54
|
threshold = threshold if threshold else 3.5
|
49
55
|
abs_diff = np.abs(values - np.median(values))
|
@@ -65,9 +71,6 @@ class Outliers:
|
|
65
71
|
|
66
72
|
Parameters
|
67
73
|
----------
|
68
|
-
flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
|
69
|
-
Metric(s) to calculate for each image - calculates all metrics if None
|
70
|
-
Only supports ImageStat.ALL_STATS
|
71
74
|
outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
|
72
75
|
Statistical method used to identify outliers
|
73
76
|
outlier_threshold : float, optional - default None
|
@@ -76,8 +79,8 @@ class Outliers:
|
|
76
79
|
|
77
80
|
Attributes
|
78
81
|
----------
|
79
|
-
stats :
|
80
|
-
|
82
|
+
stats : tuple[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
83
|
+
Various stats output classes that hold the value of each metric for each image
|
81
84
|
|
82
85
|
See Also
|
83
86
|
--------
|
@@ -109,52 +112,61 @@ class Outliers:
|
|
109
112
|
|
110
113
|
>>> outliers = Outliers()
|
111
114
|
|
112
|
-
Specifying specific metrics to analyze:
|
113
|
-
|
114
|
-
>>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)
|
115
|
-
|
116
115
|
Specifying an outlier method:
|
117
116
|
|
118
117
|
>>> outliers = Outliers(outlier_method="iqr")
|
119
118
|
|
120
119
|
Specifying an outlier method and threshold:
|
121
120
|
|
122
|
-
>>> outliers = Outliers(outlier_method="zscore", outlier_threshold=
|
121
|
+
>>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
|
123
122
|
"""
|
124
123
|
|
125
124
|
def __init__(
|
126
125
|
self,
|
127
|
-
|
126
|
+
use_dimension: bool = True,
|
127
|
+
use_pixel: bool = True,
|
128
|
+
use_visual: bool = True,
|
128
129
|
outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
|
129
130
|
outlier_threshold: float | None = None,
|
130
131
|
):
|
131
|
-
|
132
|
-
self.
|
132
|
+
self.stats: DatasetStatsOutput
|
133
|
+
self.use_dimension = use_dimension
|
134
|
+
self.use_pixel = use_pixel
|
135
|
+
self.use_visual = use_visual
|
133
136
|
self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
|
134
137
|
self.outlier_threshold = outlier_threshold
|
135
138
|
|
136
|
-
def _get_outliers(self) -> dict:
|
137
|
-
flagged_images = {}
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
if
|
142
|
-
mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
|
139
|
+
def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
|
140
|
+
flagged_images: dict[int, dict[str, float]] = {}
|
141
|
+
for stat, values in stats.items():
|
142
|
+
if stat in (SOURCE_INDEX, BOX_COUNT):
|
143
|
+
continue
|
144
|
+
if values.ndim == 1:
|
145
|
+
mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
|
143
146
|
indices = np.flatnonzero(mask)
|
144
147
|
for i, value in zip(indices, values[mask]):
|
145
|
-
flagged_images.setdefault(i, {}).update({stat:
|
148
|
+
flagged_images.setdefault(i, {}).update({stat: value})
|
146
149
|
|
147
150
|
return dict(sorted(flagged_images.items()))
|
148
151
|
|
149
|
-
@
|
150
|
-
def
|
152
|
+
@overload
|
153
|
+
def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
|
154
|
+
|
155
|
+
@overload
|
156
|
+
def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
|
157
|
+
|
158
|
+
@set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
|
159
|
+
def from_stats(
|
160
|
+
self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
|
161
|
+
) -> OutliersOutput:
|
151
162
|
"""
|
152
163
|
Returns indices of outliers with the issues identified for each
|
153
164
|
|
154
165
|
Parameters
|
155
166
|
----------
|
156
|
-
|
157
|
-
|
167
|
+
stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
|
168
|
+
The output(s) from a dimensionstats, pixelstats, or visualstats metric
|
169
|
+
analysis or an aggregate DatasetStatsOutput
|
158
170
|
|
159
171
|
Returns
|
160
172
|
-------
|
@@ -162,36 +174,96 @@ class Outliers:
|
|
162
174
|
Output class containing the indices of outliers and a dictionary showing
|
163
175
|
the issues and calculated values for the given index.
|
164
176
|
|
177
|
+
See Also
|
178
|
+
--------
|
179
|
+
dimensionstats
|
180
|
+
pixelstats
|
181
|
+
visualstats
|
182
|
+
|
165
183
|
Example
|
166
184
|
-------
|
167
185
|
Evaluate the dataset:
|
168
186
|
|
169
|
-
>>> outliers.
|
170
|
-
|
187
|
+
>>> results = outliers.from_stats([stats1, stats2])
|
188
|
+
>>> len(results)
|
189
|
+
2
|
190
|
+
>>> results.issues[0]
|
191
|
+
{10: {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}}
|
192
|
+
>>> results.issues[1]
|
193
|
+
{}
|
171
194
|
""" # noqa: E501
|
172
|
-
stats,
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
195
|
+
if isinstance(stats, DatasetStatsOutput):
|
196
|
+
outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
|
197
|
+
return OutliersOutput(outliers)
|
198
|
+
|
199
|
+
if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
|
200
|
+
return OutliersOutput(self._get_outliers(stats.dict()))
|
201
|
+
|
202
|
+
if not isinstance(stats, Sequence):
|
203
|
+
raise TypeError(
|
204
|
+
"Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
|
205
|
+
)
|
206
|
+
|
207
|
+
stats_map: dict[type, list[int]] = {}
|
208
|
+
for i, stats_output in enumerate(stats):
|
209
|
+
if not isinstance(
|
210
|
+
stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
|
211
|
+
):
|
212
|
+
raise TypeError(
|
213
|
+
"Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
|
182
214
|
)
|
183
|
-
|
184
|
-
else:
|
185
|
-
self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
|
186
|
-
|
187
|
-
outliers = self._get_outliers()
|
215
|
+
stats_map.setdefault(type(stats_output), []).append(i)
|
188
216
|
|
189
|
-
|
190
|
-
|
191
|
-
|
217
|
+
output_list: list[dict[int, dict[str, float]]] = [{} for _ in stats]
|
218
|
+
for _, indices in stats_map.items():
|
219
|
+
substats, dataset_steps = combine_stats([stats[i] for i in indices])
|
220
|
+
outliers = self._get_outliers(substats.dict())
|
192
221
|
for idx, issue in outliers.items():
|
193
222
|
k, v = get_dataset_step_from_idx(idx, dataset_steps)
|
194
|
-
|
195
|
-
|
223
|
+
output_list[indices[k]][v] = issue
|
224
|
+
|
225
|
+
return OutliersOutput(output_list)
|
226
|
+
|
227
|
+
@set_metadata(
|
228
|
+
"dataeval.detectors",
|
229
|
+
[
|
230
|
+
"use_dimension",
|
231
|
+
"use_pixel",
|
232
|
+
"use_visual",
|
233
|
+
"outlier_method",
|
234
|
+
"outlier_threshold",
|
235
|
+
],
|
236
|
+
)
|
237
|
+
def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
|
238
|
+
"""
|
239
|
+
Returns indices of outliers with the issues identified for each
|
196
240
|
|
241
|
+
Parameters
|
242
|
+
----------
|
243
|
+
data : Iterable[ArrayLike], shape - (C, H, W)
|
244
|
+
A dataset of images in an ArrayLike format
|
245
|
+
|
246
|
+
Returns
|
247
|
+
-------
|
248
|
+
OutliersOutput
|
249
|
+
Output class containing the indices of outliers and a dictionary showing
|
250
|
+
the issues and calculated values for the given index.
|
251
|
+
|
252
|
+
Example
|
253
|
+
-------
|
254
|
+
Evaluate the dataset:
|
255
|
+
|
256
|
+
>>> results = outliers.evaluate(images)
|
257
|
+
>>> list(results.issues)
|
258
|
+
[10, 12]
|
259
|
+
>>> results.issues[10]
|
260
|
+
{'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
|
261
|
+
"""
|
262
|
+
self.stats = datasetstats(
|
263
|
+
images=data,
|
264
|
+
use_dimension=self.use_dimension,
|
265
|
+
use_pixel=self.use_pixel,
|
266
|
+
use_visual=self.use_visual,
|
267
|
+
)
|
268
|
+
outliers = self._get_outliers({k: v for o in self.stats.outputs() for k, v in o.dict().items()})
|
197
269
|
return OutliersOutput(outliers)
|