dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/_internal/detectors/clusterer.py +45 -16
- dataeval/_internal/detectors/drift/base.py +15 -12
- dataeval/_internal/detectors/drift/cvm.py +12 -8
- dataeval/_internal/detectors/drift/ks.py +7 -3
- dataeval/_internal/detectors/drift/mmd.py +15 -12
- dataeval/_internal/detectors/drift/uncertainty.py +6 -5
- dataeval/_internal/detectors/duplicates.py +35 -11
- dataeval/_internal/detectors/linter.py +85 -16
- dataeval/_internal/detectors/ood/ae.py +7 -5
- dataeval/_internal/detectors/ood/aegmm.py +6 -5
- dataeval/_internal/detectors/ood/base.py +15 -13
- dataeval/_internal/detectors/ood/llr.py +8 -5
- dataeval/_internal/detectors/ood/vae.py +6 -4
- dataeval/_internal/detectors/ood/vaegmm.py +6 -4
- dataeval/_internal/interop.py +43 -0
- dataeval/_internal/metrics/balance.py +180 -0
- dataeval/_internal/metrics/base.py +2 -84
- dataeval/_internal/metrics/ber.py +77 -53
- dataeval/_internal/metrics/coverage.py +80 -55
- dataeval/_internal/metrics/divergence.py +62 -54
- dataeval/_internal/metrics/diversity.py +206 -0
- dataeval/_internal/metrics/parity.py +292 -163
- dataeval/_internal/metrics/stats.py +48 -35
- dataeval/_internal/metrics/uap.py +31 -26
- dataeval/_internal/metrics/utils.py +237 -2
- dataeval/_internal/utils.py +64 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/metrics/__init__.py +25 -5
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
- dataeval-0.64.0.dist-info/RECORD +60 -0
- dataeval/_internal/metrics/hash.py +0 -79
- dataeval-0.61.0.dist-info/RECORD +0 -55
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
@@ -1,26 +1,29 @@
|
|
1
|
+
from abc import abstractmethod
|
1
2
|
from enum import Flag
|
2
3
|
from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Sequence, TypeVar, Union
|
3
4
|
|
4
5
|
import numpy as np
|
6
|
+
from numpy.typing import ArrayLike
|
5
7
|
from scipy.stats import entropy, kurtosis, skew
|
6
8
|
|
7
9
|
from dataeval._internal.flags import ImageHash, ImageProperty, ImageStatistics, ImageStatsFlags, ImageVisuals
|
8
|
-
from dataeval._internal.
|
9
|
-
from dataeval._internal.metrics.
|
10
|
-
from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, rescale
|
10
|
+
from dataeval._internal.interop import to_numpy_iter
|
11
|
+
from dataeval._internal.metrics.base import EvaluateMixin
|
12
|
+
from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, pchash, rescale, xxhash
|
11
13
|
|
12
14
|
QUARTILES = (0, 25, 50, 75, 100)
|
13
15
|
|
14
|
-
TBatch = TypeVar("TBatch", bound=Sequence)
|
16
|
+
TBatch = TypeVar("TBatch", bound=Sequence[ArrayLike])
|
15
17
|
TFlag = TypeVar("TFlag", bound=Flag)
|
16
18
|
|
17
19
|
|
18
|
-
class BaseStatsMetric(
|
20
|
+
class BaseStatsMetric(EvaluateMixin, Generic[TBatch, TFlag]):
|
19
21
|
def __init__(self, flags: TFlag):
|
20
22
|
self.flags = flags
|
21
23
|
self.results = []
|
22
24
|
|
23
|
-
|
25
|
+
@abstractmethod
|
26
|
+
def update(self, images: TBatch) -> None:
|
24
27
|
"""
|
25
28
|
Updates internal metric cache for later calculation
|
26
29
|
|
@@ -66,6 +69,16 @@ class BaseStatsMetric(MetricMixin, Generic[TBatch, TFlag]):
|
|
66
69
|
)
|
67
70
|
return [flag.name.lower() for flag in flags if flag.name is not None]
|
68
71
|
|
72
|
+
def evaluate(self, images: TBatch) -> Dict[str, Any]:
|
73
|
+
"""Calculate metric results given a single batch of images"""
|
74
|
+
if self.results:
|
75
|
+
raise RuntimeError("Call reset before calling evaluate")
|
76
|
+
|
77
|
+
self.update(images)
|
78
|
+
results = self.compute()
|
79
|
+
self.reset()
|
80
|
+
return results
|
81
|
+
|
69
82
|
|
70
83
|
class ImageHashMetric(BaseStatsMetric):
|
71
84
|
"""
|
@@ -80,12 +93,12 @@ class ImageHashMetric(BaseStatsMetric):
|
|
80
93
|
def __init__(self, flags: ImageHash = ImageHash.ALL):
|
81
94
|
super().__init__(flags)
|
82
95
|
|
83
|
-
def update(self,
|
84
|
-
for
|
96
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
97
|
+
for image in to_numpy_iter(images):
|
85
98
|
results = self._map(
|
86
99
|
{
|
87
|
-
ImageHash.XXHASH: lambda: xxhash(
|
88
|
-
ImageHash.PCHASH: lambda: pchash(
|
100
|
+
ImageHash.XXHASH: lambda: xxhash(image),
|
101
|
+
ImageHash.PCHASH: lambda: pchash(image),
|
89
102
|
}
|
90
103
|
)
|
91
104
|
self.results.append(results)
|
@@ -104,16 +117,16 @@ class ImagePropertyMetric(BaseStatsMetric):
|
|
104
117
|
def __init__(self, flags: ImageProperty = ImageProperty.ALL):
|
105
118
|
super().__init__(flags)
|
106
119
|
|
107
|
-
def update(self,
|
108
|
-
for
|
120
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
121
|
+
for image in to_numpy_iter(images):
|
109
122
|
results = self._map(
|
110
123
|
{
|
111
|
-
ImageProperty.WIDTH: lambda: np.int32(
|
112
|
-
ImageProperty.HEIGHT: lambda: np.int32(
|
113
|
-
ImageProperty.SIZE: lambda: np.int32(
|
114
|
-
ImageProperty.ASPECT_RATIO: lambda:
|
115
|
-
ImageProperty.CHANNELS: lambda:
|
116
|
-
ImageProperty.DEPTH: lambda: get_bitdepth(
|
124
|
+
ImageProperty.WIDTH: lambda: np.int32(image.shape[-1]),
|
125
|
+
ImageProperty.HEIGHT: lambda: np.int32(image.shape[-2]),
|
126
|
+
ImageProperty.SIZE: lambda: np.int32(image.shape[-1] * image.shape[-2]),
|
127
|
+
ImageProperty.ASPECT_RATIO: lambda: image.shape[-1] / np.int32(image.shape[-2]),
|
128
|
+
ImageProperty.CHANNELS: lambda: image.shape[-3],
|
129
|
+
ImageProperty.DEPTH: lambda: get_bitdepth(image).depth,
|
117
130
|
}
|
118
131
|
)
|
119
132
|
self.results.append(results)
|
@@ -132,14 +145,14 @@ class ImageVisualsMetric(BaseStatsMetric):
|
|
132
145
|
def __init__(self, flags: ImageVisuals = ImageVisuals.ALL):
|
133
146
|
super().__init__(flags)
|
134
147
|
|
135
|
-
def update(self,
|
136
|
-
for
|
148
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
149
|
+
for image in to_numpy_iter(images):
|
137
150
|
results = self._map(
|
138
151
|
{
|
139
|
-
ImageVisuals.BRIGHTNESS: lambda: np.mean(rescale(
|
140
|
-
ImageVisuals.BLURRINESS: lambda: np.std(edge_filter(np.mean(
|
141
|
-
ImageVisuals.MISSING: lambda: np.sum(np.isnan(
|
142
|
-
ImageVisuals.ZERO: lambda: np.int32(np.count_nonzero(
|
152
|
+
ImageVisuals.BRIGHTNESS: lambda: np.mean(rescale(image)),
|
153
|
+
ImageVisuals.BLURRINESS: lambda: np.std(edge_filter(np.mean(image, axis=0))),
|
154
|
+
ImageVisuals.MISSING: lambda: np.sum(np.isnan(image)),
|
155
|
+
ImageVisuals.ZERO: lambda: np.int32(np.count_nonzero(image == 0)),
|
143
156
|
}
|
144
157
|
)
|
145
158
|
self.results.append(results)
|
@@ -158,9 +171,9 @@ class ImageStatisticsMetric(BaseStatsMetric):
|
|
158
171
|
def __init__(self, flags: ImageStatistics = ImageStatistics.ALL):
|
159
172
|
super().__init__(flags)
|
160
173
|
|
161
|
-
def update(self,
|
162
|
-
for
|
163
|
-
scaled = rescale(
|
174
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
175
|
+
for image in to_numpy_iter(images):
|
176
|
+
scaled = rescale(image)
|
164
177
|
if (ImageStatistics.HISTOGRAM | ImageStatistics.ENTROPY) & self.flags:
|
165
178
|
hist = np.histogram(scaled, bins=256, range=(0, 1))[0]
|
166
179
|
|
@@ -192,10 +205,10 @@ class ChannelStatisticsMetric(BaseStatsMetric):
|
|
192
205
|
def __init__(self, flags: ImageStatistics = ImageStatistics.ALL):
|
193
206
|
super().__init__(flags)
|
194
207
|
|
195
|
-
def update(self,
|
196
|
-
for
|
197
|
-
scaled = rescale(
|
198
|
-
flattened = scaled.reshape(
|
208
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
209
|
+
for image in to_numpy_iter(images):
|
210
|
+
scaled = rescale(image)
|
211
|
+
flattened = scaled.reshape(image.shape[0], -1)
|
199
212
|
|
200
213
|
if (ImageStatistics.HISTOGRAM | ImageStatistics.ENTROPY) & self.flags:
|
201
214
|
hist = np.apply_along_axis(lambda x: np.histogram(x, bins=256, range=(0, 1))[0], 1, flattened)
|
@@ -253,8 +266,8 @@ class ImageStats(BaseAggregateMetric):
|
|
253
266
|
super().__init__(flags)
|
254
267
|
self._length = 0
|
255
268
|
|
256
|
-
def update(self,
|
257
|
-
for image in
|
269
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
270
|
+
for image in to_numpy_iter(images):
|
258
271
|
self._length += 1
|
259
272
|
img = normalize_image_shape(image)
|
260
273
|
for metric in self._metrics_dict:
|
@@ -295,8 +308,8 @@ class ChannelStats(BaseAggregateMetric):
|
|
295
308
|
def __init__(self, flags: Optional[ImageStatistics] = None) -> None:
|
296
309
|
super().__init__(flags)
|
297
310
|
|
298
|
-
def update(self,
|
299
|
-
for image in
|
311
|
+
def update(self, images: Iterable[ArrayLike]) -> None:
|
312
|
+
for image in to_numpy_iter(images):
|
300
313
|
img = normalize_image_shape(image)
|
301
314
|
for metric in self._metrics_dict:
|
302
315
|
metric.update([img])
|
@@ -4,42 +4,47 @@ FR Test Statistic based estimate for the upperbound
|
|
4
4
|
average precision using empirical mean precision
|
5
5
|
"""
|
6
6
|
|
7
|
-
from typing import
|
7
|
+
from typing import NamedTuple
|
8
8
|
|
9
|
-
|
9
|
+
from numpy.typing import ArrayLike
|
10
10
|
from sklearn.metrics import average_precision_score
|
11
11
|
|
12
|
-
from dataeval._internal.
|
12
|
+
from dataeval._internal.interop import to_numpy
|
13
13
|
|
14
14
|
|
15
|
-
class
|
15
|
+
class UAPOutput(NamedTuple):
|
16
16
|
"""
|
17
|
-
|
17
|
+
Attributes
|
18
|
+
----------
|
19
|
+
uap : float
|
20
|
+
The empirical mean precision estimate
|
21
|
+
"""
|
22
|
+
|
23
|
+
uap: float
|
24
|
+
|
25
|
+
|
26
|
+
def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
|
27
|
+
"""
|
28
|
+
FR Test Statistic based estimate of the empirical mean precision for
|
29
|
+
the upperbound average precision
|
18
30
|
|
19
31
|
Parameters
|
20
32
|
----------
|
21
|
-
labels :
|
33
|
+
labels : ArrayLike
|
22
34
|
A numpy array of n_samples of class labels with M unique classes.
|
23
|
-
|
24
|
-
scores : np.ndarray
|
35
|
+
scores : ArrayLike
|
25
36
|
A 2D array of class probabilities per image
|
37
|
+
|
38
|
+
Returns
|
39
|
+
-------
|
40
|
+
Dict[str, float]
|
41
|
+
uap : The empirical mean precision estimate
|
42
|
+
|
43
|
+
Raises
|
44
|
+
------
|
45
|
+
ValueError
|
46
|
+
If unique classes M < 2
|
26
47
|
"""
|
27
48
|
|
28
|
-
|
29
|
-
|
30
|
-
self.scores = scores
|
31
|
-
|
32
|
-
def evaluate(self) -> Dict[str, float]:
|
33
|
-
"""
|
34
|
-
Returns
|
35
|
-
-------
|
36
|
-
Dict[str, float]
|
37
|
-
uap : The empirical mean precision estimate
|
38
|
-
|
39
|
-
Raises
|
40
|
-
------
|
41
|
-
ValueError
|
42
|
-
If unique classes M < 2
|
43
|
-
"""
|
44
|
-
uap = float(average_precision_score(self.labels, self.scores, average="weighted"))
|
45
|
-
return {"uap": uap}
|
49
|
+
precision = float(average_precision_score(to_numpy(labels), to_numpy(scores), average="weighted"))
|
50
|
+
return UAPOutput(precision)
|
@@ -1,15 +1,180 @@
|
|
1
|
-
from typing import Any, Literal, NamedTuple, Tuple, Union
|
1
|
+
from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional, Sequence, Tuple, Union
|
2
2
|
|
3
3
|
import numpy as np
|
4
|
+
import xxhash as xxh
|
5
|
+
from PIL import Image
|
6
|
+
from scipy.fftpack import dct
|
4
7
|
from scipy.signal import convolve2d
|
5
8
|
from scipy.sparse import csr_matrix
|
6
9
|
from scipy.sparse.csgraph import minimum_spanning_tree as mst
|
7
10
|
from scipy.spatial.distance import pdist, squareform
|
11
|
+
from scipy.stats import entropy as sp_entropy
|
8
12
|
from sklearn.neighbors import NearestNeighbors
|
9
13
|
|
10
14
|
EPSILON = 1e-5
|
11
15
|
EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
|
12
16
|
BIT_DEPTH = (1, 8, 12, 16, 32)
|
17
|
+
HASH_SIZE = 8
|
18
|
+
MAX_FACTOR = 4
|
19
|
+
|
20
|
+
|
21
|
+
def get_method(method_map: Dict[str, Callable], method: str) -> Callable:
|
22
|
+
if method not in method_map:
|
23
|
+
raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
|
24
|
+
return method_map[method]
|
25
|
+
|
26
|
+
|
27
|
+
def get_counts(
|
28
|
+
data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
|
29
|
+
) -> tuple[Dict, Dict]:
|
30
|
+
"""
|
31
|
+
Initialize dictionary of histogram counts --- treat categorical values
|
32
|
+
as histogram bins.
|
33
|
+
|
34
|
+
Parameters
|
35
|
+
----------
|
36
|
+
subset_mask: Optional[np.ndarray[bool]]
|
37
|
+
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
38
|
+
|
39
|
+
Returns
|
40
|
+
-------
|
41
|
+
counts: Dict
|
42
|
+
histogram counts per metadata factor in `factors`. Each
|
43
|
+
factor will have a different number of bins. Counts get reused
|
44
|
+
across metrics, so hist_counts are cached but only if computed
|
45
|
+
globally, i.e. without masked samples.
|
46
|
+
"""
|
47
|
+
|
48
|
+
hist_counts, hist_bins = {}, {}
|
49
|
+
# np.where needed to satisfy linter
|
50
|
+
mask = np.where(subset_mask if subset_mask is not None else np.ones(data.shape[0], dtype=bool))
|
51
|
+
|
52
|
+
for cdx, fn in enumerate(names):
|
53
|
+
# linter doesn't like double indexing
|
54
|
+
col_data = data[mask, cdx].squeeze()
|
55
|
+
if is_categorical[cdx]:
|
56
|
+
# if discrete, use unique values as bins
|
57
|
+
bins, cnts = np.unique(col_data, return_counts=True)
|
58
|
+
else:
|
59
|
+
bins = hist_bins.get(fn, "auto")
|
60
|
+
cnts, bins = np.histogram(col_data, bins=bins, density=True)
|
61
|
+
|
62
|
+
hist_counts[fn] = cnts
|
63
|
+
hist_bins[fn] = bins
|
64
|
+
|
65
|
+
return hist_counts, hist_bins
|
66
|
+
|
67
|
+
|
68
|
+
def entropy(
|
69
|
+
data: np.ndarray,
|
70
|
+
names: List[str],
|
71
|
+
is_categorical: List[bool],
|
72
|
+
normalized: bool = False,
|
73
|
+
subset_mask: Optional[np.ndarray] = None,
|
74
|
+
) -> np.ndarray:
|
75
|
+
"""
|
76
|
+
Meant for use with Bias metrics, Balance, Diversity, ClasswiseBalance,
|
77
|
+
and Classwise Diversity.
|
78
|
+
|
79
|
+
Compute entropy for discrete/categorical variables and, through standard
|
80
|
+
histogram binning, for continuous variables.
|
81
|
+
|
82
|
+
Parameters
|
83
|
+
----------
|
84
|
+
normalized: bool
|
85
|
+
Flag that determines whether or not to normalize entropy by log(num_bins)
|
86
|
+
subset_mask: Optional[np.ndarray[bool]]
|
87
|
+
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
88
|
+
|
89
|
+
Notes
|
90
|
+
-----
|
91
|
+
For continuous variables, histogram bins are chosen automatically. See
|
92
|
+
numpy.histogram for details.
|
93
|
+
|
94
|
+
Returns
|
95
|
+
-------
|
96
|
+
ent: np.ndarray[float]
|
97
|
+
Entropy estimate per column of X
|
98
|
+
|
99
|
+
See Also
|
100
|
+
--------
|
101
|
+
numpy.histogram
|
102
|
+
scipy.stats.entropy
|
103
|
+
"""
|
104
|
+
|
105
|
+
num_factors = len(names)
|
106
|
+
hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
|
107
|
+
|
108
|
+
ev_index = np.empty(num_factors)
|
109
|
+
for col, cnts in enumerate(hist_counts.values()):
|
110
|
+
# entropy in nats, normalizes counts
|
111
|
+
ev_index[col] = sp_entropy(cnts)
|
112
|
+
if normalized:
|
113
|
+
if len(cnts) == 1:
|
114
|
+
# log(0)
|
115
|
+
ev_index[col] = 0
|
116
|
+
else:
|
117
|
+
ev_index[col] /= np.log(len(cnts))
|
118
|
+
return ev_index
|
119
|
+
|
120
|
+
|
121
|
+
def get_num_bins(
|
122
|
+
data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
|
123
|
+
) -> np.ndarray:
|
124
|
+
"""
|
125
|
+
Number of bins or unique values for each metadata factor, used to
|
126
|
+
normalize entropy/diversity.
|
127
|
+
|
128
|
+
Parameters
|
129
|
+
----------
|
130
|
+
subset_mask: Optional[np.ndarray[bool]]
|
131
|
+
Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
|
132
|
+
"""
|
133
|
+
# likely cached
|
134
|
+
hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
|
135
|
+
num_bins = np.empty(len(hist_counts))
|
136
|
+
for idx, cnts in enumerate(hist_counts.values()):
|
137
|
+
num_bins[idx] = len(cnts)
|
138
|
+
|
139
|
+
return num_bins
|
140
|
+
|
141
|
+
|
142
|
+
def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
|
143
|
+
"""
|
144
|
+
Compute fraction of feature values that are unique --- intended to be used
|
145
|
+
for inferring whether variables are categorical.
|
146
|
+
"""
|
147
|
+
if X.ndim == 1:
|
148
|
+
X = np.expand_dims(X, axis=1)
|
149
|
+
num_samples = X.shape[0]
|
150
|
+
pct_unique = np.empty(X.shape[1])
|
151
|
+
for col in range(X.shape[1]): # type: ignore
|
152
|
+
uvals = np.unique(X[:, col], axis=0)
|
153
|
+
pct_unique[col] = len(uvals) / num_samples
|
154
|
+
return pct_unique < threshold
|
155
|
+
|
156
|
+
|
157
|
+
def preprocess_metadata(class_labels: Sequence[int], metadata: List[Dict]) -> Tuple[np.ndarray, List[str], List[bool]]:
|
158
|
+
# convert class_labels and list of metadata dicts to dict of ndarrays
|
159
|
+
metadata_dict: Dict[str, np.ndarray] = {
|
160
|
+
"class_label": np.asarray(class_labels, dtype=int),
|
161
|
+
**{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
|
162
|
+
}
|
163
|
+
|
164
|
+
# map columns of dict that are not numeric (e.g. string) to numeric values
|
165
|
+
# that mutual information and diversity functions can accommodate. Each
|
166
|
+
# unique string receives a unique integer value.
|
167
|
+
for k, v in metadata_dict.items():
|
168
|
+
# if not numeric
|
169
|
+
if not np.issubdtype(v.dtype, np.number):
|
170
|
+
_, mapped_vals = np.unique(v, return_inverse=True)
|
171
|
+
metadata_dict[k] = mapped_vals
|
172
|
+
|
173
|
+
data = np.stack(list(metadata_dict.values()), axis=-1)
|
174
|
+
names = list(metadata_dict.keys())
|
175
|
+
is_categorical = [infer_categorical(metadata_dict[var], 0.25)[0] for var in names]
|
176
|
+
|
177
|
+
return data, names, is_categorical
|
13
178
|
|
14
179
|
|
15
180
|
def minimum_spanning_tree(X: np.ndarray) -> Any:
|
@@ -89,7 +254,7 @@ def compute_neighbors(
|
|
89
254
|
|
90
255
|
See Also
|
91
256
|
--------
|
92
|
-
|
257
|
+
sklearn.neighbors.NearestNeighbors
|
93
258
|
"""
|
94
259
|
|
95
260
|
nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
|
@@ -156,3 +321,73 @@ def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
|
|
156
321
|
edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
|
157
322
|
np.clip(edges, 0, 255, edges)
|
158
323
|
return edges
|
324
|
+
|
325
|
+
|
326
|
+
def pchash(image: np.ndarray) -> str:
|
327
|
+
"""
|
328
|
+
Performs a perceptual hash on an image by resizing to a square NxN image
|
329
|
+
using the Lanczos algorithm where N is 32x32 or the largest multiple of
|
330
|
+
8 that is smaller than the input image dimensions. The resampled image
|
331
|
+
is compressed using a discrete cosine transform and the lowest frequency
|
332
|
+
component is encoded as a bit array of greater or less than median value
|
333
|
+
and returned as a hex string.
|
334
|
+
|
335
|
+
Parameters
|
336
|
+
----------
|
337
|
+
image : np.ndarray
|
338
|
+
An image as a numpy array in CxHxW format
|
339
|
+
|
340
|
+
Returns
|
341
|
+
-------
|
342
|
+
str
|
343
|
+
The hex string hash of the image using perceptual hashing
|
344
|
+
"""
|
345
|
+
# Verify that the image is at least larger than an 8x8 image
|
346
|
+
min_dim = min(image.shape[-2:])
|
347
|
+
if min_dim < HASH_SIZE + 1:
|
348
|
+
raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
|
349
|
+
|
350
|
+
# Calculates the dimensions of the resized square image
|
351
|
+
resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
|
352
|
+
|
353
|
+
# Normalizes the image to CxHxW and takes the mean over all the channels
|
354
|
+
normalized = np.mean(normalize_image_shape(image), axis=0).squeeze()
|
355
|
+
|
356
|
+
# Rescales the pixel values to an 8-bit 0-255 image
|
357
|
+
rescaled = rescale(normalized, 8).astype(np.uint8)
|
358
|
+
|
359
|
+
# Resizes the image using the Lanczos algorithm to a square image
|
360
|
+
im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
|
361
|
+
|
362
|
+
# Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
|
363
|
+
transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
|
364
|
+
|
365
|
+
# Encodes the transform as a bit array over the median value
|
366
|
+
diff = transform > np.median(transform)
|
367
|
+
|
368
|
+
# Pads the front of the bit array to a multiple of 8 with False
|
369
|
+
padded = np.full(int(np.ceil(diff.size / 8) * 8), False)
|
370
|
+
padded[-diff.size :] = diff.ravel()
|
371
|
+
|
372
|
+
# Converts the bit array to a hex string and strips leading 0s
|
373
|
+
hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
|
374
|
+
return hash_hex if hash_hex else "0"
|
375
|
+
|
376
|
+
|
377
|
+
def xxhash(image: np.ndarray) -> str:
|
378
|
+
"""
|
379
|
+
Performs a fast non-cryptographic hash using the xxhash algorithm
|
380
|
+
(xxhash.com) against the image as a flattened bytearray. The hash
|
381
|
+
is returned as a hex string.
|
382
|
+
|
383
|
+
Parameters
|
384
|
+
----------
|
385
|
+
image : np.ndarray
|
386
|
+
An image as a numpy array
|
387
|
+
|
388
|
+
Returns
|
389
|
+
-------
|
390
|
+
str
|
391
|
+
The hex string hash of the image using the xxHash algorithm
|
392
|
+
"""
|
393
|
+
return xxh.xxh3_64_hexdigest(image.ravel().tobytes())
|
@@ -0,0 +1,64 @@
|
|
1
|
+
from collections import defaultdict
|
2
|
+
from typing import Any, Dict, List
|
3
|
+
|
4
|
+
from torch.utils.data import Dataset
|
5
|
+
|
6
|
+
|
7
|
+
def read_dataset(dataset: Dataset) -> List[List[Any]]:
|
8
|
+
"""
|
9
|
+
Extract information from a dataset at each index into a individual lists of each information position
|
10
|
+
|
11
|
+
Parameters
|
12
|
+
----------
|
13
|
+
dataset : torch.utils.data.Dataset
|
14
|
+
Input dataset
|
15
|
+
|
16
|
+
Returns
|
17
|
+
-------
|
18
|
+
List[List[Any]]
|
19
|
+
All objects in individual lists based on return position from dataset
|
20
|
+
|
21
|
+
Warning
|
22
|
+
-------
|
23
|
+
No type checking is done between lists or data inside lists
|
24
|
+
|
25
|
+
See Also
|
26
|
+
--------
|
27
|
+
torch.utils.data.Dataset
|
28
|
+
|
29
|
+
Examples
|
30
|
+
--------
|
31
|
+
>>> import numpy as np
|
32
|
+
|
33
|
+
>>> data = np.ones((10, 3, 3))
|
34
|
+
>>> labels = np.ones((10,))
|
35
|
+
>>> class ICDataset:
|
36
|
+
... def __init__(self, data, labels):
|
37
|
+
... self.data = data
|
38
|
+
... self.labels = labels
|
39
|
+
|
40
|
+
... def __getitem__(self, idx):
|
41
|
+
... return self.data[idx], self.labels[idx]
|
42
|
+
|
43
|
+
>>> ds = ICDataset(data, labels)
|
44
|
+
|
45
|
+
>>> result = read_dataset(ds)
|
46
|
+
>>> assert len(result) == 2
|
47
|
+
True
|
48
|
+
>>> assert result[0].shape == (10, 3, 3) # 10 3x3 images
|
49
|
+
True
|
50
|
+
>>> assert result[1].shape == (10,) # 10 labels
|
51
|
+
True
|
52
|
+
"""
|
53
|
+
|
54
|
+
ddict: Dict[int, List] = defaultdict(list)
|
55
|
+
|
56
|
+
for data in dataset:
|
57
|
+
# Convert to tuple if single return (e.g. images only)
|
58
|
+
if not isinstance(data, tuple):
|
59
|
+
data = (data,)
|
60
|
+
|
61
|
+
for i, d in enumerate(data):
|
62
|
+
ddict[i].append(d)
|
63
|
+
|
64
|
+
return list(ddict.values())
|
File without changes
|
dataeval/metrics/__init__.py
CHANGED
@@ -1,7 +1,27 @@
|
|
1
|
-
from
|
2
|
-
|
3
|
-
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
__all__: List[str] = []
|
4
|
+
|
5
|
+
from dataeval._internal.metrics.balance import balance, balance_classwise
|
6
|
+
from dataeval._internal.metrics.ber import ber
|
7
|
+
from dataeval._internal.metrics.coverage import coverage
|
8
|
+
from dataeval._internal.metrics.divergence import divergence
|
9
|
+
from dataeval._internal.metrics.diversity import diversity, diversity_classwise
|
10
|
+
from dataeval._internal.metrics.parity import parity, parity_metadata
|
4
11
|
from dataeval._internal.metrics.stats import ChannelStats, ImageStats
|
5
|
-
from dataeval._internal.metrics.uap import
|
12
|
+
from dataeval._internal.metrics.uap import uap
|
6
13
|
|
7
|
-
__all__
|
14
|
+
__all__ += [
|
15
|
+
"balance",
|
16
|
+
"balance_classwise",
|
17
|
+
"ber",
|
18
|
+
"coverage",
|
19
|
+
"divergence",
|
20
|
+
"diversity",
|
21
|
+
"diversity_classwise",
|
22
|
+
"parity",
|
23
|
+
"parity_metadata",
|
24
|
+
"ChannelStats",
|
25
|
+
"ImageStats",
|
26
|
+
"uap",
|
27
|
+
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.64.0
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -23,7 +23,6 @@ Provides-Extra: all
|
|
23
23
|
Provides-Extra: tensorflow
|
24
24
|
Provides-Extra: torch
|
25
25
|
Requires-Dist: hdbscan (>=0.8.36)
|
26
|
-
Requires-Dist: maite
|
27
26
|
Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
|
28
27
|
Requires-Dist: numpy (>1.24.3)
|
29
28
|
Requires-Dist: nvidia-cudnn-cu11 (>=8.6.0.163) ; extra == "tensorflow" or extra == "torch" or extra == "all"
|