dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
- dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
- dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
- dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
- dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
- dataeval/{_internal/interop.py → interop.py} +12 -7
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
- dataeval/metrics/bias/metadata.py +275 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +7 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
- dataeval-0.72.2.dist-info/RECORD +72 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
@@ -1,75 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from dataclasses import dataclass
|
4
|
-
from typing import Iterable
|
5
|
-
|
6
|
-
from numpy.typing import ArrayLike
|
7
|
-
|
8
|
-
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
9
|
-
from dataeval._internal.metrics.utils import pchash, xxhash
|
10
|
-
from dataeval._internal.output import set_metadata
|
11
|
-
|
12
|
-
|
13
|
-
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Result container returned by the :func:`hashstats` stats metric.

    Attributes
    ----------
    xxhash : list[str]
        xxHash digests of the images, each rendered as a hex string
    pchash : list[str]
        :term:`Perception-based Hash` values of the images, each rendered as a hex string
    """

    # Field order defines the generated constructor signature; keep xxhash first.
    xxhash: list[str]
    pchash: list[str]
|
28
|
-
|
29
|
-
|
30
|
-
class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
    """Stats processor that produces the two hash fields of :class:`HashStatsOutput`."""

    output_class = HashStatsOutput
    # Maps each output field name to a callable computing it from the object
    # the stats machinery hands in; only its ``.image`` attribute is read here.
    image_function_map = dict(
        xxhash=lambda proc: xxhash(proc.image),
        pchash=lambda proc: pchash(proc.image),
    )
|
36
|
-
|
37
|
-
|
38
|
-
@set_metadata("dataeval.metrics")
def hashstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> HashStatsOutput:
    """
    Calculates hashes for each image

    Both an exact hash (xxHash) and a perception-based hash are computed for every
    image. Comparing the resulting values lets callers find images that are exact
    or near matches.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to hash
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image

    Returns
    -------
    HashStatsOutput
        A dictionary-like object containing the computed hashes for each image.

    See Also
    --------
    :term:`Duplicates`

    Examples
    --------
    Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)

    >>> results = hashstats(images)
    >>> print(results.xxhash)
    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
    >>> print(results.pchash)
    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
    """
    # run_stats yields one output per requested processor; only one is requested here.
    outputs = run_stats(images, bboxes, False, [HashStatsProcessor])
    return outputs[0]
|
@@ -1,447 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from typing import Any, Callable, Literal, Mapping, NamedTuple
|
4
|
-
|
5
|
-
import numpy as np
|
6
|
-
import xxhash as xxh
|
7
|
-
from numpy.typing import ArrayLike, NDArray
|
8
|
-
from PIL import Image
|
9
|
-
from scipy.fftpack import dct
|
10
|
-
from scipy.signal import convolve2d
|
11
|
-
from scipy.sparse import csr_matrix
|
12
|
-
from scipy.sparse.csgraph import minimum_spanning_tree as mst
|
13
|
-
from scipy.spatial.distance import pdist, squareform
|
14
|
-
from scipy.stats import entropy as sp_entropy
|
15
|
-
from sklearn.neighbors import NearestNeighbors
|
16
|
-
|
17
|
-
from dataeval._internal.interop import to_numpy
|
18
|
-
|
19
|
-
EPSILON = 1e-5
|
20
|
-
EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
|
21
|
-
BIT_DEPTH = (1, 8, 12, 16, 32)
|
22
|
-
HASH_SIZE = 8
|
23
|
-
MAX_FACTOR = 4
|
24
|
-
|
25
|
-
|
26
|
-
def get_method(method_map: dict[str, Callable], method: str) -> Callable:
    """
    Look up a callable by name in a dispatch table.

    Parameters
    ----------
    method_map : dict[str, Callable]
        Mapping from method name to the callable implementing it
    method : str
        Name of the requested method

    Returns
    -------
    Callable
        The entry of ``method_map`` stored under ``method``

    Raises
    ------
    ValueError
        If ``method`` is not a key of ``method_map``
    """
    if method in method_map:
        return method_map[method]
    raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
|
30
|
-
|
31
|
-
|
32
|
-
def get_counts(
    data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
) -> tuple[dict, dict]:
    """
    Build per-factor histogram counts, treating categorical values as
    histogram bins.

    Parameters
    ----------
    data: NDArray
        Array indexed as ``data[rows, column]``; column ``i`` holds the values of ``names[i]``
    names: list[str]
        Factor names, used as keys of the returned dictionaries
    is_categorical: list[bool]
        Per-column flag; True -> bin by unique value, False -> bin with numpy.histogram
    subset_mask: NDArray[np.bool_] | None
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Returns
    -------
    counts: Dict
        histogram counts per factor in `names`. Each factor may have a
        different number of bins. For non-categorical factors these are
        density values (``density=True``), not raw counts.
    bins: Dict
        unique values (categorical) or bin edges (non-categorical) per factor

    Note
    ----
    Nothing is cached inside this function; both dictionaries are rebuilt on
    every call.
    """
    if subset_mask is None:
        selected = np.ones(data.shape[0], dtype=bool)
    else:
        selected = subset_mask
    # np.where converts the boolean selection into a tuple of row indices
    row_index = np.where(selected)

    hist_counts, hist_bins = {}, {}
    for cdx, fn in enumerate(names):
        # indexing with the index tuple adds a leading axis, which squeeze() removes
        column = data[row_index, cdx].squeeze()
        if is_categorical[cdx]:
            # discrete factor: every unique value is its own bin
            edges, freq = np.unique(column, return_counts=True)
        else:
            # falls back to numpy's automatic binning unless bins for this name were already stored
            freq, edges = np.histogram(column, bins=hist_bins.get(fn, "auto"), density=True)

        hist_counts[fn] = freq
        hist_bins[fn] = edges

    return hist_counts, hist_bins
|
71
|
-
|
72
|
-
|
73
|
-
def entropy(
    data: NDArray,
    names: list[str],
    is_categorical: list[bool],
    normalized: bool = False,
    subset_mask: NDArray[np.bool_] | None = None,
) -> NDArray[np.float64]:
    """
    Meant for use with :term:`bias<Bias>` metrics, :term:`balance<Balance>`, :term:`diversity<Diversity>`,
    ClasswiseBalance, and Classwise Diversity.

    Entropy is estimated per factor from the histogram counts produced by
    ``get_counts``: unique-value counts for discrete/categorical variables and
    standard histogram binning for continuous variables.

    Parameters
    ----------
    data: NDArray
        Values forwarded unchanged to ``get_counts``
    names: list[str]
        Factor names; one entropy value is returned per name
    is_categorical: list[bool]
        Per-factor categorical flags, forwarded to ``get_counts``
    normalized: bool
        Flag that determines whether or not to normalize entropy by log(num_bins)
    subset_mask: NDArray[np.bool_] | None
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Note
    ----
    For continuous variables, histogram bins are chosen automatically. See
    numpy.histogram for details.

    Returns
    -------
    ent: NDArray[np.float64]
        Entropy estimate per factor

    See Also
    --------
    numpy.histogram
    scipy.stats.entropy
    """
    num_factors = len(names)
    hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)

    ev_index = np.empty(num_factors)
    for col, cnts in enumerate(hist_counts.values()):
        # scipy normalizes the counts itself and reports entropy in nats
        value = sp_entropy(cnts)
        if normalized:
            n_bins = len(cnts)
            # a single bin would mean dividing by log(1) == 0, so report 0 instead
            value = 0 if n_bins == 1 else value / np.log(n_bins)
        ev_index[col] = value
    return ev_index
|
124
|
-
|
125
|
-
|
126
|
-
def get_num_bins(
    data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
) -> NDArray[np.float64]:
    """
    Number of bins or unique values for each metadata factor, used to
    normalize entropy/:term:`diversity<Diversity>`.

    Parameters
    ----------
    data: NDArray
        Values forwarded unchanged to ``get_counts``
    names: list[str]
        Factor names, forwarded to ``get_counts``
    is_categorical: list[bool]
        Per-factor categorical flags, forwarded to ``get_counts``
    subset_mask: NDArray[np.bool_] | None
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

    Returns
    -------
    NDArray[np.float64]
        Length of the count array of every factor, in the order ``get_counts`` returned them
    """
    hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
    return np.array([len(cnts) for cnts in hist_counts.values()], dtype=np.float64)
|
149
|
-
|
150
|
-
|
151
|
-
def infer_categorical(X: NDArray, threshold: float = 0.2) -> NDArray:
    """
    Flag the columns of ``X`` whose fraction of unique values is below
    ``threshold`` --- intended to be used for inferring whether variables are
    categorical.

    Parameters
    ----------
    X : NDArray
        Samples along the first axis, features along the second; a 1-D input
        is treated as a single feature column
    threshold : float
        A column is flagged when ``n_unique / n_samples < threshold``

    Returns
    -------
    NDArray
        Boolean array with one entry per column of ``X``
    """
    if X.ndim == 1:
        X = np.expand_dims(X, axis=1)

    num_samples = X.shape[0]
    pct_unique = np.array(
        [len(np.unique(X[:, col], axis=0)) / num_samples for col in range(X.shape[1])],
        dtype=np.float64,
    )
    return pct_unique < threshold
|
164
|
-
|
165
|
-
|
166
|
-
def preprocess_metadata(
    class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
) -> tuple[NDArray, list[str], list[bool]]:
    """
    Convert class labels and a mapping of metadata columns into one numeric matrix.

    Non-numeric columns (e.g. strings) are replaced by integer codes so that
    downstream mutual information and diversity functions can accommodate
    them: each unique value receives a unique integer.

    Parameters
    ----------
    class_labels : ArrayLike
        Class labels; stored as an integer column under the name "class_label"
    metadata : Mapping[str, ArrayLike]
        Metadata columns keyed by factor name
    cat_thresh : float
        Threshold passed to ``infer_categorical`` for each column

    Returns
    -------
    tuple[NDArray, list[str], list[bool]]
        The columns stacked along the last axis, the column names in the same
        order, and the per-column categorical flags
    """
    columns = {"class_label": np.asarray(class_labels, dtype=int)}

    for key, raw in metadata.items():
        values = to_numpy(raw)
        if np.issubdtype(values.dtype, np.number):
            columns[key] = values
        else:
            # the inverse index of np.unique is the integer code of each entry
            columns[key] = np.unique(values, return_inverse=True)[1]

    names = list(columns)
    data = np.stack([columns[name] for name in names], axis=-1)
    is_categorical = [infer_categorical(columns[name], cat_thresh)[0] for name in names]

    return data, names, is_categorical
|
189
|
-
|
190
|
-
|
191
|
-
def flatten(X: NDArray):
    """
    Collapse every axis after the first, turning (N, ... ) into (N, -1) so
    each of the N samples keeps all of its data in the last dimension

    Parameters
    ----------
    X : NDArray, shape - (N, ... )
        Input array

    Returns
    -------
    NDArray, shape - (N, -1)
    """
    n_samples = X.shape[0]
    return X.reshape(n_samples, -1)
|
206
|
-
|
207
|
-
|
208
|
-
def minimum_spanning_tree(X: NDArray) -> Any:
    """
    Returns the minimum spanning tree from a :term:`NumPy` image array.

    Parameters
    ----------
    X : NDArray
        NumPy image array

    Returns
    -------
    Data representing the minimum spanning tree, as produced by
    scipy.sparse.csgraph.minimum_spanning_tree
    """
    # Every sample becomes one row holding all of its features
    features = flatten(X)
    pairwise = squareform(pdist(features))
    # A small constant is added to every distance so that scipy interprets the
    # input graph as fully-connected.
    return mst(csr_matrix(pairwise + EPSILON))
|
228
|
-
|
229
|
-
|
230
|
-
def get_classes_counts(labels: NDArray) -> tuple[int, int]:
    """
    Returns the number of unique classes and the total number of labels

    Parameters
    ----------
    labels : NDArray
        :term:`NumPy` labels array

    Returns
    -------
    Number of unique classes, followed by the sum of the per-class counts

    Raises
    ------
    ValueError
        If the number of unique classes is less than 2
    """
    _, counts = np.unique(labels, return_counts=True)
    M = len(counts)
    if M < 2:
        raise ValueError("Label vector contains less than 2 classes!")
    return M, counts.sum().astype(int)
|
254
|
-
|
255
|
-
|
256
|
-
def compute_neighbors(
    A: NDArray,
    B: NDArray,
    k: int = 1,
    algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
) -> NDArray:
    """
    For each sample in A, find neighbor indices in B

    Parameters
    ----------
    A : NDArray
        Query samples; flattened to (n_samples, n_features) before the search
    B : NDArray
        Reference samples the index is fitted on; flattened the same way
    k : int
        The number of neighbors to find
    algorithm : Literal
        Tree method for nearest neighbor (auto, ball_tree or kd_tree)

    Note
    ----
    Do not use kd_tree if n_features > 20

    ``k + 1`` neighbors are requested and the closest one is discarded.
    NOTE(review): presumably this skips a sample matching itself when A's
    points are also in B --- confirm against callers.

    Returns
    -------
    NDArray
        Indices into B of the retained neighbors for each sample in A,
        with singleton dimensions squeezed out

    Raises
    ------
    ValueError
        If k is less than 1
    ValueError
        If algorithm is not "auto", "ball_tree", or "kd_tree"

    See Also
    --------
    sklearn.neighbors.NearestNeighbors
    """
    if k < 1:
        raise ValueError("k must be >= 1")
    if algorithm not in ("auto", "ball_tree", "kd_tree"):
        raise ValueError("Algorithm must be 'auto', 'ball_tree', or 'kd_tree'")

    queries = flatten(A)
    reference = flatten(B)

    model = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm)
    model.fit(reference)
    _, indices = model.kneighbors(queries)

    # drop the closest hit of every query, keep the remaining k
    return indices[:, 1:].squeeze()
|
306
|
-
|
307
|
-
|
308
|
-
class BitDepth(NamedTuple):
    """Bit depth of an image together with the pixel range that goes with it."""

    # number of bits
    depth: int
    # lower bound of the pixel range
    pmin: float | int
    # upper bound of the pixel range
    pmax: float | int
|
312
|
-
|
313
|
-
|
314
|
-
def get_bitdepth(image: NDArray) -> BitDepth:
    """
    Approximates the bit depth of the image using the
    min and max pixel values.

    Parameters
    ----------
    image : NDArray
        Image whose pixel range is inspected

    Returns
    -------
    BitDepth
        Depth 0 with the observed min/max when any pixel is negative;
        otherwise the first entry of ``BIT_DEPTH`` whose range exceeds the
        largest pixel (or the largest entry when none does), with the range
        ``0 .. 2**depth - 1``
    """
    pmin, pmax = np.min(image), np.max(image)
    if pmin < 0:
        # negative pixels cannot be described by an unsigned bit depth
        return BitDepth(0, pmin, pmax)

    depth = next((bits for bits in BIT_DEPTH if 2**bits > pmax), max(BIT_DEPTH))
    return BitDepth(depth, 0, 2**depth - 1)
|
325
|
-
|
326
|
-
|
327
|
-
def rescale(image: NDArray, depth: int = 1) -> NDArray:
    """
    Rescales the image using the bit depth provided.

    Parameters
    ----------
    image : NDArray
        Image to rescale
    depth : int
        Target bit depth; the output range is ``0 .. 2**depth - 1``

    Returns
    -------
    NDArray
        ``image`` itself when its estimated bit depth already equals ``depth``,
        otherwise the min-max normalized image scaled to the target range
    """
    bitdepth = get_bitdepth(image)
    if bitdepth.depth == depth:
        return image

    span = bitdepth.pmax - bitdepth.pmin
    if span == 0:
        # A constant image has no dynamic range; avoid 0/0 and map it to the
        # bottom of the target range.
        return np.zeros(image.shape, dtype=np.float64)

    # Min-max normalization must subtract the lower bound so that pmin maps to
    # 0 and pmax maps to 1 (this matters for images with negative pixels,
    # where pmin is non-zero).
    normalized = (image - bitdepth.pmin) / span
    return normalized * (2**depth - 1)
|
337
|
-
|
338
|
-
|
339
|
-
def normalize_image_shape(image: NDArray) -> NDArray:
    """
    Normalizes the image shape into (C,H,W).

    Parameters
    ----------
    image : NDArray
        Image with at least 2 dimensions

    Returns
    -------
    NDArray
        A 2-D input gains a leading axis, a 3-D input is returned unchanged,
        and for more than 3 dimensions the first element along every leading
        axis is taken so that only the last 3 dimensions remain

    Raises
    ------
    ValueError
        If the image has fewer than 2 dimensions
    """
    ndim = image.ndim
    if ndim < 2:
        raise ValueError("Images must have 2 or more dimensions.")
    if ndim == 2:
        return np.expand_dims(image, axis=0)
    if ndim == 3:
        return image
    # index 0 on each extra leading axis, keeping the trailing 3 dimensions
    return image[(0,) * (ndim - 3)]
|
353
|
-
|
354
|
-
|
355
|
-
def normalize_box_shape(bounding_box: NDArray) -> NDArray:
    """
    Normalizes the bounding box shape into (N,4).

    Parameters
    ----------
    bounding_box : NDArray
        A single box (1-D) or an array of boxes (2-D)

    Returns
    -------
    NDArray
        A 1-D input gains a leading axis; any other input with at most
        2 dimensions is returned unchanged

    Raises
    ------
    ValueError
        If the input has more than 2 dimensions
    """
    ndim = bounding_box.ndim
    if ndim > 2:
        raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
    return np.expand_dims(bounding_box, axis=0) if ndim == 1 else bounding_box
|
366
|
-
|
367
|
-
|
368
|
-
def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
    """
    Returns the image filtered using a 3x3 edge detection kernel:
    [[ -1, -1, -1 ],
     [ -1,  8, -1 ],
     [ -1, -1, -1 ]]

    Parameters
    ----------
    image : NDArray
        2-D image to convolve
    offset : float
        Constant added to the convolution result before clipping

    Returns
    -------
    NDArray
        Filtered image with the same shape as the input, clipped to 0-255
    """
    filtered = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm")
    edges = filtered + offset
    # clip in place so no second array is allocated
    np.clip(edges, 0, 255, out=edges)
    return edges
|
378
|
-
|
379
|
-
|
380
|
-
def pchash(image: NDArray) -> str:
    """
    Performs a perceptual hash on an image.

    The image is averaged over its channels, rescaled to 8-bit and resized
    with the Lanczos algorithm to a square whose side is a multiple of
    ``HASH_SIZE`` (capped at ``HASH_SIZE * MAX_FACTOR``) and smaller than the
    shortest image side. The resampled image is compressed using a discrete
    cosine transform; the lowest-frequency ``HASH_SIZE`` x ``HASH_SIZE`` block
    is encoded as a bit array of greater or less than its median value and
    returned as a hex string.

    Parameters
    ----------
    image : NDArray
        An image as a :term:`NumPy` array in CxHxW format

    Returns
    -------
    str
        The hex string hash of the image using perceptual hashing, with
        leading zeros stripped ("0" when every digit is zero)

    Raises
    ------
    ValueError
        If the shortest of the last two image dimensions is not larger than ``HASH_SIZE``
    """
    # The image has to be strictly larger than the hash grid
    smallest_side = min(image.shape[-2:])
    if smallest_side <= HASH_SIZE:
        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")

    # Side length of the square the image is resampled to
    factor = min((smallest_side - 1) // HASH_SIZE, MAX_FACTOR)
    side = HASH_SIZE * factor

    # Bring the image to CxHxW and average the channels away
    chw = normalize_image_shape(image)
    gray = np.mean(chw, axis=0).squeeze()

    # Map the pixel values onto an 8-bit 0-255 image
    gray8 = rescale(gray, 8).astype(np.uint8)

    # Lanczos resampling to the square target size
    resized = Image.fromarray(gray8).resize((side, side), Image.Resampling.LANCZOS)
    pixels = np.array(resized)

    # DCT along both axes, then keep only the lowest-frequency corner
    spectrum = dct(dct(pixels.T).T)
    low_freq = spectrum[:HASH_SIZE, :HASH_SIZE]

    # One bit per coefficient: is it above the median?
    bits = low_freq > np.median(low_freq)

    # Front-pad with False up to a whole number of bytes
    n_padded = int(np.ceil(bits.size / 8) * 8)
    padded = np.full(n_padded, False)
    padded[-bits.size :] = bits.ravel()

    # Pack to bytes, render as hex and drop leading zeros
    hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
    return hash_hex or "0"
|
429
|
-
|
430
|
-
|
431
|
-
def xxhash(image: NDArray) -> str:
    """
    Performs a fast non-cryptographic hash using the xxhash algorithm
    (xxhash.com) against the bytes of the flattened image. The hash
    is returned as a hex string.

    Parameters
    ----------
    image : NDArray
        An image as a :term:`NumPy` array

    Returns
    -------
    str
        The hex string hash of the image using the xxHash algorithm
    """
    raw = image.ravel().tobytes()
    return xxh.xxh3_64_hexdigest(raw)
|
File without changes
|
File without changes
|
@@ -1,67 +0,0 @@
|
|
1
|
-
from numpy import float32, ndarray
|
2
|
-
from torch import Tensor, from_numpy
|
3
|
-
|
4
|
-
|
5
|
-
def torch_to_numpy(tensor: Tensor) -> ndarray:
    """
    Converts a PyTorch tensor to a NumPy array

    Parameters
    ----------
    tensor : Tensor
        Tensor to convert; an ndarray is accepted and returned unchanged

    Returns
    -------
    ndarray
        The tensor's data, detached and moved to the CPU

    Raises
    ------
    TypeError
        If the input is neither an ndarray nor a Tensor
    """
    if isinstance(tensor, ndarray):
        # nothing to convert
        return tensor
    if isinstance(tensor, Tensor):
        return tensor.detach().cpu().numpy()
    raise TypeError("Tensor is not of type Tensor")
|
16
|
-
|
17
|
-
|
18
|
-
def numpy_to_torch(array: ndarray) -> Tensor:
    """
    Converts a :term:`NumPy` array to a PyTorch tensor

    Parameters
    ----------
    array : ndarray
        Array to convert; a Tensor is accepted and returned unchanged

    Returns
    -------
    Tensor
        Tensor built from the array after casting it to float32

    Raises
    ------
    TypeError
        If the input is neither a Tensor nor an ndarray
    """
    if isinstance(array, Tensor):
        # nothing to convert
        return array
    if isinstance(array, ndarray):
        return from_numpy(array.astype(float32))
    raise TypeError("Array is not of type numpy.ndarray")
|
28
|
-
|
29
|
-
|
30
|
-
def permute_to_torch(array: ndarray) -> Tensor:
    """
    Converts and permutes a :term:`NumPy` image array into a PyTorch image tensor.

    Parameters
    ----------
    array: ndarray
        Array containing image data in the format NHWC

    Returns
    -------
    Tensor
        Tensor containing image data in the format NCHW
    """
    # move the channel axis from last to second position: NHWC -> NCHW
    return numpy_to_torch(array).permute(0, 3, 1, 2)
|
47
|
-
|
48
|
-
|
49
|
-
def permute_to_numpy(tensor: Tensor) -> ndarray:
    """
    Converts and permutes a PyTorch image tensor into a :term:`NumPy` image array.

    Does not permute if given ndarray

    Parameters
    ----------
    tensor: Tensor
        Tensor containing image data in the format NCHW

    Returns
    -------
    ndarray
        Array containing image data in the format NHWC; an ndarray input is
        returned unchanged
    """
    if isinstance(tensor, ndarray):
        # ndarray has no ``permute`` method; honour the documented pass-through
        # instead of failing with AttributeError.
        return tensor

    x = tensor.permute(0, 2, 3, 1)  # NCHW -> NHWC
    return torch_to_numpy(x)
|
File without changes
|
File without changes
|
@@ -1,10 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Kernels are used to map non-linear data to a higher dimensional space.
|
3
|
-
"""
|
4
|
-
|
5
|
-
from dataeval import _IS_TORCH_AVAILABLE
|
6
|
-
|
7
|
-
if _IS_TORCH_AVAILABLE: # pragma: no cover
|
8
|
-
from dataeval._internal.detectors.drift.torch import GaussianRBF
|
9
|
-
|
10
|
-
__all__ = ["GaussianRBF"]
|
@@ -1,8 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
|
3
|
-
for drift.
|
4
|
-
"""
|
5
|
-
|
6
|
-
from dataeval._internal.detectors.drift.base import LastSeenUpdate, ReservoirSamplingUpdate
|
7
|
-
|
8
|
-
__all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
|
@@ -1,9 +0,0 @@
|
|
1
|
-
from dataeval import _IS_TENSORFLOW_AVAILABLE
|
2
|
-
from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
|
3
|
-
from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
|
4
|
-
from dataeval._internal.models.tensorflow.utils import create_model
|
5
|
-
|
6
|
-
__all__ = []
|
7
|
-
|
8
|
-
if _IS_TENSORFLOW_AVAILABLE:
|
9
|
-
__all__ += ["create_model", "AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"]
|
@@ -1,12 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Provide access to common Torch datasets used for computer vision
|
3
|
-
"""
|
4
|
-
|
5
|
-
from dataeval import _IS_TORCHVISION_AVAILABLE
|
6
|
-
|
7
|
-
__all__ = []
|
8
|
-
|
9
|
-
if _IS_TORCHVISION_AVAILABLE:
|
10
|
-
from dataeval._internal.datasets import CIFAR10, MNIST, VOCDetection
|
11
|
-
|
12
|
-
__all__ += ["CIFAR10", "MNIST", "VOCDetection"]
|