dataeval 0.72.1__py3-none-any.whl → 0.72.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +9 -10
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
  16. dataeval/detectors/ood/__init__.py +6 -6
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +7 -7
  18. dataeval/{_internal/detectors → detectors}/ood/aegmm.py +9 -29
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +24 -18
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +24 -20
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +10 -12
  25. dataeval/{_internal/detectors → detectors}/ood/vaegmm.py +10 -32
  26. dataeval/{_internal/interop.py → interop.py} +12 -7
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -9
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +6 -4
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +48 -14
  32. dataeval/metrics/bias/metadata.py +275 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +12 -10
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +7 -3
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/shared.py +151 -0
  51. dataeval/{_internal → utils}/split_dataset.py +98 -33
  52. dataeval/utils/tensorflow/__init__.py +7 -6
  53. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/autoencoder.py +60 -64
  54. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +9 -8
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/pixelcnn.py +16 -20
  56. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +3 -1
  57. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +17 -17
  58. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  59. dataeval/utils/torch/__init__.py +7 -3
  60. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  61. dataeval/{_internal → utils/torch}/datasets.py +48 -42
  62. dataeval/utils/torch/models.py +138 -0
  63. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
  64. dataeval/{_internal → utils/torch}/utils.py +3 -1
  65. dataeval/workflows/__init__.py +1 -1
  66. dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
  67. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/METADATA +2 -1
  68. dataeval-0.72.2.dist-info/RECORD +72 -0
  69. dataeval/_internal/detectors/__init__.py +0 -0
  70. dataeval/_internal/detectors/drift/__init__.py +0 -0
  71. dataeval/_internal/detectors/ood/__init__.py +0 -0
  72. dataeval/_internal/metrics/__init__.py +0 -0
  73. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  74. dataeval/_internal/metrics/utils.py +0 -447
  75. dataeval/_internal/models/__init__.py +0 -0
  76. dataeval/_internal/models/pytorch/__init__.py +0 -0
  77. dataeval/_internal/models/pytorch/utils.py +0 -67
  78. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  79. dataeval/_internal/workflows/__init__.py +0 -0
  80. dataeval/detectors/drift/kernels/__init__.py +0 -10
  81. dataeval/detectors/drift/updates/__init__.py +0 -8
  82. dataeval/utils/tensorflow/models/__init__.py +0 -9
  83. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  84. dataeval/utils/torch/datasets/__init__.py +0 -12
  85. dataeval/utils/torch/models/__init__.py +0 -11
  86. dataeval/utils/torch/trainer/__init__.py +0 -7
  87. dataeval-0.72.1.dist-info/RECORD +0 -81
  88. /dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +0 -0
  89. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/LICENSE.txt +0 -0
  90. {dataeval-0.72.1.dist-info → dataeval-0.72.2.dist-info}/WHEEL +0 -0
@@ -1,75 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from typing import Iterable
5
-
6
- from numpy.typing import ArrayLike
7
-
8
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
9
- from dataeval._internal.metrics.utils import pchash, xxhash
10
- from dataeval._internal.output import set_metadata
11
-
12
-
13
- @dataclass(frozen=True)
14
- class HashStatsOutput(BaseStatsOutput):
15
- """
16
- Output class for :func:`hashstats` stats metric
17
-
18
- Attributes
19
- ----------
20
- xxhash : List[str]
21
- xxHash hash of the images as a hex string
22
- pchash : List[str]
23
- :term:`Perception-based Hash` of the images as a hex string
24
- """
25
-
26
- xxhash: list[str]
27
- pchash: list[str]
28
-
29
-
30
- class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
31
- output_class = HashStatsOutput
32
- image_function_map = {
33
- "xxhash": lambda x: xxhash(x.image),
34
- "pchash": lambda x: pchash(x.image),
35
- }
36
-
37
-
38
- @set_metadata("dataeval.metrics")
39
- def hashstats(
40
- images: Iterable[ArrayLike],
41
- bboxes: Iterable[ArrayLike] | None = None,
42
- ) -> HashStatsOutput:
43
- """
44
- Calculates hashes for each image
45
-
46
- This function computes hashes from the images including exact hashes and perception-based
47
- hashes. These hash values can be used to determine if images are exact or near matches.
48
-
49
- Parameters
50
- ----------
51
- images : Iterable[ArrayLike]
52
- Images to hash
53
- bboxes : Iterable[ArrayLike] or None
54
- Bounding boxes in `xyxy` format for each image
55
-
56
- Returns
57
- -------
58
- HashStatsOutput
59
- A dictionary-like object containing the computed hashes for each image.
60
-
61
- See Also
62
- --------
63
- :term:`Duplicates`
64
-
65
- Examples
66
- --------
67
- Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)
68
-
69
- >>> results = hashstats(images)
70
- >>> print(results.xxhash)
71
- ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
72
- >>> print(results.pchash)
73
- ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
74
- """
75
- return run_stats(images, bboxes, False, [HashStatsProcessor])[0]
@@ -1,447 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, Callable, Literal, Mapping, NamedTuple
4
-
5
- import numpy as np
6
- import xxhash as xxh
7
- from numpy.typing import ArrayLike, NDArray
8
- from PIL import Image
9
- from scipy.fftpack import dct
10
- from scipy.signal import convolve2d
11
- from scipy.sparse import csr_matrix
12
- from scipy.sparse.csgraph import minimum_spanning_tree as mst
13
- from scipy.spatial.distance import pdist, squareform
14
- from scipy.stats import entropy as sp_entropy
15
- from sklearn.neighbors import NearestNeighbors
16
-
17
- from dataeval._internal.interop import to_numpy
18
-
19
- EPSILON = 1e-5
20
- EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
21
- BIT_DEPTH = (1, 8, 12, 16, 32)
22
- HASH_SIZE = 8
23
- MAX_FACTOR = 4
24
-
25
-
26
- def get_method(method_map: dict[str, Callable], method: str) -> Callable:
27
- if method not in method_map:
28
- raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
29
- return method_map[method]
30
-
31
-
32
- def get_counts(
33
- data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
34
- ) -> tuple[dict, dict]:
35
- """
36
- Initialize dictionary of histogram counts --- treat categorical values
37
- as histogram bins.
38
-
39
- Parameters
40
- ----------
41
- subset_mask: NDArray[np.bool_] | None
42
- Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
43
-
44
- Returns
45
- -------
46
- counts: Dict
47
- histogram counts per metadata factor in `names`. Each
48
- factor will have a different number of bins. Counts get reused
49
- across metrics, so hist_counts are cached but only if computed
50
- globally, i.e. without masked samples.
51
- """
52
-
53
- hist_counts, hist_bins = {}, {}
54
- # np.where needed to satisfy linter
55
- mask = np.where(subset_mask if subset_mask is not None else np.ones(data.shape[0], dtype=bool))
56
-
57
- for cdx, fn in enumerate(names):
58
- # linter doesn't like double indexing
59
- col_data = data[mask, cdx].squeeze()
60
- if is_categorical[cdx]:
61
- # if discrete, use unique values as bins
62
- bins, cnts = np.unique(col_data, return_counts=True)
63
- else:
64
- bins = hist_bins.get(fn, "auto")
65
- cnts, bins = np.histogram(col_data, bins=bins, density=True)
66
-
67
- hist_counts[fn] = cnts
68
- hist_bins[fn] = bins
69
-
70
- return hist_counts, hist_bins
71
-
72
-
73
- def entropy(
74
- data: NDArray,
75
- names: list[str],
76
- is_categorical: list[bool],
77
- normalized: bool = False,
78
- subset_mask: NDArray[np.bool_] | None = None,
79
- ) -> NDArray[np.float64]:
80
- """
81
- Meant for use with :term:`bias<Bias>` metrics, :term:`balance<Balance>`, :term:`diversity<Diversity>`,
82
- ClasswiseBalance, and Classwise Diversity.
83
-
84
- Compute entropy for discrete/categorical variables and for continuous variables through standard
85
- histogram binning.
86
-
87
- Parameters
88
- ----------
89
- normalized: bool
90
- Flag that determines whether or not to normalize entropy by log(num_bins)
91
- subset_mask: NDArray[np.bool_] | None
92
- Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
93
-
94
- Note
95
- ----
96
- For continuous variables, histogram bins are chosen automatically. See
97
- numpy.histogram for details.
98
-
99
- Returns
100
- -------
101
- ent: NDArray[np.float64]
102
- Entropy estimate per column of `data`
103
-
104
- See Also
105
- --------
106
- numpy.histogram
107
- scipy.stats.entropy
108
- """
109
-
110
- num_factors = len(names)
111
- hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
112
-
113
- ev_index = np.empty(num_factors)
114
- for col, cnts in enumerate(hist_counts.values()):
115
- # entropy in nats, normalizes counts
116
- ev_index[col] = sp_entropy(cnts)
117
- if normalized:
118
- if len(cnts) == 1:
119
- # log(0)
120
- ev_index[col] = 0
121
- else:
122
- ev_index[col] /= np.log(len(cnts))
123
- return ev_index
124
-
125
-
126
- def get_num_bins(
127
- data: NDArray, names: list[str], is_categorical: list[bool], subset_mask: NDArray[np.bool_] | None = None
128
- ) -> NDArray[np.float64]:
129
- """
130
- Number of bins or unique values for each metadata factor, used to
131
- normalize entropy/:term:`diversity<Diversity>`.
132
-
133
- Parameters
134
- ----------
135
- subset_mask: NDArray[np.bool_] | None
136
- Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
137
-
138
- Returns
139
- -------
140
- NDArray[np.float64]
141
- """
142
- # likely cached
143
- hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
144
- num_bins = np.empty(len(hist_counts))
145
- for idx, cnts in enumerate(hist_counts.values()):
146
- num_bins[idx] = len(cnts)
147
-
148
- return num_bins
149
-
150
-
151
- def infer_categorical(X: NDArray, threshold: float = 0.2) -> NDArray:
152
- """
153
- Compute fraction of feature values that are unique --- intended to be used
154
- for inferring whether variables are categorical.
155
- """
156
- if X.ndim == 1:
157
- X = np.expand_dims(X, axis=1)
158
- num_samples = X.shape[0]
159
- pct_unique = np.empty(X.shape[1])
160
- for col in range(X.shape[1]): # type: ignore
161
- uvals = np.unique(X[:, col], axis=0)
162
- pct_unique[col] = len(uvals) / num_samples
163
- return pct_unique < threshold
164
-
165
-
166
- def preprocess_metadata(
167
- class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], cat_thresh: float = 0.2
168
- ) -> tuple[NDArray, list[str], list[bool]]:
169
- # convert class_labels and dict of lists to matrix of metadata values
170
- preprocessed_metadata = {"class_label": np.asarray(class_labels, dtype=int)}
171
-
172
- # map columns of dict that are not numeric (e.g. string) to numeric values
173
- # that mutual information and diversity functions can accommodate. Each
174
- # unique string receives a unique integer value.
175
- for k, v in metadata.items():
176
- # if not numeric
177
- v = to_numpy(v)
178
- if not np.issubdtype(v.dtype, np.number):
179
- _, mapped_vals = np.unique(v, return_inverse=True)
180
- preprocessed_metadata[k] = mapped_vals
181
- else:
182
- preprocessed_metadata[k] = v
183
-
184
- data = np.stack(list(preprocessed_metadata.values()), axis=-1)
185
- names = list(preprocessed_metadata.keys())
186
- is_categorical = [infer_categorical(preprocessed_metadata[var], cat_thresh)[0] for var in names]
187
-
188
- return data, names, is_categorical
189
-
190
-
191
- def flatten(X: NDArray):
192
- """
193
- Flattens input array from (N, ... ) to (N, -1) where all samples N have all data in their last dimension
194
-
195
- Parameters
196
- ----------
197
- X : NDArray, shape - (N, ... )
198
- Input array
199
-
200
- Returns
201
- -------
202
- NDArray, shape - (N, -1)
203
- """
204
-
205
- return X.reshape((X.shape[0], -1))
206
-
207
-
208
- def minimum_spanning_tree(X: NDArray) -> Any:
209
- """
210
- Returns the minimum spanning tree from a :term:`NumPy` image array.
211
-
212
- Parameters
213
- ----------
214
- X : NDArray
215
- NumPy image array
216
-
217
- Returns
218
- -------
219
- Data representing the minimum spanning tree
220
- """
221
- # All features belong on second dimension
222
- X = flatten(X)
223
- # We add a small constant to the distance matrix to ensure scipy interprets
224
- # the input graph as fully-connected.
225
- dense_eudist = squareform(pdist(X)) + EPSILON
226
- eudist_csr = csr_matrix(dense_eudist)
227
- return mst(eudist_csr)
228
-
229
-
230
- def get_classes_counts(labels: NDArray) -> tuple[int, int]:
231
- """
232
- Returns the number of unique classes and the total number of labels from an array of labels
233
-
234
- Parameters
235
- ----------
236
- labels : NDArray
237
- :term:`NumPy` labels array
238
-
239
- Returns
240
- -------
241
- Classes and counts
242
-
243
- Raises
244
- ------
245
- ValueError
246
- If the number of unique classes is less than 2
247
- """
248
- classes, counts = np.unique(labels, return_counts=True)
249
- M = len(classes)
250
- if M < 2:
251
- raise ValueError("Label vector contains less than 2 classes!")
252
- N = np.sum(counts).astype(int)
253
- return M, N
254
-
255
-
256
- def compute_neighbors(
257
- A: NDArray,
258
- B: NDArray,
259
- k: int = 1,
260
- algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
261
- ) -> NDArray:
262
- """
263
- For each sample in A, compute the nearest neighbor in B
264
-
265
- Parameters
266
- ----------
267
- A, B : NDArray
268
- Sample arrays; each is flattened to shape (n_samples, n_features) before the neighbor search
269
- k : int
270
- The number of neighbors to find
271
- algorithm : Literal
272
- Tree method for nearest neighbor (auto, ball_tree or kd_tree)
273
-
274
- Note
275
- ----
276
- Do not use kd_tree if n_features > 20
277
-
278
- Returns
279
- -------
280
- NDArray
281
- Indices of the neighbors in B found for each sample in A
282
-
283
- Raises
284
- ------
285
- ValueError
286
- If algorithm is not "auto", "ball_tree", or "kd_tree"
287
-
288
- See Also
289
- --------
290
- sklearn.neighbors.NearestNeighbors
291
- """
292
-
293
- if k < 1:
294
- raise ValueError("k must be >= 1")
295
- if algorithm not in ["auto", "ball_tree", "kd_tree"]:
296
- raise ValueError("Algorithm must be 'auto', 'ball_tree', or 'kd_tree'")
297
-
298
- A = flatten(A)
299
- B = flatten(B)
300
-
301
- nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
302
- nns = nbrs.kneighbors(A)[1]
303
- nns = nns[:, 1:].squeeze()
304
-
305
- return nns
306
-
307
-
308
- class BitDepth(NamedTuple):
309
- depth: int
310
- pmin: float | int
311
- pmax: float | int
312
-
313
-
314
- def get_bitdepth(image: NDArray) -> BitDepth:
315
- """
316
- Approximates the bit depth of the image using the
317
- min and max pixel values.
318
- """
319
- pmin, pmax = np.min(image), np.max(image)
320
- if pmin < 0:
321
- return BitDepth(0, pmin, pmax)
322
- else:
323
- depth = ([x for x in BIT_DEPTH if 2**x > pmax] or [max(BIT_DEPTH)])[0]
324
- return BitDepth(depth, 0, 2**depth - 1)
325
-
326
-
327
- def rescale(image: NDArray, depth: int = 1) -> NDArray:
328
- """
329
- Rescales the image using the bit depth provided.
330
- """
331
- bitdepth = get_bitdepth(image)
332
- if bitdepth.depth == depth:
333
- return image
334
- else:
335
- normalized = (image + bitdepth.pmin) / (bitdepth.pmax - bitdepth.pmin)
336
- return normalized * (2**depth - 1)
337
-
338
-
339
- def normalize_image_shape(image: NDArray) -> NDArray:
340
- """
341
- Normalizes the image shape into (C,H,W).
342
- """
343
- ndim = image.ndim
344
- if ndim == 2:
345
- return np.expand_dims(image, axis=0)
346
- elif ndim == 3:
347
- return image
348
- elif ndim > 3:
349
- # Slice all but the last 3 dimensions
350
- return image[(0,) * (ndim - 3)]
351
- else:
352
- raise ValueError("Images must have 2 or more dimensions.")
353
-
354
-
355
- def normalize_box_shape(bounding_box: NDArray) -> NDArray:
356
- """
357
- Normalizes the bounding box shape into (N,4).
358
- """
359
- ndim = bounding_box.ndim
360
- if ndim == 1:
361
- return np.expand_dims(bounding_box, axis=0)
362
- elif ndim > 2:
363
- raise ValueError("Bounding boxes must have 2 dimensions: (# of boxes in an image, [X,Y,W,H]) -> (N,4)")
364
- else:
365
- return bounding_box
366
-
367
-
368
- def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
369
- """
370
- Returns the image filtered using a 3x3 edge detection kernel:
371
- [[ -1, -1, -1 ],
372
- [ -1, 8, -1 ],
373
- [ -1, -1, -1 ]]
374
- """
375
- edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
376
- np.clip(edges, 0, 255, edges)
377
- return edges
378
-
379
-
380
- def pchash(image: NDArray) -> str:
381
- """
382
- Performs a perceptual hash on an image by resizing to a square NxN image
383
- using the Lanczos algorithm where N is 32 or the largest multiple of
384
- 8 that is smaller than the input image dimensions. The resampled image
385
- is compressed using a discrete cosine transform and the lowest frequency
386
- component is encoded as a bit array of greater or less than median value
387
- and returned as a hex string.
388
-
389
- Parameters
390
- ----------
391
- image : NDArray
392
- An image as a :term:`NumPy` array in CxHxW format
393
-
394
- Returns
395
- -------
396
- str
397
- The hex string hash of the image using perceptual hashing
398
- """
399
- # Verify that the image is at least larger than an 8x8 image
400
- min_dim = min(image.shape[-2:])
401
- if min_dim < HASH_SIZE + 1:
402
- raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
403
-
404
- # Calculates the dimensions of the resized square image
405
- resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
406
-
407
- # Normalizes the image to CxHxW and takes the mean over all the channels
408
- normalized = np.mean(normalize_image_shape(image), axis=0).squeeze()
409
-
410
- # Rescales the pixel values to an 8-bit 0-255 image
411
- rescaled = rescale(normalized, 8).astype(np.uint8)
412
-
413
- # Resizes the image using the Lanczos algorithm to a square image
414
- im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
415
-
416
- # Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
417
- transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
418
-
419
- # Encodes the transform as a bit array over the median value
420
- diff = transform > np.median(transform)
421
-
422
- # Pads the front of the bit array to a multiple of 8 with False
423
- padded = np.full(int(np.ceil(diff.size / 8) * 8), False)
424
- padded[-diff.size :] = diff.ravel()
425
-
426
- # Converts the bit array to a hex string and strips leading 0s
427
- hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
428
- return hash_hex if hash_hex else "0"
429
-
430
-
431
- def xxhash(image: NDArray) -> str:
432
- """
433
- Performs a fast non-cryptographic hash using the xxhash algorithm
434
- (xxhash.com) against the image as a flattened bytearray. The hash
435
- is returned as a hex string.
436
-
437
- Parameters
438
- ----------
439
- image : NDArray
440
- An image as a :term:`NumPy` array
441
-
442
- Returns
443
- -------
444
- str
445
- The hex string hash of the image using the xxHash algorithm
446
- """
447
- return xxh.xxh3_64_hexdigest(image.ravel().tobytes())
File without changes
File without changes
@@ -1,67 +0,0 @@
1
- from numpy import float32, ndarray
2
- from torch import Tensor, from_numpy
3
-
4
-
5
- def torch_to_numpy(tensor: Tensor) -> ndarray:
6
- """
7
- Converts a PyTorch tensor to a NumPy array
8
- """
9
- if isinstance(tensor, ndarray): # Already array, return
10
- return tensor
11
- if not isinstance(tensor, Tensor):
12
- raise TypeError("Tensor is not of type Tensor")
13
-
14
- x: ndarray = tensor.detach().cpu().numpy()
15
- return x
16
-
17
-
18
- def numpy_to_torch(array: ndarray) -> Tensor:
19
- """
20
- Converts a :term:`NumPy` array to a PyTorch tensor
21
- """
22
- if isinstance(array, Tensor): # Already tensor, return
23
- return array
24
- if not isinstance(array, ndarray):
25
- raise TypeError("Array is not of type numpy.ndarray")
26
- x: Tensor = from_numpy(array.astype(float32))
27
- return x
28
-
29
-
30
- def permute_to_torch(array: ndarray) -> Tensor:
31
- """
32
- Converts and permutes a :term:`NumPy` image array into a PyTorch image tensor.
33
-
34
- Parameters
35
- ----------
36
- array: ndarray
37
- Array containing image data in the format NHWC
38
-
39
- Returns
40
- -------
41
- Tensor
42
- Tensor containing image data in the format NCHW
43
- """
44
- x = numpy_to_torch(array)
45
- x = x.permute(0, 3, 1, 2) # NHWC -> NCHW
46
- return x
47
-
48
-
49
- def permute_to_numpy(tensor: Tensor) -> ndarray:
50
- """
51
- Converts and permutes a PyTorch image tensor into a :term:`NumPy` image array.
52
-
53
- Does not permute if given ndarray
54
-
55
- Parameters
56
- ----------
57
- tensor: Tensor
58
- Tensor containing image data in the format NCHW
59
-
60
- Returns
61
- -------
62
- ndarray
63
- Array containing image data in the format NHWC
64
- """
65
- x = tensor.permute(0, 2, 3, 1)
66
- x = torch_to_numpy(x) # NCHW -> NHWC
67
- return x
File without changes
File without changes
@@ -1,10 +0,0 @@
1
- """
2
- Kernels are used to map non-linear data to a higher dimensional space.
3
- """
4
-
5
- from dataeval import _IS_TORCH_AVAILABLE
6
-
7
- if _IS_TORCH_AVAILABLE: # pragma: no cover
8
- from dataeval._internal.detectors.drift.torch import GaussianRBF
9
-
10
- __all__ = ["GaussianRBF"]
@@ -1,8 +0,0 @@
1
- """
2
- Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
3
- for drift.
4
- """
5
-
6
- from dataeval._internal.detectors.drift.base import LastSeenUpdate, ReservoirSamplingUpdate
7
-
8
- __all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
@@ -1,9 +0,0 @@
1
- from dataeval import _IS_TENSORFLOW_AVAILABLE
2
- from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
3
- from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
4
- from dataeval._internal.models.tensorflow.utils import create_model
5
-
6
- __all__ = []
7
-
8
- if _IS_TENSORFLOW_AVAILABLE:
9
- __all__ += ["create_model", "AE", "AEGMM", "PixelCNN", "VAE", "VAEGMM"]
@@ -1,3 +0,0 @@
1
- from dataeval._internal.models.tensorflow.autoencoder import eucl_cosim_features
2
-
3
- __all__ = ["eucl_cosim_features"]
@@ -1,12 +0,0 @@
1
- """
2
- Provide access to common Torch datasets used for computer vision
3
- """
4
-
5
- from dataeval import _IS_TORCHVISION_AVAILABLE
6
-
7
- __all__ = []
8
-
9
- if _IS_TORCHVISION_AVAILABLE:
10
- from dataeval._internal.datasets import CIFAR10, MNIST, VOCDetection
11
-
12
- __all__ += ["CIFAR10", "MNIST", "VOCDetection"]
@@ -1,11 +0,0 @@
1
- from dataeval import _IS_TORCH_AVAILABLE
2
- from dataeval._internal.models.pytorch.autoencoder import (
3
- AriaAutoencoder,
4
- Decoder,
5
- Encoder,
6
- )
7
-
8
- __all__ = []
9
-
10
- if _IS_TORCH_AVAILABLE:
11
- __all__ += ["AriaAutoencoder", "Decoder", "Encoder"]
@@ -1,7 +0,0 @@
1
- from dataeval import _IS_TORCH_AVAILABLE
2
- from dataeval._internal.models.pytorch.autoencoder import AETrainer
3
-
4
- __all__ = []
5
-
6
- if _IS_TORCH_AVAILABLE:
7
- __all__ += ["AETrainer"]