dataeval 0.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +18 -0
  2. dataeval/_internal/detectors/__init__.py +0 -0
  3. dataeval/_internal/detectors/clusterer.py +469 -0
  4. dataeval/_internal/detectors/drift/__init__.py +0 -0
  5. dataeval/_internal/detectors/drift/base.py +265 -0
  6. dataeval/_internal/detectors/drift/cvm.py +97 -0
  7. dataeval/_internal/detectors/drift/ks.py +100 -0
  8. dataeval/_internal/detectors/drift/mmd.py +166 -0
  9. dataeval/_internal/detectors/drift/torch.py +310 -0
  10. dataeval/_internal/detectors/drift/uncertainty.py +149 -0
  11. dataeval/_internal/detectors/duplicates.py +49 -0
  12. dataeval/_internal/detectors/linter.py +78 -0
  13. dataeval/_internal/detectors/ood/__init__.py +0 -0
  14. dataeval/_internal/detectors/ood/ae.py +77 -0
  15. dataeval/_internal/detectors/ood/aegmm.py +69 -0
  16. dataeval/_internal/detectors/ood/base.py +199 -0
  17. dataeval/_internal/detectors/ood/llr.py +284 -0
  18. dataeval/_internal/detectors/ood/vae.py +86 -0
  19. dataeval/_internal/detectors/ood/vaegmm.py +79 -0
  20. dataeval/_internal/flags.py +47 -0
  21. dataeval/_internal/metrics/__init__.py +0 -0
  22. dataeval/_internal/metrics/base.py +92 -0
  23. dataeval/_internal/metrics/ber.py +124 -0
  24. dataeval/_internal/metrics/coverage.py +80 -0
  25. dataeval/_internal/metrics/divergence.py +94 -0
  26. dataeval/_internal/metrics/hash.py +79 -0
  27. dataeval/_internal/metrics/parity.py +180 -0
  28. dataeval/_internal/metrics/stats.py +332 -0
  29. dataeval/_internal/metrics/uap.py +45 -0
  30. dataeval/_internal/metrics/utils.py +158 -0
  31. dataeval/_internal/models/__init__.py +0 -0
  32. dataeval/_internal/models/pytorch/__init__.py +0 -0
  33. dataeval/_internal/models/pytorch/autoencoder.py +202 -0
  34. dataeval/_internal/models/pytorch/blocks.py +46 -0
  35. dataeval/_internal/models/pytorch/utils.py +67 -0
  36. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  37. dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
  38. dataeval/_internal/models/tensorflow/gmm.py +115 -0
  39. dataeval/_internal/models/tensorflow/losses.py +107 -0
  40. dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
  41. dataeval/_internal/models/tensorflow/trainer.py +102 -0
  42. dataeval/_internal/models/tensorflow/utils.py +254 -0
  43. dataeval/_internal/workflows/sufficiency.py +555 -0
  44. dataeval/detectors/__init__.py +29 -0
  45. dataeval/flags/__init__.py +3 -0
  46. dataeval/metrics/__init__.py +7 -0
  47. dataeval/models/__init__.py +15 -0
  48. dataeval/models/tensorflow/__init__.py +6 -0
  49. dataeval/models/torch/__init__.py +8 -0
  50. dataeval/py.typed +0 -0
  51. dataeval/workflows/__init__.py +8 -0
  52. dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
  53. dataeval-0.61.0.dist-info/METADATA +114 -0
  54. dataeval-0.61.0.dist-info/RECORD +55 -0
  55. dataeval-0.61.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,79 @@
+ """
+ Source code derived from Alibi-Detect 0.11.4
+ https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
+
+ Original code Copyright (c) 2023 Seldon Technologies Ltd
+ Licensed under Apache Software License (Apache 2.0)
+ """
+
+ from typing import Callable
+
+ import keras
+ import numpy as np
+
+ from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
+ from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
+ from dataeval._internal.models.tensorflow.gmm import gmm_energy
+ from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
+ from dataeval._internal.models.tensorflow.utils import predict_batch
+
+
+ class OOD_VAEGMM(OODGMMBase):
+     def __init__(self, model: VAEGMM, samples: int = 10) -> None:
+         """
+         VAE with Gaussian Mixture Model based outlier detector.
+
+         Parameters
+         ----------
+         model : VAEGMM
+             A VAEGMM model.
+         samples : int, default 10
+             Number of samples drawn to evaluate each instance.
+         """
+         super().__init__(model)
+         self.samples = samples
+
+     def fit(
+         self,
+         x_ref: np.ndarray,
+         threshold_perc: float = 100.0,
+         loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
+         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
+         epochs: int = 20,
+         batch_size: int = 64,
+         verbose: bool = True,
+     ) -> None:
+         """
+         Train the VAEGMM model with the recommended loss function and optimizer.
+
+         Parameters
+         ----------
+         x_ref : np.ndarray
+             Training batch.
+         threshold_perc : float, default 100.0
+             Percentage of reference data that is normal.
+         loss_fn : Callable, default LossGMM(elbo=Elbo(0.05))
+             Loss function used for training.
+         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
+             Optimizer used for training.
+         epochs : int, default 20
+             Number of training epochs.
+         batch_size : int, default 64
+             Batch size used for training.
+         verbose : bool, default True
+             Whether to print training progress.
+         """
+         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+     def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
+         self._validate(X)
+
+         # Draw multiple samples from the latent space for each instance
+         X_samples = np.repeat(X, self.samples, axis=0)
+         _, z, _ = predict_batch(X_samples, self.model, batch_size=batch_size)
+
+         # Compute the average GMM energy across the samples for each instance
+         energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
+         energy_samples = energy.numpy().reshape((-1, self.samples))  # type: ignore
+         iscore = np.mean(energy_samples, axis=-1)
+         return OODScore(iscore)
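A minimal usage sketch for the detector above, assuming an already-constructed VAEGMM model and float32 image arrays; the names `vaegmm_model`, `x_ref`, and `x_test` are illustrative, not part of this diff, and the import path follows the internal module shown here:

import numpy as np
from dataeval._internal.detectors.ood.vaegmm import OOD_VAEGMM

detector = OOD_VAEGMM(vaegmm_model, samples=10)      # wrap the VAEGMM architecture
detector.fit(x_ref, threshold_perc=95.0, epochs=20)  # fit on reference (in-distribution) data
scores = detector.score(x_test)                      # per-instance mean GMM energy; higher = more OOD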
@@ -0,0 +1,47 @@
+ from enum import Flag, auto
+ from typing import Union
+
+
+ class auto_all:  # descriptor returning the union of all members defined on the owner Flag class
+     def __get__(self, _, cls):
+         return ~cls(0)
+
+
+ class ImageHash(Flag):
+     XXHASH = auto()
+     PCHASH = auto()
+     ALL = auto_all()
+
+
+ class ImageProperty(Flag):
+     WIDTH = auto()
+     HEIGHT = auto()
+     SIZE = auto()
+     ASPECT_RATIO = auto()
+     CHANNELS = auto()
+     DEPTH = auto()
+     ALL = auto_all()
+
+
+ class ImageVisuals(Flag):
+     BRIGHTNESS = auto()
+     BLURRINESS = auto()
+     MISSING = auto()
+     ZERO = auto()
+     ALL = auto_all()
+
+
+ class ImageStatistics(Flag):
+     MEAN = auto()
+     STD = auto()
+     VAR = auto()
+     SKEW = auto()
+     KURTOSIS = auto()
+     ENTROPY = auto()
+     PERCENTILES = auto()
+     HISTOGRAM = auto()
+     ALL = auto_all()
+
+
+ ImageStatsFlags = Union[ImageHash, ImageProperty, ImageVisuals, ImageStatistics]
+ LinterFlags = Union[ImageProperty, ImageVisuals, ImageStatistics]
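A short sketch of how these flags compose; `ALL` resolves to the union of every member via the auto_all descriptor (`~cls(0)` inverts the empty flag into all defined members):

flags = ImageStatistics.MEAN | ImageStatistics.STD  # request multiple statistics at once
assert ImageStatistics.MEAN in ImageStatistics.ALL  # ALL contains every member
assert (flags & ImageStatistics.ALL) == flags       # masking against ALL is a no-op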
@@ -0,0 +1,92 @@
+ from abc import ABC, abstractmethod
+ from typing import Callable, Dict, Generic, List, TypeVar
+
+ TOutput = TypeVar("TOutput", bound=dict)
+ TMethods = TypeVar("TMethods")
+ TCallable = TypeVar("TCallable", bound=Callable)
+
+
+ class MetricMixin(ABC, Generic[TOutput]):
+     @abstractmethod
+     def update(self, preds, targets): ...
+
+     @abstractmethod
+     def compute(self) -> TOutput: ...
+
+     @abstractmethod
+     def reset(self): ...
+
+
+ class EvaluateMixin(ABC, Generic[TOutput]):
+     @abstractmethod
+     def evaluate(self) -> TOutput:
+         """Abstract method to calculate metric based off of constructor parameters"""
+
+
+ class MethodsMixin(ABC, Generic[TMethods, TCallable]):
+     """
+     Use this mixin to define a mapping of functions to method names which
+     can be queried by the user and called internally with the appropriate
+     method name as the key.
+
+     Explicitly defining the Callable generic helps with type safety and
+     hinting for function signatures and is recommended but optional.
+
+     e.g.:
+
+     def _mult(x: float, y: float) -> float:
+         return x * y
+
+     class MyMetric(MethodsMixin[str, Callable[[float, float], float]]):
+
+         @classmethod
+         def _methods(cls) -> Dict[str, Callable[[float, float], float]]:
+             return {
+                 "ADD": lambda x, y: x + y,
+                 "MULT": _mult,
+                 ...
+             }
+
+     Then during evaluate, you can call the method specified with the getter.
+
+     e.g.:
+
+     def evaluate(self):
+         return self._method(x, y)
+
+     The resulting class can be used like so.
+
+     m = MyMetric(1.0, 2.0, "ADD")
+     m.evaluate()        # returns 3.0
+     m.method            # returns "ADD"
+     MyMetric.methods()  # returns ['ADD', 'MULT']
+     m.method = "MULT"
+     m.evaluate()        # returns 2.0
+     """
+
+     @classmethod
+     @abstractmethod
+     def _methods(cls) -> Dict[str, TCallable]:
+         """Abstract method returning available method functions for class"""
+
+     @property
+     def _method(self) -> TCallable:
+         return self._methods()[self.method]
+
+     @classmethod
+     def methods(cls) -> List[str]:
+         return list(cls._methods().keys())
+
+     @property
+     def method(self) -> str:
+         return self._method_key
+
+     @method.setter
+     def method(self, value: TMethods):
+         self._set_method(value)
+
+     def _set_method(self, value: TMethods):
+         """This setter works around pyright incorrectly flagging the
+         'method' property as being improperly overridden"""
+         if value not in self.methods():
+             raise KeyError(f"Specified method not available for class ({self.methods()}).")
+         self._method_key = value
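A runnable sketch of a concrete MethodsMixin subclass, following the docstring example above; the class, type aliases, and method names are illustrative and not part of this diff:

from typing import Callable, Dict, Literal

_MyMethods = Literal["ADD", "MULT"]
_MyFunction = Callable[[float, float], float]


class MyMetric(MethodsMixin[_MyMethods, _MyFunction]):
    def __init__(self, x: float, y: float, method: _MyMethods = "ADD") -> None:
        self.x = x
        self.y = y
        self._set_method(method)  # validates against _methods() keys

    @classmethod
    def _methods(cls) -> Dict[str, _MyFunction]:
        return {"ADD": lambda x, y: x + y, "MULT": lambda x, y: x * y}

    def evaluate(self) -> float:
        return self._method(self.x, self.y)  # dispatch via the _method property


m = MyMetric(1.0, 2.0)
assert m.evaluate() == 3.0
m.method = "MULT"
assert m.evaluate() == 2.0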
@@ -0,0 +1,124 @@
+ """
+ This module contains the implementation of the
+ FR Test Statistic based estimate and the
+ KNN based estimate for the Bayes Error Rate
+
+ Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
+ https://arxiv.org/abs/1811.06419
+ """
+
+ from typing import Callable, Dict, Literal, Tuple
+
+ import numpy as np
+ from maite.protocols import ArrayLike
+ from scipy.sparse import coo_matrix
+ from scipy.stats import mode
+
+ from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+
+ from .utils import compute_neighbors, get_classes_counts, minimum_spanning_tree
+
+
+ def _mst(X: np.ndarray, y: np.ndarray, _: int) -> Tuple[float, float]:
+     M, N = get_classes_counts(y)
+
+     tree = coo_matrix(minimum_spanning_tree(X))
+     matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+     deltas = matches / (2 * N)
+     upper = 2 * deltas
+     lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+     return upper, lower
+
+
+ def _knn(X: np.ndarray, y: np.ndarray, k: int) -> Tuple[float, float]:
+     M, N = get_classes_counts(y)
+
+     # All features belong on the second dimension
+     X = X.reshape((X.shape[0], -1))
+     nn_indices = compute_neighbors(X, X, k=k)
+     nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+     modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+     upper = float(np.count_nonzero(modal_class - y) / N)
+     lower = _knn_lowerbound(upper, M, k)
+     return upper, lower
+
+
+ def _knn_lowerbound(value: float, classes: int, k: int) -> float:
+     "Several cases for computing the BER lower bound"
+     if value <= 1e-10:
+         return 0.0
+
+     if classes == 2 and k != 1:
+         if k > 5:
+             # Property 2 (Devroye, 1981) cited in the Snoopy paper, not in the snoopy repo
+             alpha = 0.3399
+             beta = 0.9749
+             a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+             return value / (1 + a_k)
+         if k > 2:
+             return value / (1 + (1 / np.sqrt(k)))
+         # k == 2:
+         return value / 2
+
+     return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
+
+
+ _METHODS = Literal["MST", "KNN"]
+ _FUNCTION = Callable[[np.ndarray, np.ndarray, int], Tuple[float, float]]
+
+
+ class BER(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
+     """
+     An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis
+
+     Parameters
+     ----------
+     data : np.ndarray
+         Array of images or image embeddings
+     labels : np.ndarray
+         Array of labels for each image or image embedding
+     method : Literal["MST", "KNN"], default "KNN"
+         Method to use when estimating the Bayes error rate
+     k : int, default 1
+         Number of nearest neighbors for the KNN estimator -- ignored by the MST estimator
+
+     See Also
+     --------
+     `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+     """
+
+     def __init__(self, data: ArrayLike, labels: ArrayLike, method: _METHODS = "KNN", k: int = 1) -> None:
+         self.data = data
+         self.labels = labels
+         self.k = k
+         self._set_method(method)
+
+     @classmethod
+     def _methods(cls) -> Dict[str, _FUNCTION]:
+         return {"MST": _mst, "KNN": _knn}
+
+     def evaluate(self) -> Dict[str, float]:
+         """
+         Calculates the Bayes Error Rate estimate using the provided method
+
+         Returns
+         -------
+         Dict[str, float]
+             ber : float
+                 The estimated upper bound of the Bayes Error Rate
+             ber_lower : float
+                 The estimated lower bound of the Bayes Error Rate
+
+         Raises
+         ------
+         ValueError
+             If unique classes M < 2
+         """
+         data = np.asarray(self.data)
+         labels = np.asarray(self.labels)
+         upper, lower = self._method(data, labels, self.k)
+         return {"ber": upper, "ber_lower": lower}
@@ -0,0 +1,80 @@
+ import math
+ from typing import Literal, Tuple
+
+ import numpy as np
+ from scipy.spatial.distance import pdist, squareform
+
+
+ class Coverage:
+     """
+     Class for evaluating coverage and identifying images/samples that are in undercovered regions.
+
+     This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
+
+     Parameters
+     ----------
+     embeddings : np.ndarray
+         n x p array of image embeddings from the dataset.
+     radius_type : Literal["adaptive", "naive"], default "adaptive"
+         The function used to determine radius.
+     k : int, default 20
+         Number of observations required in order to be covered.
+     percent : np.float64, default np.float64(0.01)
+         Percent of observations to be considered uncovered. Only applies to adaptive radius.
+
+     Note
+     ----
+     Embeddings should be on the unit interval.
+     """
+
+     def __init__(
+         self,
+         embeddings: np.ndarray,
+         radius_type: Literal["adaptive", "naive"] = "adaptive",
+         k: int = 20,
+         percent: np.float64 = np.float64(0.01),
+     ):
+         self.embeddings = embeddings
+         self.radius_type = radius_type
+         self.k = k
+         self.percent = percent
+
+     def evaluate(self) -> Tuple[np.ndarray, np.ndarray]:
+         """
+         Identify the indices of uncovered observations and the critical value radius
+         (the distance to the (k+1)th nearest neighbor) for each observation.
+
+         Returns
+         -------
+         np.ndarray
+             Array of uncovered indices
+         np.ndarray
+             Array of critical value radii
+
+         Raises
+         ------
+         ValueError
+             If length of embeddings is less than or equal to k
+         ValueError
+             If radius_type is unknown
+         """
+
+         # Calculate the distance matrix and take the (k+1)th nearest neighbor distance for each image
+         n = len(self.embeddings)
+         if n <= self.k:
+             raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
+         mat = squareform(pdist(self.embeddings))
+         sorted_dists = np.sort(mat, axis=1)
+         crit = sorted_dists[:, self.k + 1]
+
+         d = np.shape(self.embeddings)[1]
+         if self.radius_type == "naive":
+             self.rho = (1 / math.sqrt(math.pi)) * ((2 * self.k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
+             pvals = np.where(crit > self.rho)[0]
+         elif self.radius_type == "adaptive":
+             # Use a data-adaptive cutoff: flag the `percent` fraction of observations with the largest radii
+             cutoff = int(n * self.percent)
+             pvals = np.argsort(crit)[::-1][:cutoff]
+         else:
+             raise ValueError("Invalid radius type.")
+         return pvals, crit
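A brief usage sketch with random unit-interval embeddings (names are illustrative):

import numpy as np

embeddings = np.random.rand(500, 8)  # unit-interval embeddings, n=500 > k=20 as required
cov = Coverage(embeddings, radius_type="adaptive", k=20, percent=np.float64(0.01))
uncovered, radii = cov.evaluate()    # indices of uncovered samples, critical value radii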
@@ -0,0 +1,94 @@
+ """
+ This module contains the implementation of HP Divergence
+ using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
+ """
+
+ from typing import Any, Callable, Dict, Literal
+
+ import numpy as np
+
+ from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+
+ from .utils import compute_neighbors, minimum_spanning_tree
+
+
+ def _mst(data: np.ndarray, labels: np.ndarray) -> int:
+     mst = minimum_spanning_tree(data).toarray()
+     edgelist = np.transpose(np.nonzero(mst))
+     errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
+     return errors
+
+
+ def _fnn(data: np.ndarray, labels: np.ndarray) -> int:
+     nn_indices = compute_neighbors(data, data)
+     errors = np.sum(np.abs(labels[nn_indices] - labels))
+     return errors
+
+
+ _METHODS = Literal["MST", "FNN"]
+ _FUNCTION = Callable[[np.ndarray, np.ndarray], int]
+
+
+ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
+     """
+     Calculates the estimated divergence between two datasets
+
+     Parameters
+     ----------
+     data_a : np.ndarray
+         Array of images or image embeddings to compare
+     data_b : np.ndarray
+         Array of images or image embeddings to compare
+     method : Literal["MST", "FNN"], default "MST"
+         Method used to estimate dataset divergence
+
+     See Also
+     --------
+     For more information about this divergence, its formal definition,
+     and its associated estimators see https://arxiv.org/abs/1412.6534.
+
+     Warning
+     -------
+     MST is very slow in this implementation, unlike the MATLAB implementation
+     where the two methods have comparable speeds; overall, MST takes ~25x longer.
+     Source of the slowdown: conversion to and from CSR format accounts for ~10%
+     of the time difference between the 1NN and scipy MST functions, and the
+     scipy MST function itself accounts for the remaining 90%.
+     """
+
+     def __init__(
+         self,
+         data_a: np.ndarray,
+         data_b: np.ndarray,
+         method: _METHODS = "MST",
+     ) -> None:
+         self.data_a = data_a
+         self.data_b = data_b
+         self._set_method(method)
+
+     @classmethod
+     def _methods(cls) -> Dict[str, _FUNCTION]:
+         return {"FNN": _fnn, "MST": _mst}
+
+     def evaluate(self) -> Dict[str, Any]:
+         """
+         Calculates the divergence and any errors between the datasets
+
+         Returns
+         -------
+         Dict[str, Any]
+             divergence : float
+                 Divergence value between 0.0 and 1.0
+             error : int
+                 The number of differing edges between the datasets
+         """
+         N = self.data_a.shape[0]
+         M = self.data_b.shape[0]
+
+         stacked_data = np.vstack((self.data_a, self.data_b))
+         labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
+
+         errors = self._method(stacked_data, labels)
+         dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
+         return {"divergence": dp, "error": errors}
@@ -0,0 +1,79 @@
+ import numpy as np
+ import xxhash as xxh
+ from PIL import Image
+ from scipy.fftpack import dct
+
+ from dataeval._internal.metrics.utils import normalize_image_shape, rescale
+
+ HASH_SIZE = 8
+ MAX_FACTOR = 4
+
+
+ def pchash(image: np.ndarray) -> str:
+     """
+     Performs a perceptual hash on an image by resizing to a square NxN image
+     using the Lanczos algorithm, where N is 32 or the largest multiple of 8
+     smaller than the input image's smallest dimension, whichever is less.
+     The resampled image is compressed using a discrete cosine transform and
+     the lowest frequency component is encoded as a bit array of values above
+     or below the median value and returned as a hex string.
+
+     Parameters
+     ----------
+     image : np.ndarray
+         An image as a numpy array in CxHxW format
+
+     Returns
+     -------
+     str
+         The hex string hash of the image using perceptual hashing
+     """
+     # Verify that the image is at least larger than an 8x8 image
+     min_dim = min(image.shape[-2:])
+     if min_dim < HASH_SIZE + 1:
+         raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+
+     # Calculates the dimensions of the resized square image
+     resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
+
+     # Normalizes the image to CxHxW and takes the mean over all the channels
+     normalized = np.mean(normalize_image_shape(image), axis=0).squeeze()
+
+     # Rescales the pixel values to an 8-bit 0-255 image
+     rescaled = rescale(normalized, 8).astype(np.uint8)
+
+     # Resizes the image using the Lanczos algorithm to a square image
+     im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
+
+     # Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
+     transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
+
+     # Encodes the transform as a bit array over the median value
+     diff = transform > np.median(transform)
+
+     # Pads the front of the bit array to a multiple of 8 with False
+     padded = np.full(int(np.ceil(diff.size / 8) * 8), False)
+     padded[-diff.size :] = diff.ravel()
+
+     # Converts the bit array to a hex string and strips leading 0s
+     hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
+     return hash_hex if hash_hex else "0"
+
+
+ def xxhash(image: np.ndarray) -> str:
+     """
+     Performs a fast non-cryptographic hash using the xxhash algorithm
+     (xxhash.com) against the image as a flattened bytearray. The hash
+     is returned as a hex string.
+
+     Parameters
+     ----------
+     image : np.ndarray
+         An image as a numpy array
+
+     Returns
+     -------
+     str
+         The hex string hash of the image using the xxHash algorithm
+     """
+     return xxh.xxh3_64_hexdigest(image.ravel().tobytes())
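A brief usage sketch of the two hash functions above on a random CxHxW image (names are illustrative):

import numpy as np

image = np.random.randint(0, 256, (3, 64, 64), dtype=np.uint8)  # CxHxW uint8 image
exact_hash = xxhash(image)  # fast hash for exact-duplicate detection
fuzzy_hash = pchash(image)  # perceptual hash for near-duplicate detection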