dataeval 0.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +18 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/clusterer.py +469 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/drift/base.py +265 -0
- dataeval/_internal/detectors/drift/cvm.py +97 -0
- dataeval/_internal/detectors/drift/ks.py +100 -0
- dataeval/_internal/detectors/drift/mmd.py +166 -0
- dataeval/_internal/detectors/drift/torch.py +310 -0
- dataeval/_internal/detectors/drift/uncertainty.py +149 -0
- dataeval/_internal/detectors/duplicates.py +49 -0
- dataeval/_internal/detectors/linter.py +78 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/ae.py +77 -0
- dataeval/_internal/detectors/ood/aegmm.py +69 -0
- dataeval/_internal/detectors/ood/base.py +199 -0
- dataeval/_internal/detectors/ood/llr.py +284 -0
- dataeval/_internal/detectors/ood/vae.py +86 -0
- dataeval/_internal/detectors/ood/vaegmm.py +79 -0
- dataeval/_internal/flags.py +47 -0
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/base.py +92 -0
- dataeval/_internal/metrics/ber.py +124 -0
- dataeval/_internal/metrics/coverage.py +80 -0
- dataeval/_internal/metrics/divergence.py +94 -0
- dataeval/_internal/metrics/hash.py +79 -0
- dataeval/_internal/metrics/parity.py +180 -0
- dataeval/_internal/metrics/stats.py +332 -0
- dataeval/_internal/metrics/uap.py +45 -0
- dataeval/_internal/metrics/utils.py +158 -0
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/autoencoder.py +202 -0
- dataeval/_internal/models/pytorch/blocks.py +46 -0
- dataeval/_internal/models/pytorch/utils.py +67 -0
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +317 -0
- dataeval/_internal/models/tensorflow/gmm.py +115 -0
- dataeval/_internal/models/tensorflow/losses.py +107 -0
- dataeval/_internal/models/tensorflow/pixelcnn.py +1106 -0
- dataeval/_internal/models/tensorflow/trainer.py +102 -0
- dataeval/_internal/models/tensorflow/utils.py +254 -0
- dataeval/_internal/workflows/sufficiency.py +555 -0
- dataeval/detectors/__init__.py +29 -0
- dataeval/flags/__init__.py +3 -0
- dataeval/metrics/__init__.py +7 -0
- dataeval/models/__init__.py +15 -0
- dataeval/models/tensorflow/__init__.py +6 -0
- dataeval/models/torch/__init__.py +8 -0
- dataeval/py.typed +0 -0
- dataeval/workflows/__init__.py +8 -0
- dataeval-0.61.0.dist-info/LICENSE.txt +21 -0
- dataeval-0.61.0.dist-info/METADATA +114 -0
- dataeval-0.61.0.dist-info/RECORD +55 -0
- dataeval-0.61.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,79 @@
|
|
1
|
+
"""
|
2
|
+
Source code derived from Alibi-Detect 0.11.4
|
3
|
+
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
+
|
5
|
+
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
+
Licensed under Apache Software License (Apache 2.0)
|
7
|
+
"""
|
8
|
+
|
9
|
+
from typing import Callable
|
10
|
+
|
11
|
+
import keras
|
12
|
+
import numpy as np
|
13
|
+
|
14
|
+
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
|
15
|
+
from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
|
16
|
+
from dataeval._internal.models.tensorflow.gmm import gmm_energy
|
17
|
+
from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
|
18
|
+
from dataeval._internal.models.tensorflow.utils import predict_batch
|
19
|
+
|
20
|
+
|
21
|
+
class OOD_VAEGMM(OODGMMBase):
    """
    Out-of-distribution detector that scores instances with a VAEGMM model.

    Each instance is repeated ``samples`` times, passed through the model,
    and the Gaussian-mixture energy of the resulting latent values is
    averaged per instance.
    """

    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
        """
        VAE with Gaussian Mixture Model based outlier detector.

        Parameters
        ----------
        model : VAEGMM
            A VAEGMM model.
        samples : int, default 10
            Number of samples drawn to evaluate each instance.
        """
        super().__init__(model)
        self.samples = samples

    def fit(
        self,
        x_ref: np.ndarray,
        threshold_perc: float = 100.0,
        # NOTE: this default loss object is created once, when the method is defined.
        loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
    ) -> None:
        """
        Train the VAEGMM model with the recommended loss function and optimizer.

        All arguments are forwarded positionally to ``OODGMMBase.fit``.

        Parameters
        ----------
        x_ref : np.ndarray
            Training (reference) data.
        threshold_perc : float, default 100.0
            Percentage of reference data that is normal.
        loss_fn : Callable, default LossGMM(elbo=Elbo(0.05))
            Loss function used for training.
        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
            Optimizer used for training.
        epochs : int, default 20
            Number of training epochs.
        batch_size : int, default 64
            Batch size used for training.
        verbose : bool, default True
            Whether to print training progress.
        """
        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

    def score(self, X: np.ndarray, batch_size: int = int(1e10)) -> OODScore:
        """
        Compute the per-instance score as the mean GMM energy over the samples.

        Parameters
        ----------
        X : np.ndarray
            Instances to score; validated with ``self._validate`` first.
        batch_size : int, default int(1e10)
            Batch size handed to ``predict_batch``.

        Returns
        -------
        OODScore
            Wraps one averaged energy value per input instance.
        """
        self._validate(X)

        # Repeat every instance `samples` times along the first axis
        repeated = np.repeat(X, self.samples, axis=0)
        # The model yields three outputs; only the second one is scored
        _first, latent, _third = predict_batch(repeated, self.model, batch_size=batch_size)

        # Energy per repeated row, regrouped so each row holds one instance's samples
        energy, _ = gmm_energy(latent, self.gmm_params, return_mean=False)
        per_instance = energy.numpy().reshape((-1, self.samples))  # type: ignore
        return OODScore(per_instance.mean(axis=-1))
|
@@ -0,0 +1,47 @@
|
|
1
|
+
from enum import Flag, auto
|
2
|
+
from typing import Union
|
3
|
+
|
4
|
+
|
5
|
+
class auto_all:
    """Descriptor that resolves to the combination of every flag of its owner class."""

    def __get__(self, instance, owner):
        # Inverting the empty flag value of the owning class sets every defined bit.
        nothing = owner(0)
        return ~nothing
|
8
|
+
|
9
|
+
|
10
|
+
class ImageHash(Flag):
    """Bit flags naming image hash kinds (presumably the xxhash / pchash functions -- confirm with callers)."""

    # Each member receives its own bit from auto().
    XXHASH = auto()
    PCHASH = auto()
    # Descriptor access: evaluates to ~ImageHash(0), i.e. every member combined.
    ALL = auto_all()
|
14
|
+
|
15
|
+
|
16
|
+
class ImageProperty(Flag):
    """Bit flags naming image properties; how each one is computed is not defined in this file."""

    # Each member receives its own bit from auto().
    WIDTH = auto()
    HEIGHT = auto()
    SIZE = auto()
    ASPECT_RATIO = auto()
    CHANNELS = auto()
    DEPTH = auto()
    # Descriptor access: evaluates to ~ImageProperty(0), i.e. every member combined.
    ALL = auto_all()
|
24
|
+
|
25
|
+
|
26
|
+
class ImageVisuals(Flag):
    """Bit flags naming visual image measures; how each one is computed is not defined in this file."""

    # Each member receives its own bit from auto().
    BRIGHTNESS = auto()
    BLURRINESS = auto()
    MISSING = auto()
    ZERO = auto()
    # Descriptor access: evaluates to ~ImageVisuals(0), i.e. every member combined.
    ALL = auto_all()
|
32
|
+
|
33
|
+
|
34
|
+
class ImageStatistics(Flag):
    """Bit flags naming image statistics; how each one is computed is not defined in this file."""

    # Each member receives its own bit from auto().
    MEAN = auto()
    STD = auto()
    VAR = auto()
    SKEW = auto()
    KURTOSIS = auto()
    ENTROPY = auto()
    PERCENTILES = auto()
    HISTOGRAM = auto()
    # Descriptor access: evaluates to ~ImageStatistics(0), i.e. every member combined.
    ALL = auto_all()
|
44
|
+
|
45
|
+
|
46
|
+
# Any of the four flag enums in this module.
ImageStatsFlags = Union[ImageHash, ImageProperty, ImageVisuals, ImageStatistics]
# The same union without ImageHash.
LinterFlags = Union[ImageProperty, ImageVisuals, ImageStatistics]
|
File without changes
|
@@ -0,0 +1,92 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from typing import Callable, Dict, Generic, List, TypeVar
|
3
|
+
|
4
|
+
# Result type of a metric; bound to dict.
TOutput = TypeVar("TOutput", bound=dict)
# Type of the values accepted as method names (unconstrained).
TMethods = TypeVar("TMethods")
# Type of the callables registered under each method name.
TCallable = TypeVar("TCallable", bound=Callable)
|
7
|
+
|
8
|
+
|
9
|
+
class MetricMixin(ABC, Generic[TOutput]):
    """Abstract interface exposing ``update``, ``compute`` and ``reset``."""

    @abstractmethod
    def update(self, preds, targets):
        """Abstract method receiving ``preds`` and ``targets``."""

    @abstractmethod
    def compute(self) -> TOutput:
        """Abstract method returning the metric output."""

    @abstractmethod
    def reset(self):
        """Abstract method taking no arguments; subclasses define what is reset."""
|
18
|
+
|
19
|
+
|
20
|
+
class EvaluateMixin(ABC, Generic[TOutput]):
    """Abstract interface for classes whose result is produced by ``evaluate``."""

    @abstractmethod
    def evaluate(self) -> TOutput:
        """
        Abstract method to calculate the metric from the constructor parameters.

        Returns
        -------
        TOutput
            The metric output defined by the implementing class.
        """
|
24
|
+
|
25
|
+
|
26
|
+
class MethodsMixin(ABC, Generic[TMethods, TCallable]):
    """
    Mixin that maps method names to functions.

    Implementers provide the name-to-function mapping through ``_methods``.
    Users can list the names with ``methods()`` and read or change the
    selected name through the ``method`` property; the class itself calls
    the selected function through ``_method``.

    Spelling out the Callable generic improves type hints for the function
    signature. It is recommended but optional.

    Example of an implementing class::

        def _mult(x: float, y: float) -> float:
            return x * y

        _FN = Callable[[float, float], float]

        class MyMetric(MethodsMixin[Literal["ADD", "MULT"], _FN]):

            @classmethod
            def _methods(cls) -> Dict[str, _FN]:
                return {
                    "ADD": lambda x, y: x + y,
                    "MULT": _mult,
                }

            def evaluate(self):
                return self._method(x, y)

    Example of use::

        m = MyMetric(1.0, 2.0, "ADD")
        m.evaluate()  # returns 3.0
        m.method  # returns "ADD"
        MyMetric.methods()  # returns ['ADD', 'MULT']
        m.method = "MULT"
        m.evaluate()  # returns 2.0
    """

    @classmethod
    @abstractmethod
    def _methods(cls) -> Dict[str, TCallable]:
        """Abstract method returning the available method functions keyed by name."""

    @property
    def _method(self) -> TCallable:
        """The function registered under the currently selected method name."""
        registry = self._methods()
        return registry[self.method]

    @classmethod
    def methods(cls) -> List[str]:
        """Return the names of all available methods."""
        return list(cls._methods())

    @property
    def method(self) -> str:
        """The currently selected method name."""
        return self._method_key

    @method.setter
    def method(self, value: TMethods):
        self._set_method(value)

    def _set_method(self, value: TMethods):
        """
        Store ``value`` as the selected method name.

        Kept separate from the property setter to work around pyright
        reporting the 'method' property as incorrectly overridden.

        Raises
        ------
        KeyError
            If ``value`` is not one of ``methods()``.
        """
        if value in self.methods():
            self._method_key = value
            return
        raise KeyError(f"Specified method not available for class ({self.methods()}).")
|
@@ -0,0 +1,124 @@
|
|
1
|
+
"""
|
2
|
+
This module contains the implementation of the
|
3
|
+
FR Test Statistic based estimate and the
|
4
|
+
KNN based estimate for the Bayes Error Rate
|
5
|
+
|
6
|
+
Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
|
7
|
+
https://arxiv.org/abs/1811.06419
|
8
|
+
"""
|
9
|
+
|
10
|
+
from typing import Callable, Dict, Literal, Tuple
|
11
|
+
|
12
|
+
import numpy as np
|
13
|
+
from maite.protocols import ArrayLike
|
14
|
+
from scipy.sparse import coo_matrix
|
15
|
+
from scipy.stats import mode
|
16
|
+
|
17
|
+
from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
|
18
|
+
|
19
|
+
from .utils import compute_neighbors, get_classes_counts, minimum_spanning_tree
|
20
|
+
|
21
|
+
|
22
|
+
def _mst(X: np.ndarray, y: np.ndarray, _: int) -> Tuple[float, float]:
    """
    Estimate Bayes Error Rate bounds from a minimum spanning tree.

    Parameters
    ----------
    X : np.ndarray
        Data passed to ``minimum_spanning_tree``.
    y : np.ndarray
        Labels, indexed by the node indices of the tree edges.
    _ : int
        Unused; present so the signature matches the KNN estimator.

    Returns
    -------
    Tuple[float, float]
        The upper and lower bound estimates, in that order.
    """
    # M is used as the class count and N as the sample count below
    M, N = get_classes_counts(y)

    tree = coo_matrix(minimum_spanning_tree(X))
    # Count every stored tree edge whose two endpoints carry different labels.
    # Using the stored edges directly avoids assuming there are exactly N - 1 of them.
    mismatches = np.count_nonzero(y[tree.row] != y[tree.col])
    deltas = mismatches / (2 * N)
    upper = 2 * deltas
    # max(..., 0) keeps the square root real when the scaled delta exceeds 1
    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
    return float(upper), float(lower)
|
31
|
+
|
32
|
+
|
33
|
+
def _knn(X: np.ndarray, y: np.ndarray, k: int) -> Tuple[float, float]:
    """
    Estimate Bayes Error Rate bounds from k nearest neighbors.

    Parameters
    ----------
    X : np.ndarray
        Data; flattened to two dimensions before the neighbor search.
    y : np.ndarray
        Labels, indexed by the neighbor indices.
    k : int
        Number of neighbors requested from ``compute_neighbors``.

    Returns
    -------
    Tuple[float, float]
        The upper and lower bound estimates, in that order.
    """
    M, N = get_classes_counts(y)

    # Collapse everything after the first axis into a single feature axis
    flat = X.reshape((X.shape[0], -1))
    neighbors = compute_neighbors(flat, flat, k=k)
    if neighbors.ndim == 1:
        # Give a one-dimensional result an explicit neighbor axis
        neighbors = np.expand_dims(neighbors, axis=1)

    # Most common label among each row's neighbors
    neighbor_mode = mode(y[neighbors], axis=1, keepdims=True).mode
    modal_class = neighbor_mode.squeeze()

    # A nonzero difference marks a row whose modal label differs from its own label
    disagreements = np.count_nonzero(modal_class - y)
    upper = float(disagreements / N)
    return upper, _knn_lowerbound(upper, M, k)
|
44
|
+
|
45
|
+
|
46
|
+
def _knn_lowerbound(value: float, classes: int, k: int) -> float:
|
47
|
+
"Several cases for computing the BER lower bound"
|
48
|
+
if value <= 1e-10:
|
49
|
+
return 0.0
|
50
|
+
|
51
|
+
if classes == 2 and k != 1:
|
52
|
+
if k > 5:
|
53
|
+
# Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
|
54
|
+
alpha = 0.3399
|
55
|
+
beta = 0.9749
|
56
|
+
a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
|
57
|
+
return value / (1 + a_k)
|
58
|
+
if k > 2:
|
59
|
+
return value / (1 + (1 / np.sqrt(k)))
|
60
|
+
# k == 2:
|
61
|
+
return value / 2
|
62
|
+
|
63
|
+
return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
|
64
|
+
|
65
|
+
|
66
|
+
# Names accepted for the BER estimation method.
_METHODS = Literal["MST", "KNN"]
# Signature shared by the estimators: (data, labels, k) -> (upper, lower).
_FUNCTION = Callable[[np.ndarray, np.ndarray, int], Tuple[float, float]]
|
68
|
+
|
69
|
+
|
70
|
+
class BER(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
    """
    An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis

    Parameters
    ----------
    data : np.ndarray
        Array of images or image embeddings
    labels : np.ndarray
        Array of labels for each image or image embedding
    method : Literal["MST", "KNN"], default "KNN"
        Method to use when estimating the Bayes error rate
    k : int, default 1
        Number of nearest neighbors for KNN estimator -- ignored by MST estimator

    See Also
    --------
    `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
    """

    def __init__(self, data: ArrayLike, labels: ArrayLike, method: _METHODS = "KNN", k: int = 1) -> None:
        self.data, self.labels, self.k = data, labels, k
        # Validates the name against methods() before storing it
        self._set_method(method)

    @classmethod
    def _methods(cls) -> Dict[str, _FUNCTION]:
        """Return the estimator functions keyed by method name."""
        return dict(MST=_mst, KNN=_knn)

    def evaluate(self) -> Dict[str, float]:
        """
        Calculates the Bayes Error Rate estimate using the selected method

        Returns
        -------
        Dict[str, float]
            ber : float
                The estimated upper bound of the Bayes Error Rate
            ber_lower : float
                The estimated lower bound of the Bayes Error Rate

        Raises
        ------
        ValueError
            If unique classes M < 2 (documented by the original author; the
            check itself is not in this class -- confirm in the helpers)
        """
        samples = np.asarray(self.data)
        targets = np.asarray(self.labels)
        upper, lower = self._method(samples, targets, self.k)
        return dict(ber=upper, ber_lower=lower)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
import math
|
2
|
+
from typing import Literal, Tuple
|
3
|
+
|
4
|
+
import numpy as np
|
5
|
+
from scipy.spatial.distance import pdist, squareform
|
6
|
+
|
7
|
+
|
8
|
+
class Coverage:
    """
    Class for evaluating coverage and identifying images/samples that are in undercovered regions.

    This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.

    Parameters
    ----------
    embeddings : np.ndarray
        n x p array of image embeddings from the dataset.
    radius_type : Literal["adaptive", "naive"], default "adaptive"
        The function used to determine radius.
    k : int, default 20
        Number of observations required in order to be covered.
    percent : np.float64, default np.float64(0.01)
        Percent of observations to be considered uncovered. Only applies to adaptive radius.

    Note
    ----
    Embeddings should be on the unit interval.
    """

    def __init__(
        self,
        embeddings: np.ndarray,
        radius_type: Literal["adaptive", "naive"] = "adaptive",
        k: int = 20,
        percent: np.float64 = np.float64(0.01),
    ):
        self.embeddings = embeddings
        self.radius_type = radius_type
        self.k = k
        self.percent = percent

    def evaluate(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Find the observations considered uncovered and the critical radius of every observation.

        The critical radius of an observation is its distance at position
        ``k + 1`` of its ascending sorted distances to all observations
        (position 0 is its own zero distance). With ``radius_type="naive"``,
        observations whose critical radius exceeds a fixed radius (stored as
        ``self.rho``) are reported. With ``radius_type="adaptive"``, the
        ``int(n * percent)`` observations with the largest critical radii
        are reported.

        Returns
        -------
        np.ndarray
            Array of uncovered indices
        np.ndarray
            Array of critical value radii

        Raises
        ------
        ValueError
            If length of embeddings is less than or equal to k
        ValueError
            If length of embeddings is k + 1, which leaves no distance at position k + 1
        ValueError
            If radius_type is unknown
        """

        n = len(self.embeddings)
        if n <= self.k:
            raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
        # Column k + 1 only exists when each row holds at least k + 2 distances;
        # without this guard the lookup below fails with an IndexError.
        if n < self.k + 2:
            raise ValueError(
                f"Number of observations ({n}) must be at least k + 2 ({self.k + 2}) "
                "to read the neighbor distance at sorted position k + 1."
            )

        # Full pairwise distance matrix, each row sorted from nearest to farthest
        mat = squareform(pdist(self.embeddings))
        sorted_dists = np.sort(mat, axis=1)
        crit = sorted_dists[:, self.k + 1]

        d = np.shape(self.embeddings)[1]
        if self.radius_type == "naive":
            # Fixed radius derived from k, n and the embedding dimension d
            self.rho = (1 / math.sqrt(math.pi)) * ((2 * self.k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
            pvals = np.where(crit > self.rho)[0]
        elif self.radius_type == "adaptive":
            # Use data adaptive cutoff: keep the indices with the largest critical radii
            cutoff = int(n * self.percent)
            pvals = np.argsort(crit)[::-1][:cutoff]
        else:
            raise ValueError("Invalid radius type.")
        return pvals, crit
|
@@ -0,0 +1,94 @@
|
|
1
|
+
"""
|
2
|
+
This module contains the implementation of HP Divergence
|
3
|
+
using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
|
4
|
+
"""
|
5
|
+
|
6
|
+
from typing import Any, Callable, Dict, Literal
|
7
|
+
|
8
|
+
import numpy as np
|
9
|
+
|
10
|
+
from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
|
11
|
+
|
12
|
+
from .utils import compute_neighbors, minimum_spanning_tree
|
13
|
+
|
14
|
+
|
15
|
+
def _mst(data: np.ndarray, labels: np.ndarray) -> int:
    """
    Count minimum-spanning-tree edges whose endpoints have different labels.

    Parameters
    ----------
    data : np.ndarray
        Data passed to ``minimum_spanning_tree``.
    labels : np.ndarray
        Labels, indexed by the node indices of the tree edges.

    Returns
    -------
    int
        Number of edges joining differently labelled nodes.
    """
    dense_tree = minimum_spanning_tree(data).toarray()
    # Nonzero entries of the dense tree are its edges: (row node, column node)
    sources, targets = np.nonzero(dense_tree)
    return np.sum(labels[sources] != labels[targets])
|
20
|
+
|
21
|
+
|
22
|
+
def _fnn(data: np.ndarray, labels: np.ndarray) -> int:
    """
    Sum the absolute label differences between each row and its neighbor.

    Parameters
    ----------
    data : np.ndarray
        Data searched against itself with ``compute_neighbors``.
    labels : np.ndarray
        Numeric labels, indexed by the returned neighbor indices.

    Returns
    -------
    int
        Sum of ``|label of neighbor - own label|`` over all rows.
    """
    nearest = compute_neighbors(data, data)
    label_gap = np.abs(labels[nearest] - labels)
    return np.sum(label_gap)
|
26
|
+
|
27
|
+
|
28
|
+
# Names accepted for the divergence estimation method.
_METHODS = Literal["MST", "FNN"]
# Signature shared by the estimators: (stacked data, labels) -> error count.
_FUNCTION = Callable[[np.ndarray, np.ndarray], int]
|
30
|
+
|
31
|
+
|
32
|
+
class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
    """
    Calculates the estimated divergence between two datasets

    Parameters
    ----------
    data_a : np.ndarray
        Array of images or image embeddings to compare
    data_b : np.ndarray
        Array of images or image embeddings to compare
    method : Literal["MST", "FNN"], default "MST"
        Method used to estimate dataset divergence

    See Also
    --------
    For more information about this divergence, its formal definition,
    and its associated estimators see https://arxiv.org/abs/1412.6534.

    Warning
    -------
    MST is very slow in this implementation, this is unlike matlab where
    they have comparable speeds
    Overall, MST takes ~25x LONGER!!
    Source of slowdown:
    conversion to and from CSR format adds ~10% of the time diff between
    1nn and scipy mst function the remaining 90%
    """

    def __init__(
        self,
        data_a: np.ndarray,
        data_b: np.ndarray,
        method: _METHODS = "MST",
    ) -> None:
        self.data_a, self.data_b = data_a, data_b
        # Validates the name against methods() before storing it
        self._set_method(method)

    @classmethod
    def _methods(cls) -> Dict[str, _FUNCTION]:
        """Return the estimator functions keyed by method name."""
        return dict(FNN=_fnn, MST=_mst)

    def evaluate(self) -> Dict[str, Any]:
        """
        Calculates the divergence and any errors between the datasets

        Returns
        -------
        Dict[str, Any]
            divergence : float
                divergence value between 0.0 and 1.0
            error : int
                the number of differing edges
        """
        n_a = self.data_a.shape[0]
        n_b = self.data_b.shape[0]

        combined = np.vstack((self.data_a, self.data_b))
        # Column of 0s for rows taken from data_a, then 1s for rows from data_b
        membership = np.vstack([np.zeros([n_a, 1]), np.ones([n_b, 1])])

        errors = self._method(combined, membership)
        scale = (n_b + n_a) / (2 * n_b * n_a)
        # Clamp at zero so the reported divergence is never negative
        dp = max(0.0, 1 - scale * errors)
        return dict(divergence=dp, error=errors)
|
@@ -0,0 +1,79 @@
|
|
1
|
+
import numpy as np
|
2
|
+
import xxhash as xxh
|
3
|
+
from PIL import Image
|
4
|
+
from scipy.fftpack import dct
|
5
|
+
|
6
|
+
from dataeval._internal.metrics.utils import normalize_image_shape, rescale
|
7
|
+
|
8
|
+
# Side length of the low-frequency DCT block kept by pchash (8 x 8 = 64 bits).
HASH_SIZE = 8
# Cap on the resize multiplier: pchash resizes to at most HASH_SIZE * MAX_FACTOR pixels per side.
MAX_FACTOR = 4
|
10
|
+
|
11
|
+
|
12
|
+
def pchash(image: np.ndarray) -> str:
    """
    Compute a perceptual hash of an image and return it as a hex string.

    The image is averaged over its first axis after shape normalization,
    rescaled and cast to 8-bit, then resized with the Lanczos algorithm to
    a square whose side is ``HASH_SIZE * min((min_dim - 1) // HASH_SIZE,
    MAX_FACTOR)``. A discrete cosine transform is applied along both axes
    and the top-left ``HASH_SIZE x HASH_SIZE`` coefficients are encoded as
    bits marking which ones exceed their median.

    Parameters
    ----------
    image : np.ndarray
        An image as a numpy array in CxHxW format

    Returns
    -------
    str
        The hex string hash of the image, without leading zeros ("0" if empty)

    Raises
    ------
    ValueError
        If the smaller of the last two dimensions is not larger than HASH_SIZE
    """
    # The image must be larger than HASH_SIZE in both of its last two dimensions
    smallest_side = min(image.shape[-2:])
    if smallest_side <= HASH_SIZE:
        raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")

    # Side of the resized square: a multiple of HASH_SIZE, capped by MAX_FACTOR
    factor = min((smallest_side - 1) // HASH_SIZE, MAX_FACTOR)
    side = HASH_SIZE * factor

    # Average over the first axis of the shape-normalized image (the channels, per the original comments)
    gray = np.mean(normalize_image_shape(image), axis=0).squeeze()

    # Rescale with bit depth 8 and store as unsigned 8-bit values
    gray8 = rescale(gray, 8).astype(np.uint8)

    # Lanczos resize to the square computed above
    resized = Image.fromarray(gray8).resize((side, side), Image.Resampling.LANCZOS)
    pixels = np.array(resized)

    # DCT along both axes, keeping only the lowest-frequency corner
    coeffs = dct(dct(pixels.T).T)[:HASH_SIZE, :HASH_SIZE]

    # One bit per coefficient: is it above the median?
    bits = coeffs > np.median(coeffs)

    # Left-pad with False up to a whole number of bytes
    padded = np.full(int(np.ceil(bits.size / 8) * 8), False)
    padded[-bits.size :] = bits.ravel()

    # Pack into bytes, render as hex, and drop leading zeros
    digest = np.packbits(padded).tobytes().hex().lstrip("0")
    return digest or "0"
|
61
|
+
|
62
|
+
|
63
|
+
def xxhash(image: np.ndarray) -> str:
    """
    Hash an image with the fast non-cryptographic xxHash algorithm (xxhash.com).

    The array is flattened and its raw bytes are hashed with the 64-bit
    XXH3 variant.

    Parameters
    ----------
    image : np.ndarray
        An image as a numpy array

    Returns
    -------
    str
        The hex string hash of the image using the xxHash algorithm
    """
    raw_bytes = image.ravel().tobytes()
    return xxh.xxh3_64_hexdigest(raw_bytes)
|