dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +4 -4
  2. dataeval/_internal/detectors/clusterer.py +47 -34
  3. dataeval/_internal/detectors/drift/base.py +53 -35
  4. dataeval/_internal/detectors/drift/cvm.py +5 -4
  5. dataeval/_internal/detectors/drift/ks.py +7 -6
  6. dataeval/_internal/detectors/drift/mmd.py +39 -19
  7. dataeval/_internal/detectors/drift/torch.py +6 -5
  8. dataeval/_internal/detectors/drift/uncertainty.py +7 -8
  9. dataeval/_internal/detectors/duplicates.py +57 -30
  10. dataeval/_internal/detectors/linter.py +40 -24
  11. dataeval/_internal/detectors/ood/ae.py +2 -1
  12. dataeval/_internal/detectors/ood/aegmm.py +2 -1
  13. dataeval/_internal/detectors/ood/base.py +37 -15
  14. dataeval/_internal/detectors/ood/llr.py +9 -8
  15. dataeval/_internal/detectors/ood/vae.py +2 -1
  16. dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  17. dataeval/_internal/flags.py +42 -21
  18. dataeval/_internal/interop.py +3 -12
  19. dataeval/_internal/metrics/balance.py +188 -0
  20. dataeval/_internal/metrics/ber.py +123 -48
  21. dataeval/_internal/metrics/coverage.py +90 -74
  22. dataeval/_internal/metrics/divergence.py +101 -67
  23. dataeval/_internal/metrics/diversity.py +211 -0
  24. dataeval/_internal/metrics/parity.py +287 -155
  25. dataeval/_internal/metrics/stats.py +198 -317
  26. dataeval/_internal/metrics/uap.py +40 -29
  27. dataeval/_internal/metrics/utils.py +430 -0
  28. dataeval/_internal/models/tensorflow/losses.py +3 -3
  29. dataeval/_internal/models/tensorflow/trainer.py +3 -2
  30. dataeval/_internal/models/tensorflow/utils.py +4 -3
  31. dataeval/_internal/output.py +82 -0
  32. dataeval/_internal/utils.py +64 -0
  33. dataeval/_internal/workflows/sufficiency.py +96 -107
  34. dataeval/flags/__init__.py +2 -2
  35. dataeval/metrics/__init__.py +26 -7
  36. dataeval/utils/__init__.py +9 -0
  37. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
  38. dataeval-0.65.0.dist-info/RECORD +60 -0
  39. dataeval/_internal/functional/__init__.py +0 -0
  40. dataeval/_internal/functional/ber.py +0 -63
  41. dataeval/_internal/functional/coverage.py +0 -75
  42. dataeval/_internal/functional/divergence.py +0 -16
  43. dataeval/_internal/functional/hash.py +0 -79
  44. dataeval/_internal/functional/metadata.py +0 -136
  45. dataeval/_internal/functional/metadataparity.py +0 -190
  46. dataeval/_internal/functional/uap.py +0 -6
  47. dataeval/_internal/functional/utils.py +0 -158
  48. dataeval/_internal/maite/__init__.py +0 -0
  49. dataeval/_internal/maite/utils.py +0 -30
  50. dataeval/_internal/metrics/base.py +0 -92
  51. dataeval/_internal/metrics/metadata.py +0 -610
  52. dataeval/_internal/metrics/metadataparity.py +0 -67
  53. dataeval-0.63.0.dist-info/RECORD +0 -68
  54. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
  55. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/balance.py
@@ -0,0 +1,188 @@
+ import warnings
+ from dataclasses import dataclass
+ from typing import Dict, List, Sequence
+
+ import numpy as np
+ from numpy.typing import NDArray
+ from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
+
+ from dataeval._internal.metrics.utils import entropy, preprocess_metadata
+ from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+ @dataclass(frozen=True)
+ class BalanceOutput(OutputMetadata):
+     """
+     Attributes
+     ----------
+     mutual_information : NDArray[np.float64]
+         Estimate of mutual information between metadata factors and class label
+     """
+
+     mutual_information: NDArray[np.float64]
+
+
+ def validate_num_neighbors(num_neighbors: int) -> int:
+     if not isinstance(num_neighbors, (int, float)):
+         raise TypeError(
+             f"Variable {num_neighbors} is not real-valued numeric type."
+             "num_neighbors should be an int, greater than 0 and less than"
+             "the number of samples in the dataset"
+         )
+     if num_neighbors < 1:
+         raise ValueError(
+             f"Invalid value for {num_neighbors}."
+             "Choose a value greater than 0 and less than number of samples"
+             "in the dataset."
+         )
+     if isinstance(num_neighbors, float):
+         num_neighbors = int(num_neighbors)
+         warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
+
+     return num_neighbors
+
+
+ @set_metadata("dataeval.metrics")
+ def balance(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+     """
+     Mutual information (MI) between factors (class label, metadata, label/image properties)
+
+     Parameters
+     ----------
+     class_labels: Sequence[int]
+         List of class labels for each image
+     metadata: List[Dict]
+         List of metadata factors for each image
+     num_neighbors: int, default 5
+         Number of nearest neighbors to use for computing MI between discrete
+         and continuous variables.
+
+     Returns
+     -------
+     BalanceOutput
+         (num_factors+1) x (num_factors+1) estimate of mutual information
+         between num_factors metadata factors and class label. Symmetry is enforced.
+
+     Notes
+     -----
+     We use `mutual_info_classif` from sklearn since class label is categorical.
+     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+     seed. MI is computed differently for categorical and continuous variables, and
+     we attempt to infer whether a variable is categorical by the fraction of unique
+     values in the dataset.
+
+     See Also
+     --------
+     sklearn.feature_selection.mutual_info_classif
+     sklearn.feature_selection.mutual_info_regression
+     sklearn.metrics.mutual_info_score
+     """
+     num_neighbors = validate_num_neighbors(num_neighbors)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     num_factors = len(names)
+     mi = np.empty((num_factors, num_factors))
+     mi[:] = np.nan
+
+     for idx in range(num_factors):
+         tgt = data[:, idx]
+
+         if is_categorical[idx]:
+             if tgt.dtype == float:
+                 # map to unique integers if categorical
+                 _, tgt = np.unique(tgt, return_inverse=True)
+             # categorical target
+             mi[idx, :] = mutual_info_classif(
+                 data,
+                 tgt,
+                 discrete_features=is_categorical,  # type: ignore
+                 n_neighbors=num_neighbors,
+             )
+         else:
+             # continuous variables
+             mi[idx, :] = mutual_info_regression(
+                 data,
+                 tgt,
+                 discrete_features=is_categorical,  # type: ignore
+                 n_neighbors=num_neighbors,
+             )
+
+     ent_all = entropy(data, names, is_categorical, normalized=False)
+     norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
+     # in principle MI should be symmetric, but it is not in practice.
+     nmi = 0.5 * (mi + mi.T) / norm_factor
+
+     return BalanceOutput(nmi)
+
+
+ @set_metadata("dataeval.metrics")
+ def balance_classwise(class_labels: Sequence[int], metadata: List[Dict], num_neighbors: int = 5) -> BalanceOutput:
+     """
+     Compute mutual information (analogous to correlation) between metadata factors
+     (class label, metadata, label/image properties) and individual class labels.
+
+     Parameters
+     ----------
+     class_labels: Sequence[int]
+         List of class labels for each image
+     metadata: List[Dict]
+         List of metadata factors for each image
+     num_neighbors: int, default 5
+         Number of nearest neighbors to use for computing MI between discrete
+         and continuous variables.
+
+     Notes
+     -----
+     We use `mutual_info_classif` from sklearn since class label is categorical.
+     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
+     seed. MI is computed differently for categorical and continuous variables, so we
+     have to specify with is_categorical.
+
+     Returns
+     -------
+     BalanceOutput
+         (num_classes x num_factors) estimate of mutual information between
+         num_factors metadata factors and individual class labels.
+
+     See Also
+     --------
+     sklearn.feature_selection.mutual_info_classif
+     sklearn.feature_selection.mutual_info_regression
+     sklearn.metrics.mutual_info_score
+     compute_mutual_information
+     """
+     num_neighbors = validate_num_neighbors(num_neighbors)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     num_factors = len(names)
+     # unique class labels
+     class_idx = names.index("class_label")
+     class_data = data[:, class_idx]
+     u_cls = np.unique(class_data)
+     num_classes = len(u_cls)
+
+     data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
+
+     # assume class is a factor
+     mi = np.empty((num_classes, num_factors - 1))
+     mi[:] = np.nan
+
+     # categorical variables, excluding class label
+     cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+
+     # classification MI for discrete/categorical features
+     for idx, cls in enumerate(u_cls):
+         tgt = class_data == cls
+         # units: nat
+         mi[idx, :] = mutual_info_classif(
+             data_no_class,
+             tgt,
+             discrete_features=cat_mask,  # type: ignore
+             n_neighbors=num_neighbors,
+         )
+
+     # let this recompute for all features including class label
+     ent_all = entropy(data, names, is_categorical)
+     ent_tgt = ent_all[class_idx]
+     ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
+     norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
+     nmi = mi / norm_factor
+     return BalanceOutput(nmi)
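The new module replaces class-based metrics with plain functions that return frozen dataclasses. A minimal usage sketch for `balance` follows; the factor names and values are invented for illustration, and it assumes `balance` is re-exported from `dataeval.metrics` (plausible given the changes to dataeval/metrics/__init__.py) and that `preprocess_metadata` encodes string-valued factors.

# Hypothetical sketch of the new functional balance API; not taken from the package docs.
from dataeval.metrics import balance

# Toy per-image labels and metadata: "lighting" should be inferred as categorical
# (few unique values), "blur" as continuous (all values unique).
class_labels = [0, 0, 1, 1, 0, 1, 0, 1]
metadata = [
    {"lighting": "indoor", "blur": 0.10},
    {"lighting": "indoor", "blur": 0.35},
    {"lighting": "outdoor", "blur": 0.20},
    {"lighting": "outdoor", "blur": 0.45},
    {"lighting": "indoor", "blur": 0.15},
    {"lighting": "outdoor", "blur": 0.40},
    {"lighting": "indoor", "blur": 0.05},
    {"lighting": "outdoor", "blur": 0.50},
]

out = balance(class_labels, metadata, num_neighbors=2)
# Square normalized-MI matrix over the class label plus each metadata factor
print(out.mutual_information)

The classwise variant, `balance_classwise(class_labels, metadata)`, instead returns a (num_classes x num_factors) matrix of MI against each individual class label.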
dataeval/_internal/metrics/ber.py
@@ -7,68 +7,143 @@ Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
  https://arxiv.org/abs/1811.06419
  """

- from typing import Callable, Dict, Literal, Tuple
+ from dataclasses import dataclass
+ from typing import Literal, Tuple

  import numpy as np
+ from numpy.typing import ArrayLike, NDArray
+ from scipy.sparse import coo_matrix
+ from scipy.stats import mode

- from dataeval._internal.functional.ber import ber_knn, ber_mst
- from dataeval._internal.interop import ArrayLike, to_numpy
- from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+ from dataeval._internal.interop import to_numpy
+ from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
+ from dataeval._internal.output import OutputMetadata, set_metadata

- _METHODS = Literal["MST", "KNN"]
- _FUNCTION = Callable[[np.ndarray, np.ndarray, int], Tuple[float, float]]

+ @dataclass(frozen=True)
+ class BEROutput(OutputMetadata):
+     """
+     Attributes
+     ----------
+     ber : float
+         The upper bounds of the Bayes Error Rate
+     ber_lower : float
+         The lower bounds of the Bayes Error Rate
+     """
+
+     ber: float
+     ber_lower: float
+
+
+ def ber_mst(X: NDArray, y: NDArray) -> Tuple[float, float]:
+     """Calculates the Bayes Error Rate using a minimum spanning tree
+
+     Parameters
+     ----------
+     X : NDArray, shape - (N, ... )
+         n_samples containing n_features
+     y : NDArray, shape - (N, 1)
+         Labels corresponding to each sample
+
+     Returns
+     -------
+     Tuple[float, float]
+         The upper and lower bounds of the bayes error rate
+     """
+     M, N = get_classes_counts(y)
+
+     tree = coo_matrix(minimum_spanning_tree(X))
+     matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+     deltas = matches / (2 * N)
+     upper = 2 * deltas
+     lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+     return upper, lower
+
+
+ def ber_knn(X: NDArray, y: NDArray, k: int) -> Tuple[float, float]:
+     """Calculates the Bayes Error Rate using K-nearest neighbors
+
+     Parameters
+     ----------
+     X : NDArray, shape - (N, ... )
+         n_samples containing n_features
+     y : NDArray, shape - (N, 1)
+         Labels corresponding to each sample
+
+     Returns
+     -------
+     Tuple[float, float]
+         The upper and lower bounds of the bayes error rate
+     """
+     M, N = get_classes_counts(y)
+     nn_indices = compute_neighbors(X, X, k=k)
+     nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+     modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+     upper = float(np.count_nonzero(modal_class - y) / N)
+     lower = knn_lowerbound(upper, M, k)
+     return upper, lower
+
+
+ def knn_lowerbound(value: float, classes: int, k: int) -> float:
+     """Several cases for computing the BER lower bound"""
+     if value <= 1e-10:
+         return 0.0
+
+     if classes == 2 and k != 1:
+         if k > 5:
+             # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
+             alpha = 0.3399
+             beta = 0.9749
+             a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+             return value / (1 + a_k)
+         if k > 2:
+             return value / (1 + (1 / np.sqrt(k)))
+         # k == 2:
+         return value / 2
+
+     return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
+
+
+ BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}

- class BER(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
+
+ @set_metadata("dataeval.metrics")
+ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
      """
      An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis

      Parameters
      ----------
-     method : Literal["MST", "KNN"], default "KNN"
-         Method to use when estimating the Bayes error rate
+     images : ArrayLike (N, ... )
+         Array of images or image embeddings
+     labels : ArrayLike (N, 1)
+         Array of labels for each image or image embedding
      k : int, default 1
-         number of nearest neighbors for KNN estimator -- ignored by MST estimator
+         Number of nearest neighbors for KNN estimator -- ignored by MST estimator
+     method : Literal["KNN", "MST"], default "KNN"
+         Method to use when estimating the Bayes error rate

+     Returns
+     -------
+     BEROutput
+         The upper and lower bounds of the Bayes Error Rate

-     See Also
+     References
+     ----------
+     [1] `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+
+     Examples
      --------
-     `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+     >>> import sklearn.datasets as dsets
+     >>> from dataeval.metrics import ber

-     """
+     >>> images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)

-     def __init__(self, method: _METHODS = "KNN", k: int = 1) -> None:
-         self.k: int = k
-         self._set_method(method)
-
-     @classmethod
-     def _methods(cls) -> Dict[str, _FUNCTION]:
-         return {"KNN": ber_knn, "MST": ber_mst}
-
-     def evaluate(self, images: ArrayLike, labels: ArrayLike) -> Dict[str, float]:
-         """
-         Calculates the Bayes Error Rate estimate using the provided method
-
-         Parameters
-         ----------
-         images : ArrayLike (N, : )
-             Array of images or image embeddings
-         labels : ArrayLike (N, 1)
-             Array of labels for each image or image embedding
-
-         Returns
-         -------
-         Dict[str, float]
-             ber : float
-                 The estimated lower bounds of the Bayes Error Rate
-             ber_lower : float
-                 The estimated upper bounds of the Bayes Error Rate
-
-         Raises
-         ------
-         ValueError
-             If unique classes M < 2
-         """
-
-         upper, lower = self._method(to_numpy(images), to_numpy(labels), self.k)
-         return {"ber": upper, "ber_lower": lower}
+     >>> ber(images, labels)
+     BEROutput(ber=0.04, ber_lower=0.020416847668728033)
+     """
+     ber_fn = get_method(BER_FN_MAP, method)
+     X = to_numpy(images)
+     y = to_numpy(labels)
+     upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
+     return BEROutput(upper, lower)
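With the `BER` class removed, both estimators route through the single `ber` function via `BER_FN_MAP`. A short sketch using the same toy data as the doctest above; the MST call simply mirrors the new signature, and the printed values are illustrative rather than taken from the package.

import sklearn.datasets as dsets
from dataeval.metrics import ber

images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)

knn_out = ber(images, labels, k=3, method="KNN")  # k is used by the KNN estimator
mst_out = ber(images, labels, method="MST")       # k is ignored by the MST estimator

print(knn_out.ber, knn_out.ber_lower)  # upper bound first, then lower bound
print(mst_out.ber, mst_out.ber_lower)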
dataeval/_internal/metrics/coverage.py
@@ -1,18 +1,49 @@
- from typing import Literal, Tuple
+ import math
+ from dataclasses import dataclass
+ from typing import Literal

  import numpy as np
+ from numpy.typing import ArrayLike, NDArray
+ from scipy.spatial.distance import pdist, squareform

- from dataeval._internal.functional.coverage import coverage
- from dataeval._internal.interop import ArrayLike, to_numpy
- from dataeval._internal.metrics.base import EvaluateMixin
+ from dataeval._internal.interop import to_numpy
+ from dataeval._internal.metrics.utils import flatten
+ from dataeval._internal.output import OutputMetadata, set_metadata


- class Coverage(EvaluateMixin):
+ @dataclass(frozen=True)
+ class CoverageOutput(OutputMetadata):
+     """
+     Attributes
+     ----------
+     indices : NDArray
+         Array of uncovered indices
+     radii : NDArray
+         Array of critical value radii
+     critical_value : float
+         Radius for coverage
+     """
+
+     indices: NDArray[np.intp]
+     radii: NDArray[np.float64]
+     critical_value: float
+
+
+ @set_metadata("dataeval.metrics")
+ def coverage(
+     embeddings: ArrayLike,
+     radius_type: Literal["adaptive", "naive"] = "adaptive",
+     k: int = 20,
+     percent: np.float64 = np.float64(0.01),
+ ) -> CoverageOutput:
      """
      Class for evaluating coverage and identifying images/samples that are in undercovered regions.

      Parameters
      ----------
+     embeddings : ArrayLike, shape - (N, P)
+         A dataset in an ArrayLike format.
+         Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
      radius_type : Literal["adaptive", "naive"], default "adaptive"
          The function used to determine radius.
      k: int, default 20
@@ -21,76 +52,61 @@ class Coverage(EvaluateMixin):
      percent: np.float64, default np.float(0.01)
          Percent of observations to be considered uncovered. Only applies to adaptive radius.

+     Returns
+     -------
+     CoverageOutput
+         Array of uncovered indices, critical value radii, and the radius for coverage
+
+     Raises
+     ------
+     ValueError
+         If length of embeddings is less than or equal to k
+     ValueError
+         If radius_type is unknown
+
+     Note
+     ----
+     Embeddings should be on the unit interval.
+
+     Example
+     -------
+     >>> coverage(embeddings)
+     CoverageOutput(indices=array([], dtype=int64), radii=array([0.59307666, 0.56956307, 0.56328616, 0.70660265, 0.57778087,
+            0.53738624, 0.58968217, 1.27721334, 0.84378694, 0.67767021,
+            0.69680335, 1.35532621, 0.59764166, 0.8691945 , 0.83627602,
+            0.84187303, 0.62212358, 1.09039732, 0.67956797, 0.60134383,
+            0.83713908, 0.91784263, 1.12901193, 0.73907618, 0.63943983,
+            0.61188447, 0.47872713, 0.57207771, 0.92885883, 0.54750511,
+            0.83015726, 1.20721778, 0.50421928, 0.98312246, 0.59764166,
+            0.61009202, 0.73864073, 1.0381061 , 0.77598609, 0.72984036,
+            0.67573006, 0.48056064, 1.00050879, 0.89532971, 0.58395529,
+            0.95954793, 0.60134383, 1.10096454, 0.51955314, 0.73038702]), critical_value=0)
+
      Reference
      ---------
      This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
      [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-
-     Examples
-     --------
-     Initialize the Coverage class:
-
-     >>> cover = Coverage()
-
-     Adjusting parameters:
-
-     >>> cover = Coverage(k=5, percent=0.1)
-     """
-
-     def __init__(
-         self,
-         radius_type: Literal["adaptive", "naive"] = "adaptive",
-         k: int = 20,
-         percent: np.float64 = np.float64(0.01),
-     ):
-         self.radius_type: Literal["adaptive", "naive"] = radius_type
-         self.k: int = k
-         self.percent: np.float64 = percent
-
-     def evaluate(self, embeddings: ArrayLike) -> Tuple[np.ndarray, np.ndarray, float]:
-         """
-         Perform a one-way chi-squared test between observation frequencies and expected frequencies that
-         tests the null hypothesis that the observed data has the expected frequencies.
-
-         Parameters
-         ----------
-         embeddings : ArrayLike, shape - (N, P)
-             A dataset in an ArrayLike format.
-             Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-         Returns
-         -------
-         np.ndarray
-             Array of uncovered indices
-         np.ndarray
-             Array of critical value radii
-         float
-             Radius for coverage
-
-         Raises
-         ------
-         ValueError
-             If length of embeddings is less than or equal to k
-         ValueError
-             If radius_type is unknown
-
-         Note
-         ----
-         Embeddings should be on the unit interval.
-
-         Example
-         -------
-         >>> cover.evaluate(embeddings)
-         (array([31, 7, 22, 37, 11]), array([0.35938604, 0.26462789, 0.20319609, 0.34140912, 0.31069921,
-                0.2308378 , 0.33300179, 0.69881025, 0.53587532, 0.35689803,
-                0.39333634, 0.67497874, 0.21788128, 0.43510162, 0.38601861,
-                0.34171868, 0.16941337, 0.66438044, 0.20319609, 0.19732733,
-                0.48660288, 0.5135814 , 0.69352653, 0.26946943, 0.31120605,
-                0.33067705, 0.30508271, 0.32802489, 0.51805702, 0.31120605,
-                0.40843265, 0.74996768, 0.31069921, 0.52263763, 0.26654013,
-                0.33113507, 0.40814838, 0.67723008, 0.48124375, 0.37243185,
-                0.29760001, 0.30907904, 0.59023236, 0.57778087, 0.21839853,
-                0.46067782, 0.31078966, 0.65199049, 0.26410603, 0.19542706]))
-         """
-
-         return coverage(to_numpy(embeddings), self.radius_type, self.k, self.percent)
+     """  # noqa: E501
+
+     # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
+     embeddings = to_numpy(embeddings)
+     n = len(embeddings)
+     if n <= k:
+         raise ValueError(
+             f"Number of observations n={n} is less than or equal to the specified number of neighbors k={k}."
+         )
+     mat = squareform(pdist(flatten(embeddings))).astype(np.float64)
+     sorted_dists = np.sort(mat, axis=1)
+     crit = sorted_dists[:, k + 1]
+
+     d = embeddings.shape[1]
+     if radius_type == "naive":
+         rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
+         pvals = np.where(crit > rho)[0]
+     elif radius_type == "adaptive":
+         # Use data adaptive cutoff as rho
+         rho = int(n * percent)
+         pvals = np.argsort(crit)[::-1][:rho]
+     else:
+         raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
+     return CoverageOutput(pvals, crit, rho)
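A hedged sketch of calling the refactored coverage function, assuming `coverage` is re-exported from `dataeval.metrics`. The random embeddings are purely illustrative, drawn on the unit interval as the Note requires, with n = 50 > k = 20 so the length check passes.

import numpy as np
from dataeval.metrics import coverage

rng = np.random.default_rng(0)
embeddings = rng.uniform(size=(50, 16))  # 50 observations in a 16-dimensional space, values in [0, 1)

out = coverage(embeddings, radius_type="adaptive", k=20, percent=np.float64(0.1))
print(out.indices)         # the int(50 * 0.1) = 5 least-covered observations
print(out.radii)           # per-observation critical radii: distance to the (k+1)th neighbor
print(out.critical_value)  # a sample-count cutoff for "adaptive"; a radius for "naive"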