dataeval 0.63.0__py3-none-any.whl → 0.64.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/_internal/detectors/clusterer.py +2 -1
- dataeval/_internal/detectors/drift/base.py +2 -1
- dataeval/_internal/detectors/drift/cvm.py +2 -1
- dataeval/_internal/detectors/drift/ks.py +2 -1
- dataeval/_internal/detectors/drift/mmd.py +4 -3
- dataeval/_internal/detectors/drift/uncertainty.py +1 -2
- dataeval/_internal/detectors/duplicates.py +2 -1
- dataeval/_internal/detectors/linter.py +1 -1
- dataeval/_internal/detectors/ood/ae.py +2 -1
- dataeval/_internal/detectors/ood/aegmm.py +2 -1
- dataeval/_internal/detectors/ood/base.py +2 -1
- dataeval/_internal/detectors/ood/llr.py +3 -2
- dataeval/_internal/detectors/ood/vae.py +2 -1
- dataeval/_internal/detectors/ood/vaegmm.py +2 -1
- dataeval/_internal/interop.py +2 -11
- dataeval/_internal/metrics/balance.py +180 -0
- dataeval/_internal/metrics/base.py +1 -83
- dataeval/_internal/metrics/ber.py +122 -48
- dataeval/_internal/metrics/coverage.py +83 -74
- dataeval/_internal/metrics/divergence.py +67 -67
- dataeval/_internal/metrics/diversity.py +206 -0
- dataeval/_internal/metrics/parity.py +300 -155
- dataeval/_internal/metrics/stats.py +7 -5
- dataeval/_internal/metrics/uap.py +37 -29
- dataeval/_internal/metrics/utils.py +393 -0
- dataeval/_internal/utils.py +64 -0
- dataeval/metrics/__init__.py +25 -6
- dataeval/utils/__init__.py +9 -0
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -1
- dataeval-0.64.0.dist-info/RECORD +60 -0
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +0 -63
- dataeval/_internal/functional/coverage.py +0 -75
- dataeval/_internal/functional/divergence.py +0 -16
- dataeval/_internal/functional/hash.py +0 -79
- dataeval/_internal/functional/metadata.py +0 -136
- dataeval/_internal/functional/metadataparity.py +0 -190
- dataeval/_internal/functional/uap.py +0 -6
- dataeval/_internal/functional/utils.py +0 -158
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +0 -30
- dataeval/_internal/metrics/metadata.py +0 -610
- dataeval/_internal/metrics/metadataparity.py +0 -67
- dataeval-0.63.0.dist-info/RECORD +0 -68
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
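The hunks below show the core of this release: the class-based metrics with their `evaluate()` methods are removed, and `ber`, `coverage`, and `divergence` become plain functions returning typed NamedTuple results (`BEROutput`, `CoverageOutput`, `DivergenceOutput`). A rough sketch of the new call pattern follows; only the `ber` import path is shown verbatim in the diff docstrings, so treat everything else here as an assumption:

```python
# Hedged sketch of the 0.64.0 functional API; names are taken from the hunks
# below, and the example data mirrors the doctest added to ber.py.
import sklearn.datasets as dsets
from dataeval.metrics import ber

# Two well-separated blobs stand in for image embeddings and their labels.
images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)

# 0.63.0 exposed class instances with an evaluate() returning a dict
# (see the removed "def evaluate(...)" lines below); 0.64.0 returns a NamedTuple.
result = ber(images, labels, k=1, method="KNN")
print(result.ber, result.ber_lower)  # upper and lower Bayes Error Rate bounds
```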
dataeval/_internal/metrics/ber.py
@@ -7,68 +7,142 @@ Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
https://arxiv.org/abs/1811.06419
"""

-from typing import
+from typing import Literal, NamedTuple, Tuple

import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.sparse import coo_matrix
+from scipy.stats import mode

-from dataeval._internal.
-from dataeval._internal.
-from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree

-
-
+
+class BEROutput(NamedTuple):
+    """
+    Attributes
+    ----------
+    ber : float
+        The upper bounds of the Bayes Error Rate
+    ber_lower : float
+        The lower bounds of the Bayes Error Rate
+    """
+
+    ber: float
+    ber_lower: float
+
+
+def ber_mst(X: NDArray, y: NDArray) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using a minimum spanning tree
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+
+    tree = coo_matrix(minimum_spanning_tree(X))
+    matches = np.sum([y[tree.row[i]] != y[tree.col[i]] for i in range(N - 1)])
+    deltas = matches / (2 * N)
+    upper = 2 * deltas
+    lower = ((M - 1) / (M)) * (1 - max(1 - 2 * ((M) / (M - 1)) * deltas, 0) ** 0.5)
+    return upper, lower


-
+def ber_knn(X: NDArray, y: NDArray, k: int) -> Tuple[float, float]:
+    """Calculates the Bayes Error Rate using K-nearest neighbors
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        n_samples containing n_features
+    y : NDArray, shape - (N, 1)
+        Labels corresponding to each sample
+
+    Returns
+    -------
+    Tuple[float, float]
+        The upper and lower bounds of the bayes error rate
+    """
+    M, N = get_classes_counts(y)
+
+    # All features belong on second dimension
+    X = X.reshape((X.shape[0], -1))
+    nn_indices = compute_neighbors(X, X, k=k)
+    nn_indices = np.expand_dims(nn_indices, axis=1) if nn_indices.ndim == 1 else nn_indices
+    modal_class = mode(y[nn_indices], axis=1, keepdims=True).mode.squeeze()
+    upper = float(np.count_nonzero(modal_class - y) / N)
+    lower = knn_lowerbound(upper, M, k)
+    return upper, lower
+
+
+def knn_lowerbound(value: float, classes: int, k: int) -> float:
+    """Several cases for computing the BER lower bound"""
+    if value <= 1e-10:
+        return 0.0
+
+    if classes == 2 and k != 1:
+        if k > 5:
+            # Property 2 (Devroye, 1981) cited in Snoopy paper, not in snoopy repo
+            alpha = 0.3399
+            beta = 0.9749
+            a_k = alpha * np.sqrt(k) / (k - 3.25) * (1 + beta / (np.sqrt(k - 3)))
+            return value / (1 + a_k)
+        if k > 2:
+            return value / (1 + (1 / np.sqrt(k)))
+        # k == 2:
+        return value / 2
+
+    return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
+
+
+BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
+
+
+def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
    """
    An estimator for Multi-class Bayes Error Rate using FR or KNN test statistic basis

    Parameters
    ----------
-
-
+    images : ArrayLike (N, ... )
+        Array of images or image embeddings
+    labels : ArrayLike (N, 1)
+        Array of labels for each image or image embedding
    k : int, default 1
-
+        Number of nearest neighbors for KNN estimator -- ignored by MST estimator
+    method : Literal["KNN", "MST"], default "KNN"
+        Method to use when estimating the Bayes error rate

+    Returns
+    -------
+    BEROutput
+        The upper and lower bounds of the Bayes Error Rate

-
+    References
+    ----------
+    [1] `Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4) <https://arxiv.org/abs/1811.06419>`_
+
+    Examples
    --------
-
+    >>> import sklearn.datasets as dsets
+    >>> from dataeval.metrics import ber

-
+    >>> images, labels = dsets.make_blobs(n_samples=50, centers=2, n_features=2, random_state=0)

-
-
-
-
-
-
-
-
-    def evaluate(self, images: ArrayLike, labels: ArrayLike) -> Dict[str, float]:
-        """
-        Calculates the Bayes Error Rate estimate using the provided method
-
-        Parameters
-        ----------
-        images : ArrayLike (N, : )
-            Array of images or image embeddings
-        labels : ArrayLike (N, 1)
-            Array of labels for each image or image embedding
-
-        Returns
-        -------
-        Dict[str, float]
-            ber : float
-                The estimated lower bounds of the Bayes Error Rate
-            ber_lower : float
-                The estimated upper bounds of the Bayes Error Rate
-
-        Raises
-        ------
-        ValueError
-            If unique classes M < 2
-        """
-
-        upper, lower = self._method(to_numpy(images), to_numpy(labels), self.k)
-        return {"ber": upper, "ber_lower": lower}
+    >>> ber(images, labels)
+    BEROutput(ber=0.04, ber_lower=0.020416847668728033)
+    """
+    ber_fn = get_method(BER_FN_MAP, method)
+    X = to_numpy(images)
+    y = to_numpy(labels)
+    upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
+    return BEROutput(upper, lower)
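Reading the added `ber_mst` body above (a transcription of the code, not a claim made by the diff itself): with M classes, N samples, and `matches` counting MST edges whose endpoints carry different labels, the returned bounds are

```latex
\delta = \frac{\text{matches}}{2N}, \qquad
\widehat{\mathrm{BER}}_{\mathrm{upper}} = 2\delta, \qquad
\widehat{\mathrm{BER}}_{\mathrm{lower}} = \frac{M-1}{M}\left(1 - \sqrt{\max\!\left(1 - \frac{2M}{M-1}\,\delta,\ 0\right)}\right)
```

The KNN path instead uses the k-NN misclassification rate as the upper bound and converts it to a lower bound via `knn_lowerbound`.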
dataeval/_internal/metrics/coverage.py
@@ -1,18 +1,44 @@
-
+import math
+from typing import Literal, NamedTuple

import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.spatial.distance import pdist, squareform

-from dataeval._internal.
-from dataeval._internal.interop import ArrayLike, to_numpy
-from dataeval._internal.metrics.base import EvaluateMixin
+from dataeval._internal.interop import to_numpy


-class
+class CoverageOutput(NamedTuple):
+    """
+    Attributes
+    ----------
+    indices : np.ndarray
+        Array of uncovered indices
+    radii : np.ndarray
+        Array of critical value radii
+    critical_value : float
+        Radius for coverage
+    """
+
+    indices: NDArray[np.intp]
+    radii: NDArray[np.float64]
+    critical_value: float
+
+
+def coverage(
+    embeddings: ArrayLike,
+    radius_type: Literal["adaptive", "naive"] = "adaptive",
+    k: int = 20,
+    percent: np.float64 = np.float64(0.01),
+) -> CoverageOutput:
    """
    Class for evaluating coverage and identifying images/samples that are in undercovered regions.

    Parameters
    ----------
+    embeddings : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
    radius_type : Literal["adaptive", "naive"], default "adaptive"
        The function used to determine radius.
    k: int, default 20
@@ -21,76 +47,59 @@ class Coverage(EvaluateMixin):
    percent: np.float64, default np.float(0.01)
        Percent of observations to be considered uncovered. Only applies to adaptive radius.

+    Returns
+    -------
+    CoverageOutput
+        Array of uncovered indices, critical value radii, and the radius for coverage
+
+    Raises
+    ------
+    ValueError
+        If length of embeddings is less than or equal to k
+    ValueError
+        If radius_type is unknown
+
+    Note
+    ----
+    Embeddings should be on the unit interval.
+
+    Example
+    -------
+    >>> coverage(embeddings)
+    CoverageOutput(indices=array([], dtype=int64), radii=array([0.59307666, 0.56956307, 0.56328616, 0.70660265, 0.57778087,
+           0.53738624, 0.58968217, 1.27721334, 0.84378694, 0.67767021,
+           0.69680335, 1.35532621, 0.59764166, 0.8691945 , 0.83627602,
+           0.84187303, 0.62212358, 1.09039732, 0.67956797, 0.60134383,
+           0.83713908, 0.91784263, 1.12901193, 0.73907618, 0.63943983,
+           0.61188447, 0.47872713, 0.57207771, 0.92885883, 0.54750511,
+           0.83015726, 1.20721778, 0.50421928, 0.98312246, 0.59764166,
+           0.61009202, 0.73864073, 1.0381061 , 0.77598609, 0.72984036,
+           0.67573006, 0.48056064, 1.00050879, 0.89532971, 0.58395529,
+           0.95954793, 0.60134383, 1.10096454, 0.51955314, 0.73038702]), critical_value=0)
+
    Reference
    ---------
    This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
    [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    def evaluate(self, embeddings: ArrayLike) -> Tuple[np.ndarray, np.ndarray, float]:
-        """
-        Perform a one-way chi-squared test between observation frequencies and expected frequencies that
-        tests the null hypothesis that the observed data has the expected frequencies.
-
-        Parameters
-        ----------
-        embeddings : ArrayLike, shape - (N, P)
-            A dataset in an ArrayLike format.
-            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-        Returns
-        -------
-        np.ndarray
-            Array of uncovered indices
-        np.ndarray
-            Array of critical value radii
-        float
-            Radius for coverage
-
-        Raises
-        ------
-        ValueError
-            If length of embeddings is less than or equal to k
-        ValueError
-            If radius_type is unknown
-
-        Note
-        ----
-        Embeddings should be on the unit interval.
-
-        Example
-        -------
-        >>> cover.evaluate(embeddings)
-        (array([31, 7, 22, 37, 11]), array([0.35938604, 0.26462789, 0.20319609, 0.34140912, 0.31069921,
-               0.2308378 , 0.33300179, 0.69881025, 0.53587532, 0.35689803,
-               0.39333634, 0.67497874, 0.21788128, 0.43510162, 0.38601861,
-               0.34171868, 0.16941337, 0.66438044, 0.20319609, 0.19732733,
-               0.48660288, 0.5135814 , 0.69352653, 0.26946943, 0.31120605,
-               0.33067705, 0.30508271, 0.32802489, 0.51805702, 0.31120605,
-               0.40843265, 0.74996768, 0.31069921, 0.52263763, 0.26654013,
-               0.33113507, 0.40814838, 0.67723008, 0.48124375, 0.37243185,
-               0.29760001, 0.30907904, 0.59023236, 0.57778087, 0.21839853,
-               0.46067782, 0.31078966, 0.65199049, 0.26410603, 0.19542706]))
-        """
-
-        return coverage(to_numpy(embeddings), self.radius_type, self.k, self.percent)
+    """ # noqa: E501
+
+    # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
+    embeddings = to_numpy(embeddings)
+    n = len(embeddings)
+    if n <= k:
+        raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
+    mat = squareform(pdist(embeddings)).astype(np.float64)
+    sorted_dists = np.sort(mat, axis=1)
+    crit = sorted_dists[:, k + 1]
+
+    d = np.shape(embeddings)[1]
+    if radius_type == "naive":
+        rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
+        pvals = np.where(crit > rho)[0]
+    elif radius_type == "adaptive":
+        # Use data adaptive cutoff as rho
+        rho = int(n * percent)
+        pvals = np.argsort(crit)[::-1][:rho]
+    else:
+        raise ValueError("Invalid radius type.")
+    return CoverageOutput(pvals, crit, rho)
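In the new `coverage` body above, `crit` holds each sample's distance to its (k+1)-th nearest neighbor. Transcribing the added "naive" branch, with d the embedding dimension and n the number of samples, the cutoff radius is

```latex
\rho = \frac{1}{\sqrt{\pi}} \left( \frac{2\,k\,\Gamma\!\left(\tfrac{d}{2} + 1\right)}{n} \right)^{1/d}
```

and samples with `crit` above ρ are reported as uncovered; the "adaptive" branch instead reports the `int(n * percent)` samples with the largest critical distances.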
dataeval/_internal/metrics/divergence.py
@@ -3,27 +3,70 @@ This module contains the implementation of HP Divergence
using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
"""

-from typing import
+from typing import Literal, NamedTuple

import numpy as np
+from numpy.typing import ArrayLike

-from dataeval._internal.
-from dataeval._internal.
-from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+from dataeval._internal.interop import to_numpy
+from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree

-
-
+
+class DivergenceOutput(NamedTuple):
+    """
+    Attributes
+    ----------
+    divergence : float
+        Divergence value calculated between 2 datasets ranging between 0.0 and 1.0
+    errors : int
+        The number of differing edges between the datasets
+    """
+
+    divergence: float
+    errors: int
+
+
+def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
+    mst = minimum_spanning_tree(data).toarray()
+    edgelist = np.transpose(np.nonzero(mst))
+    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
+    return errors
+
+
+def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
+    nn_indices = compute_neighbors(data, data)
+    errors = np.sum(np.abs(labels[nn_indices] - labels))
+    return errors


-
+DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
+
+
+def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
    """
-    Calculates the
+    Calculates the divergence and any errors between the datasets

    Parameters
    ----------
-
+    data_a : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format to compare.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+    data_b : ArrayLike, shape - (N, P)
+        A dataset in an ArrayLike format to compare.
+        Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+    method : Literal["MST, "FNN"], default "FNN"
        Method used to estimate dataset divergence

+    Returns
+    -------
+    DivergenceOutput
+        The divergence value (0.0..1.0) and the number of differing edges between the datasets
+
+    Notes
+    -----
+    The divergence value indicates how similar the 2 datasets are
+    with 0 indicating approximately identical data distributions.
+
    Warning
    -------
    MST is very slow in this implementation, this is unlike matlab where
@@ -40,63 +83,20 @@ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):

    Examples
    --------
-
-
-    >>> divert = Divergence()
+    Evaluate the datasets:

-
-
-    >>> divert = Divergence(method="FNN")
+    >>> divergence(datasetA, datasetB)
+    DivergenceOutput(divergence=0.28, errors=36.0)
    """
-
-
-
-
-
-
-
-
-
-
-
-
-        Parameters
-        ----------
-        data_a : ArrayLike, shape - (N, P)
-            A dataset in an ArrayLike format to compare.
-            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-        data_b : ArrayLike, shape - (N, P)
-            A dataset in an ArrayLike format to compare.
-            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-        Returns
-        -------
-        Dict[str, Any]
-            divergence : float
-                divergence value between 0.0 and 1.0
-            error : int
-                the number of differing edges between the datasets
-
-        Notes
-        -----
-        The divergence value indicates how similar the 2 datasets are
-        with 0 indicating approximately identical data distributions.
-
-        Examples
-        --------
-        Evaluate the datasets:
-
-        >>> divert.evaluate(datasetA, datasetB)
-        {'divergence': 0.28, 'error': 36.0}
-        """
-        a = to_numpy(data_a)
-        b = to_numpy(data_b)
-        N = a.shape[0]
-        M = b.shape[0]
-
-        stacked_data = np.vstack((a, b))
-        labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
-
-        errors = self._method(stacked_data, labels)
-        dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
-        return {"divergence": dp, "error": errors}
+    div_fn = get_method(DIVERGENCE_FN_MAP, method)
+    a = to_numpy(data_a)
+    b = to_numpy(data_b)
+    N = a.shape[0]
+    M = b.shape[0]
+
+    stacked_data = np.vstack((a, b))
+    labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
+
+    errors = div_fn(stacked_data, labels)
+    dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
+    return DivergenceOutput(dp, errors)
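Transcribing the rewritten `divergence` body above: with N and M the sizes of `data_a` and `data_b`, and `errors` the number of cross-dataset edges found by the chosen MST or FNN method on the stacked data, the returned estimate is

```latex
\hat{D}_p = \max\!\left(0,\ 1 - \frac{M + N}{2\,M\,N}\,\mathrm{errors}\right)
```

which is 0 when the two datasets are thoroughly mixed and approaches 1 as they separate.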