dataeval 0.61.0__py3-none-any.whl → 0.63.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_internal/detectors/clusterer.py +44 -16
- dataeval/_internal/detectors/drift/base.py +14 -12
- dataeval/_internal/detectors/drift/cvm.py +11 -8
- dataeval/_internal/detectors/drift/ks.py +6 -3
- dataeval/_internal/detectors/drift/mmd.py +14 -12
- dataeval/_internal/detectors/drift/uncertainty.py +7 -5
- dataeval/_internal/detectors/duplicates.py +35 -12
- dataeval/_internal/detectors/linter.py +85 -16
- dataeval/_internal/detectors/ood/ae.py +6 -5
- dataeval/_internal/detectors/ood/aegmm.py +5 -5
- dataeval/_internal/detectors/ood/base.py +14 -13
- dataeval/_internal/detectors/ood/llr.py +6 -4
- dataeval/_internal/detectors/ood/vae.py +5 -4
- dataeval/_internal/detectors/ood/vaegmm.py +5 -4
- dataeval/_internal/functional/__init__.py +0 -0
- dataeval/_internal/functional/ber.py +63 -0
- dataeval/_internal/functional/coverage.py +75 -0
- dataeval/_internal/functional/divergence.py +16 -0
- dataeval/_internal/{metrics → functional}/hash.py +1 -1
- dataeval/_internal/functional/metadata.py +136 -0
- dataeval/_internal/functional/metadataparity.py +190 -0
- dataeval/_internal/functional/uap.py +6 -0
- dataeval/_internal/interop.py +52 -0
- dataeval/_internal/maite/__init__.py +0 -0
- dataeval/_internal/maite/utils.py +30 -0
- dataeval/_internal/metrics/base.py +2 -2
- dataeval/_internal/metrics/ber.py +16 -66
- dataeval/_internal/metrics/coverage.py +51 -35
- dataeval/_internal/metrics/divergence.py +50 -42
- dataeval/_internal/metrics/metadata.py +610 -0
- dataeval/_internal/metrics/metadataparity.py +67 -0
- dataeval/_internal/metrics/parity.py +40 -56
- dataeval/_internal/metrics/stats.py +46 -35
- dataeval/_internal/metrics/uap.py +14 -17
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/metrics/__init__.py +2 -1
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
- dataeval-0.63.0.dist-info/RECORD +68 -0
- dataeval-0.61.0.dist-info/RECORD +0 -55
- /dataeval/_internal/{metrics → functional}/utils.py +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/divergence.py

@@ -7,46 +7,23 @@ from typing import Any, Callable, Dict, Literal
 
 import numpy as np
 
+from dataeval._internal.functional.divergence import divergence_fnn, divergence_mst
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
 
-from .utils import compute_neighbors, minimum_spanning_tree
-
-
-def _mst(data: np.ndarray, labels: np.ndarray) -> int:
-    mst = minimum_spanning_tree(data).toarray()
-    edgelist = np.transpose(np.nonzero(mst))
-    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
-    return errors
-
-
-def _fnn(data: np.ndarray, labels: np.ndarray) -> int:
-    nn_indices = compute_neighbors(data, data)
-    errors = np.sum(np.abs(labels[nn_indices] - labels))
-    return errors
-
-
 _METHODS = Literal["MST", "FNN"]
 _FUNCTION = Callable[[np.ndarray, np.ndarray], int]
 
 
 class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
     """
-    Calculates the estimated divergence between two datasets
+    Calculates the estimated HP divergence between two datasets
 
     Parameters
     ----------
-    data_a : np.ndarray
-        Array of images or image embeddings to compare
-    data_b : np.ndarray
-        Array of images or image embeddings to compare
     method : Literal["MST, "FNN"], default "MST"
         Method used to estimate dataset divergence
 
-    See Also
-    --------
-    For more information about this divergence, its formal definition,
-    and its associated estimators see https://arxiv.org/abs/1412.6534.
-
     Warning
     -------
     MST is very slow in this implementation, this is unlike matlab where
@@ -55,38 +32,69 @@ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
     Source of slowdown:
     conversion to and from CSR format adds ~10% of the time diff between
     1nn and scipy mst function the remaining 90%
+
+    References
+    ----------
+    For more information about this divergence, its formal definition,
+    and its associated estimators see https://arxiv.org/abs/1412.6534.
+
+    Examples
+    --------
+    Initialize the Divergence class:
+
+    >>> divert = Divergence()
+
+    Specify the method:
+
+    >>> divert = Divergence(method="FNN")
     """
 
-    def __init__(
-        self,
-        data_a: np.ndarray,
-        data_b: np.ndarray,
-        method: _METHODS = "MST",
-    ) -> None:
-        self.data_a = data_a
-        self.data_b = data_b
+    def __init__(self, method: _METHODS = "MST") -> None:
         self._set_method(method)
 
     @classmethod
     def _methods(cls) -> Dict[str, _FUNCTION]:
-        return {"FNN":
+        return {"FNN": divergence_fnn, "MST": divergence_mst}
 
-    def evaluate(self) -> Dict[str, Any]:
+    def evaluate(self, data_a: ArrayLike, data_b: ArrayLike) -> Dict[str, Any]:
         """
         Calculates the divergence and any errors between the datasets
 
+        Parameters
+        ----------
+        data_a : ArrayLike, shape - (N, P)
+            A dataset in an ArrayLike format to compare.
+            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+        data_b : ArrayLike, shape - (N, P)
+            A dataset in an ArrayLike format to compare.
+            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+
         Returns
         -------
         Dict[str, Any]
-
+            divergence : float
                 divergence value between 0.0 and 1.0
-
-                the number of differing edges
+            error : int
+                the number of differing edges between the datasets
+
+        Notes
+        -----
+        The divergence value indicates how similar the 2 datasets are
+        with 0 indicating approximately identical data distributions.
+
+        Examples
+        --------
+        Evaluate the datasets:
+
+        >>> divert.evaluate(datasetA, datasetB)
+        {'divergence': 0.28, 'error': 36.0}
         """
-
-
+        a = to_numpy(data_a)
+        b = to_numpy(data_b)
+        N = a.shape[0]
+        M = b.shape[0]
 
-        stacked_data = np.vstack((
+        stacked_data = np.vstack((a, b))
         labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
 
         errors = self._method(stacked_data, labels)