dataeval 0.61.0__py3-none-any.whl → 0.63.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_internal/detectors/clusterer.py +44 -16
  3. dataeval/_internal/detectors/drift/base.py +14 -12
  4. dataeval/_internal/detectors/drift/cvm.py +11 -8
  5. dataeval/_internal/detectors/drift/ks.py +6 -3
  6. dataeval/_internal/detectors/drift/mmd.py +14 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +7 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -12
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +6 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +5 -5
  12. dataeval/_internal/detectors/ood/base.py +14 -13
  13. dataeval/_internal/detectors/ood/llr.py +6 -4
  14. dataeval/_internal/detectors/ood/vae.py +5 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +5 -4
  16. dataeval/_internal/functional/__init__.py +0 -0
  17. dataeval/_internal/functional/ber.py +63 -0
  18. dataeval/_internal/functional/coverage.py +75 -0
  19. dataeval/_internal/functional/divergence.py +16 -0
  20. dataeval/_internal/{metrics → functional}/hash.py +1 -1
  21. dataeval/_internal/functional/metadata.py +136 -0
  22. dataeval/_internal/functional/metadataparity.py +190 -0
  23. dataeval/_internal/functional/uap.py +6 -0
  24. dataeval/_internal/interop.py +52 -0
  25. dataeval/_internal/maite/__init__.py +0 -0
  26. dataeval/_internal/maite/utils.py +30 -0
  27. dataeval/_internal/metrics/base.py +2 -2
  28. dataeval/_internal/metrics/ber.py +16 -66
  29. dataeval/_internal/metrics/coverage.py +51 -35
  30. dataeval/_internal/metrics/divergence.py +50 -42
  31. dataeval/_internal/metrics/metadata.py +610 -0
  32. dataeval/_internal/metrics/metadataparity.py +67 -0
  33. dataeval/_internal/metrics/parity.py +40 -56
  34. dataeval/_internal/metrics/stats.py +46 -35
  35. dataeval/_internal/metrics/uap.py +14 -17
  36. dataeval/_internal/workflows/__init__.py +0 -0
  37. dataeval/metrics/__init__.py +2 -1
  38. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/METADATA +1 -2
  39. dataeval-0.63.0.dist-info/RECORD +68 -0
  40. dataeval-0.61.0.dist-info/RECORD +0 -55
  41. dataeval/_internal/{metrics → functional}/utils.py +0 -0
  42. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/LICENSE.txt +0 -0
  43. {dataeval-0.61.0.dist-info → dataeval-0.63.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/divergence.py

@@ -7,46 +7,23 @@ from typing import Any, Callable, Dict, Literal
 
 import numpy as np
 
+from dataeval._internal.functional.divergence import divergence_fnn, divergence_mst
+from dataeval._internal.interop import ArrayLike, to_numpy
 from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
 
-from .utils import compute_neighbors, minimum_spanning_tree
-
-
-def _mst(data: np.ndarray, labels: np.ndarray) -> int:
-    mst = minimum_spanning_tree(data).toarray()
-    edgelist = np.transpose(np.nonzero(mst))
-    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
-    return errors
-
-
-def _fnn(data: np.ndarray, labels: np.ndarray) -> int:
-    nn_indices = compute_neighbors(data, data)
-    errors = np.sum(np.abs(labels[nn_indices] - labels))
-    return errors
-
-
 _METHODS = Literal["MST", "FNN"]
 _FUNCTION = Callable[[np.ndarray, np.ndarray], int]
 
 
 class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
     """
-    Calculates the estimated divergence between two datasets
+    Calculates the estimated HP divergence between two datasets
 
     Parameters
     ----------
-    data_a : np.ndarray
-        Array of images or image embeddings to compare
-    data_b : np.ndarray
-        Array of images or image embeddings to compare
     method : Literal["MST, "FNN"], default "MST"
         Method used to estimate dataset divergence
 
-    See Also
-    --------
-    For more information about this divergence, its formal definition,
-    and its associated estimators see https://arxiv.org/abs/1412.6534.
-
     Warning
     -------
     MST is very slow in this implementation, this is unlike matlab where
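The two helpers deleted above are not dropped: the new import pulls divergence_fnn and divergence_mst from dataeval/_internal/functional/divergence.py (file 19 in the list, +16 lines), and utils.py moves into the same functional package unchanged (file 41). A plausible reconstruction of the new module, assuming it is a straight relocation and rename of the deleted code:

import numpy as np

# Assumes the relocated utils module keeps its original public names (file 41 shows a
# pure move with +0 -0 changes).
from dataeval._internal.functional.utils import compute_neighbors, minimum_spanning_tree


def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
    # Count minimum-spanning-tree edges whose endpoints come from different datasets
    mst = minimum_spanning_tree(data).toarray()
    edgelist = np.transpose(np.nonzero(mst))
    errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
    return errors


def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
    # Count nearest-neighbor pairs whose dataset labels disagree
    nn_indices = compute_neighbors(data, data)
    errors = np.sum(np.abs(labels[nn_indices] - labels))
    return errors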
@@ -55,38 +32,69 @@ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
     Source of slowdown:
     conversion to and from CSR format adds ~10% of the time diff between
     1nn and scipy mst function the remaining 90%
+
+    References
+    ----------
+    For more information about this divergence, its formal definition,
+    and its associated estimators see https://arxiv.org/abs/1412.6534.
+
+    Examples
+    --------
+    Initialize the Divergence class:
+
+    >>> divert = Divergence()
+
+    Specify the method:
+
+    >>> divert = Divergence(method="FNN")
     """
 
-    def __init__(
-        self,
-        data_a: np.ndarray,
-        data_b: np.ndarray,
-        method: _METHODS = "MST",
-    ) -> None:
-        self.data_a = data_a
-        self.data_b = data_b
+    def __init__(self, method: _METHODS = "MST") -> None:
         self._set_method(method)
 
     @classmethod
     def _methods(cls) -> Dict[str, _FUNCTION]:
-        return {"FNN": _fnn, "MST": _mst}
+        return {"FNN": divergence_fnn, "MST": divergence_mst}
 
-    def evaluate(self) -> Dict[str, Any]:
+    def evaluate(self, data_a: ArrayLike, data_b: ArrayLike) -> Dict[str, Any]:
         """
         Calculates the divergence and any errors between the datasets
 
+        Parameters
+        ----------
+        data_a : ArrayLike, shape - (N, P)
+            A dataset in an ArrayLike format to compare.
+            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+        data_b : ArrayLike, shape - (N, P)
+            A dataset in an ArrayLike format to compare.
+            Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
+
         Returns
         -------
         Dict[str, Any]
-            dp : float
+            divergence : float
                 divergence value between 0.0 and 1.0
-            errors : int
-                the number of differing edges
+            error : int
+                the number of differing edges between the datasets
+
+        Notes
+        -----
+        The divergence value indicates how similar the 2 datasets are
+        with 0 indicating approximately identical data distributions.
+
+        Examples
+        --------
+        Evaluate the datasets:
+
+        >>> divert.evaluate(datasetA, datasetB)
+        {'divergence': 0.28, 'error': 36.0}
         """
-        N = self.data_a.shape[0]
-        M = self.data_b.shape[0]
+        a = to_numpy(data_a)
+        b = to_numpy(data_b)
+        N = a.shape[0]
+        M = b.shape[0]
 
-        stacked_data = np.vstack((self.data_a, self.data_b))
+        stacked_data = np.vstack((a, b))
         labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
 
         errors = self._method(stacked_data, labels)
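The net effect of this hunk is a breaking API change: the datasets move out of the constructor and into evaluate(), which now accepts any ArrayLike and converts it with the new interop module's to_numpy. A minimal migration sketch, importing from the internal module path shown in this diff (a public alias may also exist via dataeval/metrics/__init__.py) and using hypothetical random embeddings, so the printed values will differ:

import numpy as np

from dataeval._internal.metrics.divergence import Divergence

rng = np.random.default_rng(0)
emb_a = rng.normal(size=(128, 16))  # hypothetical: N=128 observations in a P=16 dimensional space
emb_b = rng.normal(size=(128, 16))

# 0.61.0: data was bound at construction time
#   div = Divergence(emb_a, emb_b, method="MST")
#   results = div.evaluate()

# 0.63.0: construct the estimator once, pass data per call
div = Divergence(method="MST")
results = div.evaluate(emb_a, emb_b)
print(results["divergence"], results["error"])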