dataeval 0.63.0__py3-none-any.whl → 0.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dataeval/__init__.py +4 -4
  2. dataeval/_internal/detectors/clusterer.py +47 -34
  3. dataeval/_internal/detectors/drift/base.py +53 -35
  4. dataeval/_internal/detectors/drift/cvm.py +5 -4
  5. dataeval/_internal/detectors/drift/ks.py +7 -6
  6. dataeval/_internal/detectors/drift/mmd.py +39 -19
  7. dataeval/_internal/detectors/drift/torch.py +6 -5
  8. dataeval/_internal/detectors/drift/uncertainty.py +7 -8
  9. dataeval/_internal/detectors/duplicates.py +57 -30
  10. dataeval/_internal/detectors/linter.py +40 -24
  11. dataeval/_internal/detectors/ood/ae.py +2 -1
  12. dataeval/_internal/detectors/ood/aegmm.py +2 -1
  13. dataeval/_internal/detectors/ood/base.py +37 -15
  14. dataeval/_internal/detectors/ood/llr.py +9 -8
  15. dataeval/_internal/detectors/ood/vae.py +2 -1
  16. dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  17. dataeval/_internal/flags.py +42 -21
  18. dataeval/_internal/interop.py +3 -12
  19. dataeval/_internal/metrics/balance.py +188 -0
  20. dataeval/_internal/metrics/ber.py +123 -48
  21. dataeval/_internal/metrics/coverage.py +90 -74
  22. dataeval/_internal/metrics/divergence.py +101 -67
  23. dataeval/_internal/metrics/diversity.py +211 -0
  24. dataeval/_internal/metrics/parity.py +287 -155
  25. dataeval/_internal/metrics/stats.py +198 -317
  26. dataeval/_internal/metrics/uap.py +40 -29
  27. dataeval/_internal/metrics/utils.py +430 -0
  28. dataeval/_internal/models/tensorflow/losses.py +3 -3
  29. dataeval/_internal/models/tensorflow/trainer.py +3 -2
  30. dataeval/_internal/models/tensorflow/utils.py +4 -3
  31. dataeval/_internal/output.py +82 -0
  32. dataeval/_internal/utils.py +64 -0
  33. dataeval/_internal/workflows/sufficiency.py +96 -107
  34. dataeval/flags/__init__.py +2 -2
  35. dataeval/metrics/__init__.py +26 -7
  36. dataeval/utils/__init__.py +9 -0
  37. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
  38. dataeval-0.65.0.dist-info/RECORD +60 -0
  39. dataeval/_internal/functional/__init__.py +0 -0
  40. dataeval/_internal/functional/ber.py +0 -63
  41. dataeval/_internal/functional/coverage.py +0 -75
  42. dataeval/_internal/functional/divergence.py +0 -16
  43. dataeval/_internal/functional/hash.py +0 -79
  44. dataeval/_internal/functional/metadata.py +0 -136
  45. dataeval/_internal/functional/metadataparity.py +0 -190
  46. dataeval/_internal/functional/uap.py +0 -6
  47. dataeval/_internal/functional/utils.py +0 -158
  48. dataeval/_internal/maite/__init__.py +0 -0
  49. dataeval/_internal/maite/utils.py +0 -30
  50. dataeval/_internal/metrics/base.py +0 -92
  51. dataeval/_internal/metrics/metadata.py +0 -610
  52. dataeval/_internal/metrics/metadataparity.py +0 -67
  53. dataeval-0.63.0.dist-info/RECORD +0 -68
  54. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
  55. {dataeval-0.63.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/divergence.py

@@ -3,27 +3,104 @@ This module contains the implementation of HP Divergence
  using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
  """

- from typing import Any, Callable, Dict, Literal
+ from dataclasses import dataclass
+ from typing import Literal

  import numpy as np
+ from numpy.typing import ArrayLike, NDArray

- from dataeval._internal.functional.divergence import divergence_fnn, divergence_mst
- from dataeval._internal.interop import ArrayLike, to_numpy
- from dataeval._internal.metrics.base import EvaluateMixin, MethodsMixin
+ from dataeval._internal.interop import to_numpy
+ from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
+ from dataeval._internal.output import OutputMetadata, set_metadata

- _METHODS = Literal["MST", "FNN"]
- _FUNCTION = Callable[[np.ndarray, np.ndarray], int]
+
+ @dataclass(frozen=True)
+ class DivergenceOutput(OutputMetadata):
+     """
+     Attributes
+     ----------
+     divergence : float
+         Divergence value calculated between 2 datasets ranging between 0.0 and 1.0
+     errors : int
+         The number of differing edges between the datasets
+     """
+
+     divergence: float
+     errors: int
+
+
+ def divergence_mst(data: NDArray, labels: NDArray) -> int:
+     """
+     Calculates the estimated label errors based on the minimum spanning tree
+
+     Parameters
+     ----------
+     data : NDArray, shape - (N, ... )
+         Input images to be grouped
+     labels : NDArray
+         Corresponding labels for each data point
+
+     Returns
+     -------
+     int
+         Number of label errors when creating the minimum spanning tree
+     """
+     mst = minimum_spanning_tree(data).toarray()
+     edgelist = np.transpose(np.nonzero(mst))
+     errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
+     return errors


- class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):
+ def divergence_fnn(data: NDArray, labels: NDArray) -> int:
      """
-     Calculates the estimated HP divergence between two datasets
+     Calculates the estimated label errors based on their nearest neighbors

      Parameters
      ----------
-     method : Literal["MST, "FNN"], default "MST"
+     data : NDArray, shape - (N, ... )
+         Input images to be grouped
+     labels : NDArray
+         Corresponding labels for each data point
+
+     Returns
+     -------
+     int
+         Number of label errors when finding nearest neighbors
+     """
+     nn_indices = compute_neighbors(data, data)
+     errors = np.sum(np.abs(labels[nn_indices] - labels))
+     return errors
+
+
+ DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
+
+
+ @set_metadata("dataeval.metrics")
+ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
+     """
+     Calculates the divergence and any errors between the datasets
+
+     Parameters
+     ----------
+     data_a : ArrayLike, shape - (N, P)
+         A dataset in an ArrayLike format to compare.
+         Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
+     data_b : ArrayLike, shape - (N, P)
+         A dataset in an ArrayLike format to compare.
+         Function expects the data to have 2 dimensions, N number of observations in a P-dimensionial space.
+     method : Literal["MST, "FNN"], default "FNN"
          Method used to estimate dataset divergence

+     Returns
+     -------
+     DivergenceOutput
+         The divergence value (0.0..1.0) and the number of differing edges between the datasets
+
+     Notes
+     -----
+     The divergence value indicates how similar the 2 datasets are
+     with 0 indicating approximately identical data distributions.
+
      Warning
      -------
      MST is very slow in this implementation, this is unlike matlab where

@@ -40,63 +117,20 @@ class Divergence(EvaluateMixin, MethodsMixin[_METHODS, _FUNCTION]):

      Examples
      --------
-     Initialize the Divergence class:
+     Evaluate the datasets:

-     >>> divert = Divergence()
-
-     Specify the method:
-
-     >>> divert = Divergence(method="FNN")
+     >>> divergence(datasetA, datasetB)
+     DivergenceOutput(divergence=0.28, errors=36.0)
      """
-
-     def __init__(self, method: _METHODS = "MST") -> None:
-         self._set_method(method)
-
-     @classmethod
-     def _methods(cls) -> Dict[str, _FUNCTION]:
-         return {"FNN": divergence_fnn, "MST": divergence_mst}
-
-     def evaluate(self, data_a: ArrayLike, data_b: ArrayLike) -> Dict[str, Any]:
-         """
-         Calculates the divergence and any errors between the datasets
-
-         Parameters
-         ----------
-         data_a : ArrayLike, shape - (N, P)
-             A dataset in an ArrayLike format to compare.
-             Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-         data_b : ArrayLike, shape - (N, P)
-             A dataset in an ArrayLike format to compare.
-             Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-
-         Returns
-         -------
-         Dict[str, Any]
-             divergence : float
-                 divergence value between 0.0 and 1.0
-             error : int
-                 the number of differing edges between the datasets
-
-         Notes
-         -----
-         The divergence value indicates how similar the 2 datasets are
-         with 0 indicating approximately identical data distributions.
-
-         Examples
-         --------
-         Evaluate the datasets:
-
-         >>> divert.evaluate(datasetA, datasetB)
-         {'divergence': 0.28, 'error': 36.0}
-         """
-         a = to_numpy(data_a)
-         b = to_numpy(data_b)
-         N = a.shape[0]
-         M = b.shape[0]
-
-         stacked_data = np.vstack((a, b))
-         labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
-
-         errors = self._method(stacked_data, labels)
-         dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
-         return {"divergence": dp, "error": errors}
+     div_fn = get_method(DIVERGENCE_FN_MAP, method)
+     a = to_numpy(data_a)
+     b = to_numpy(data_b)
+     N = a.shape[0]
+     M = b.shape[0]
+
+     stacked_data = np.vstack((a, b))
+     labels = np.vstack([np.zeros([N, 1]), np.ones([M, 1])])
+
+     errors = div_fn(stacked_data, labels)
+     dp = max(0.0, 1 - ((M + N) / (2 * M * N)) * errors)
+     return DivergenceOutput(dp, errors)
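
In 0.65.0 the class-based Divergence().evaluate(...) API is replaced by a single divergence() function that returns a frozen DivergenceOutput dataclass instead of a dict. A minimal usage sketch, assuming the function is re-exported from dataeval.metrics (per the updated dataeval/metrics/__init__.py) and using synthetic inputs for illustration only:

    import numpy as np

    # Assumption: `divergence` is exposed at dataeval.metrics in 0.65.0; adjust the import if needed.
    from dataeval.metrics import divergence

    rng = np.random.default_rng(0)
    embeddings_a = rng.normal(size=(128, 16))          # (N, P) observations
    embeddings_b = rng.normal(loc=0.5, size=(96, 16))  # (M, P) observations

    out = divergence(embeddings_a, embeddings_b, method="FNN")
    # Dataclass fields replace the old {'divergence': ..., 'error': ...} dict.
    print(out.divergence, out.errors)
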
dataeval/_internal/metrics/diversity.py (new file)

@@ -0,0 +1,211 @@
+ from dataclasses import dataclass
+ from typing import Dict, List, Literal, Optional, Sequence
+
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
+ from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+ @dataclass(frozen=True)
+ class DiversityOutput(OutputMetadata):
+     """
+     Attributes
+     ----------
+     diversity_index : NDArray[np.float64]
+         Diversity index for classes and factors
+     """
+
+     diversity_index: NDArray[np.float64]
+
+
+ def diversity_shannon(
+     data: NDArray,
+     names: List[str],
+     is_categorical: List[bool],
+     subset_mask: Optional[NDArray[np.bool_]] = None,
+ ) -> NDArray:
+     """
+     Compute diversity for discrete/categorical variables and, through standard
+     histogram binning, for continuous variables.
+
+     We define diversity as a normalized form of the Shannon entropy.
+
+     diversity = 1 implies that samples are evenly distributed across a particular factor
+     diversity = 0 implies that all samples belong to one category/bin
+
+     Parameters
+     ----------
+     subset_mask: Optional[NDArray[np.bool_]]
+         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+     Notes
+     -----
+     For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+
+     Returns
+     -------
+     diversity_index: NDArray
+         Diversity index per column of X
+
+     See Also
+     --------
+     numpy.histogram
+     """
+
+     # entropy computed using global auto bins so that we can properly normalize
+     ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
+     # normalize by global counts rather than classwise counts
+     num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
+     return ent_unnormalized / np.log(num_bins)
+
+
+ def diversity_simpson(
+     data: NDArray,
+     names: List[str],
+     is_categorical: List[bool],
+     subset_mask: Optional[NDArray[np.bool_]] = None,
+ ) -> NDArray:
+     """
+     Compute diversity for discrete/categorical variables and, through standard
+     histogram binning, for continuous variables.
+
+     We define diversity as a normalized form of the inverse Simpson diversity
+     index.
+
+     diversity = 1 implies that samples are evenly distributed across a particular factor
+     diversity = 1/num_categories implies that all samples belong to one category/bin
+
+     Parameters
+     ----------
+     subset_mask: Optional[NDArray[np.bool_]]
+         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+     Notes
+     -----
+     For continuous variables, histogram bins are chosen automatically. See
+     numpy.histogram for details.
+     The expression is undefined for q=1, but it approaches the Shannon entropy
+     in the limit.
+     If there is only one category, the diversity index takes a value of 1 =
+     1/N = 1/1. Entropy will take a value of 0.
+
+     Returns
+     -------
+     NDArray
+         Diversity index per column of X
+
+     See Also
+     --------
+     numpy.histogram
+     """
+
+     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+     # normalize by global counts, not classwise counts
+     num_bins = get_num_bins(data, names, is_categorical)
+
+     ev_index = np.empty(len(names))
+     # loop over columns for convenience
+     for col, cnts in enumerate(hist_counts.values()):
+         # relative frequencies
+         p_i = cnts / cnts.sum()
+         # inverse Simpson index normalized by (number of bins)
+         ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
+
+     return ev_index
+
+
+ DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
+
+
+ @set_metadata("dataeval.metrics")
+ def diversity(
+     class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+ ) -> DiversityOutput:
+     """
+     Compute diversity for discrete/categorical variables and, through standard
+     histogram binning, for continuous variables.
+
+     diversity = 1 implies that samples are evenly distributed across a particular factor
+     diversity = 0 implies that all samples belong to one category/bin
+
+     Parameters
+     ----------
+     class_labels: Sequence[int]
+         List of class labels for each image
+     metadata: List[Dict]
+         List of metadata factors for each image
+     metric: Literal["shannon", "simpson"], default "simpson"
+         string variable indicating which diversity index should be used.
+         Permissible values include "simpson" and "shannon"
+
+     Notes
+     -----
+     - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+
+     Returns
+     -------
+     DiversityOutput
+         Diversity index per column of self.data or each factor in self.names
+
+     See Also
+     --------
+     numpy.histogram
+     """
+     diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+     return DiversityOutput(diversity_index)
+
+
+ @set_metadata("dataeval.metrics")
+ def diversity_classwise(
+     class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
+ ) -> DiversityOutput:
+     """
+     Compute diversity for discrete/categorical variables and, through standard
+     histogram binning, for continuous variables.
+
+     We define diversity as a normalized form of the inverse Simpson diversity
+     index.
+
+     diversity = 1 implies that samples are evenly distributed across a particular factor
+     diversity = 1/num_categories implies that all samples belong to one category/bin
+
+     Parameters
+     ----------
+     class_labels: Sequence[int]
+         List of class labels for each image
+     metadata: List[Dict]
+         List of metadata factors for each image
+
+     Notes
+     -----
+     - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
+
+     Returns
+     -------
+     DiversityOutput
+         Diversity index [n_class x n_factor]
+
+     See Also
+     --------
+     numpy.histogram
+     """
+     diversity_fn = get_method(DIVERSITY_FN_MAP, method)
+     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     class_idx = names.index("class_label")
+     class_lbl = data[:, class_idx]
+
+     u_classes = np.unique(class_lbl)
+     num_factors = len(names)
+     diversity = np.empty((len(u_classes), num_factors))
+     diversity[:] = np.nan
+     for idx, cls in enumerate(u_classes):
+         subset_mask = class_lbl == cls
+         diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
+     div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
+     return DiversityOutput(div_no_class)
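
The new diversity metric takes per-image class labels and per-image metadata factor dicts and returns a DiversityOutput dataclass. A hypothetical call, assuming diversity and diversity_classwise are re-exported from dataeval.metrics and that string-valued factors are treated as categorical by preprocess_metadata; the labels and factors below are toy values for illustration only:

    # Assumption: both functions are exposed at dataeval.metrics in 0.65.0; adjust the import if needed.
    from dataeval.metrics import diversity, diversity_classwise

    # Toy per-image class labels and metadata factors (illustrative only).
    class_labels = [0, 0, 1, 1, 2, 2]
    metadata = [
        {"altitude": 100, "weather": "clear"},
        {"altitude": 120, "weather": "clear"},
        {"altitude": 110, "weather": "rain"},
        {"altitude": 400, "weather": "rain"},
        {"altitude": 395, "weather": "clear"},
        {"altitude": 130, "weather": "snow"},
    ]

    overall = diversity(class_labels, metadata, method="simpson")
    per_class = diversity_classwise(class_labels, metadata, method="shannon")

    print(overall.diversity_index)    # one index per factor (including the class label)
    print(per_class.diversity_index)  # shape (n_classes, n_factors), class label column dropped
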