dataeval 0.64.0__py3-none-any.whl → 0.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. dataeval/__init__.py +2 -2
  2. dataeval/_internal/detectors/clusterer.py +46 -34
  3. dataeval/_internal/detectors/drift/base.py +52 -35
  4. dataeval/_internal/detectors/drift/cvm.py +4 -4
  5. dataeval/_internal/detectors/drift/ks.py +6 -6
  6. dataeval/_internal/detectors/drift/mmd.py +35 -16
  7. dataeval/_internal/detectors/drift/torch.py +6 -5
  8. dataeval/_internal/detectors/drift/uncertainty.py +7 -7
  9. dataeval/_internal/detectors/duplicates.py +55 -29
  10. dataeval/_internal/detectors/linter.py +40 -24
  11. dataeval/_internal/detectors/ood/base.py +36 -15
  12. dataeval/_internal/detectors/ood/llr.py +7 -7
  13. dataeval/_internal/flags.py +42 -21
  14. dataeval/_internal/interop.py +2 -2
  15. dataeval/_internal/metrics/balance.py +10 -2
  16. dataeval/_internal/metrics/ber.py +6 -5
  17. dataeval/_internal/metrics/coverage.py +15 -8
  18. dataeval/_internal/metrics/divergence.py +41 -7
  19. dataeval/_internal/metrics/diversity.py +17 -12
  20. dataeval/_internal/metrics/parity.py +30 -43
  21. dataeval/_internal/metrics/stats.py +196 -317
  22. dataeval/_internal/metrics/uap.py +5 -2
  23. dataeval/_internal/metrics/utils.py +70 -33
  24. dataeval/_internal/models/tensorflow/losses.py +3 -3
  25. dataeval/_internal/models/tensorflow/trainer.py +3 -2
  26. dataeval/_internal/models/tensorflow/utils.py +4 -3
  27. dataeval/_internal/output.py +82 -0
  28. dataeval/_internal/workflows/sufficiency.py +96 -107
  29. dataeval/flags/__init__.py +2 -2
  30. dataeval/metrics/__init__.py +3 -3
  31. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
  32. dataeval-0.65.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/base.py +0 -10
  34. dataeval-0.64.0.dist-info/RECORD +0 -60
  35. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
@@ -1,20 +1,24 @@
1
1
  import math
2
- from typing import Literal, NamedTuple
2
+ from dataclasses import dataclass
3
+ from typing import Literal
3
4
 
4
5
  import numpy as np
5
6
  from numpy.typing import ArrayLike, NDArray
6
7
  from scipy.spatial.distance import pdist, squareform
7
8
 
8
9
  from dataeval._internal.interop import to_numpy
10
+ from dataeval._internal.metrics.utils import flatten
11
+ from dataeval._internal.output import OutputMetadata, set_metadata
9
12
 
10
13
 
11
- class CoverageOutput(NamedTuple):
14
+ @dataclass(frozen=True)
15
+ class CoverageOutput(OutputMetadata):
12
16
  """
13
17
  Attributes
14
18
  ----------
15
- indices : np.ndarray
19
+ indices : NDArray
16
20
  Array of uncovered indices
17
- radii : np.ndarray
21
+ radii : NDArray
18
22
  Array of critical value radii
19
23
  critical_value : float
20
24
  Radius for coverage
@@ -25,6 +29,7 @@ class CoverageOutput(NamedTuple):
25
29
  critical_value: float
26
30
 
27
31
 
32
+ @set_metadata("dataeval.metrics")
28
33
  def coverage(
29
34
  embeddings: ArrayLike,
30
35
  radius_type: Literal["adaptive", "naive"] = "adaptive",
@@ -87,12 +92,14 @@ def coverage(
87
92
  embeddings = to_numpy(embeddings)
88
93
  n = len(embeddings)
89
94
  if n <= k:
90
- raise ValueError("Number of observations less than or equal to the specified number of neighbors.")
91
- mat = squareform(pdist(embeddings)).astype(np.float64)
95
+ raise ValueError(
96
+ f"Number of observations n={n} is less than or equal to the specified number of neighbors k={k}."
97
+ )
98
+ mat = squareform(pdist(flatten(embeddings))).astype(np.float64)
92
99
  sorted_dists = np.sort(mat, axis=1)
93
100
  crit = sorted_dists[:, k + 1]
94
101
 
95
- d = np.shape(embeddings)[1]
102
+ d = embeddings.shape[1]
96
103
  if radius_type == "naive":
97
104
  rho = (1 / math.sqrt(math.pi)) * ((2 * k * math.gamma(d / 2 + 1)) / (n)) ** (1 / d)
98
105
  pvals = np.where(crit > rho)[0]
@@ -101,5 +108,5 @@ def coverage(
101
108
  rho = int(n * percent)
102
109
  pvals = np.argsort(crit)[::-1][:rho]
103
110
  else:
104
- raise ValueError("Invalid radius type.")
111
+ raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
105
112
  return CoverageOutput(pvals, crit, rho)
@@ -3,16 +3,19 @@ This module contains the implementation of HP Divergence
3
3
  using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
4
4
  """
5
5
 
6
- from typing import Literal, NamedTuple
6
+ from dataclasses import dataclass
7
+ from typing import Literal
7
8
 
8
9
  import numpy as np
9
- from numpy.typing import ArrayLike
10
+ from numpy.typing import ArrayLike, NDArray
10
11
 
11
12
  from dataeval._internal.interop import to_numpy
12
13
  from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
14
+ from dataeval._internal.output import OutputMetadata, set_metadata
13
15
 
14
16
 
15
- class DivergenceOutput(NamedTuple):
17
+ @dataclass(frozen=True)
18
+ class DivergenceOutput(OutputMetadata):
16
19
  """
17
20
  Attributes
18
21
  ----------
@@ -26,14 +29,44 @@ class DivergenceOutput(NamedTuple):
26
29
  errors: int
27
30
 
28
31
 
29
- def divergence_mst(data: np.ndarray, labels: np.ndarray) -> int:
32
+ def divergence_mst(data: NDArray, labels: NDArray) -> int:
33
+ """
34
+ Calculates the estimated label errors based on the minimum spanning tree
35
+
36
+ Parameters
37
+ ----------
38
+ data : NDArray, shape - (N, ... )
39
+ Input images to be grouped
40
+ labels : NDArray
41
+ Corresponding labels for each data point
42
+
43
+ Returns
44
+ -------
45
+ int
46
+ Number of label errors when creating the minimum spanning tree
47
+ """
30
48
  mst = minimum_spanning_tree(data).toarray()
31
49
  edgelist = np.transpose(np.nonzero(mst))
32
50
  errors = np.sum(labels[edgelist[:, 0]] != labels[edgelist[:, 1]])
33
51
  return errors
34
52
 
35
53
 
36
- def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
54
+ def divergence_fnn(data: NDArray, labels: NDArray) -> int:
55
+ """
56
+ Calculates the estimated label errors based on their nearest neighbors
57
+
58
+ Parameters
59
+ ----------
60
+ data : NDArray, shape - (N, ... )
61
+ Input images to be grouped
62
+ labels : NDArray
63
+ Corresponding labels for each data point
64
+
65
+ Returns
66
+ -------
67
+ int
68
+ Number of label errors when finding nearest neighbors
69
+ """
37
70
  nn_indices = compute_neighbors(data, data)
38
71
  errors = np.sum(np.abs(labels[nn_indices] - labels))
39
72
  return errors
@@ -42,6 +75,7 @@ def divergence_fnn(data: np.ndarray, labels: np.ndarray) -> int:
42
75
  DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
43
76
 
44
77
 
78
+ @set_metadata("dataeval.metrics")
45
79
  def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
46
80
  """
47
81
  Calculates the divergence and any errors between the datasets
@@ -50,10 +84,10 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
50
84
  ----------
51
85
  data_a : ArrayLike, shape - (N, P)
52
86
  A dataset in an ArrayLike format to compare.
53
- Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
87
+ Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
54
88
  data_b : ArrayLike, shape - (N, P)
55
89
  A dataset in an ArrayLike format to compare.
56
- Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
90
+ Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
57
91
  method : Literal["MST", "FNN"], default "FNN"
58
92
  Method used to estimate dataset divergence
59
93
 
@@ -1,12 +1,15 @@
1
- from typing import Dict, List, Literal, NamedTuple, Optional, Sequence
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Literal, Optional, Sequence
2
3
 
3
4
  import numpy as np
4
5
  from numpy.typing import NDArray
5
6
 
6
7
  from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
8
+ from dataeval._internal.output import OutputMetadata, set_metadata
7
9
 
8
10
 
9
- class DiversityOutput(NamedTuple):
11
+ @dataclass(frozen=True)
12
+ class DiversityOutput(OutputMetadata):
10
13
  """
11
14
  Attributes
12
15
  ----------
@@ -18,11 +21,11 @@ class DiversityOutput(NamedTuple):
18
21
 
19
22
 
20
23
  def diversity_shannon(
21
- data: np.ndarray,
24
+ data: NDArray,
22
25
  names: List[str],
23
26
  is_categorical: List[bool],
24
- subset_mask: Optional[np.ndarray] = None,
25
- ) -> np.ndarray:
27
+ subset_mask: Optional[NDArray[np.bool_]] = None,
28
+ ) -> NDArray:
26
29
  """
27
30
  Compute diversity for discrete/categorical variables and, through standard
28
31
  histogram binning, for continuous variables.
@@ -34,7 +37,7 @@ def diversity_shannon(
34
37
 
35
38
  Parameters
36
39
  ----------
37
- subset_mask: Optional[np.ndarray[bool]]
40
+ subset_mask: Optional[NDArray[np.bool_]]
38
41
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
39
42
 
40
43
  Notes
@@ -43,7 +46,7 @@ def diversity_shannon(
43
46
 
44
47
  Returns
45
48
  -------
46
- diversity_index: np.ndarray
49
+ diversity_index: NDArray
47
50
  Diversity index per column of X
48
51
 
49
52
  See Also
@@ -59,11 +62,11 @@ def diversity_shannon(
59
62
 
60
63
 
61
64
  def diversity_simpson(
62
- data: np.ndarray,
65
+ data: NDArray,
63
66
  names: List[str],
64
67
  is_categorical: List[bool],
65
- subset_mask: Optional[np.ndarray] = None,
66
- ) -> np.ndarray:
68
+ subset_mask: Optional[NDArray[np.bool_]] = None,
69
+ ) -> NDArray:
67
70
  """
68
71
  Compute diversity for discrete/categorical variables and, through standard
69
72
  histogram binning, for continuous variables.
@@ -76,7 +79,7 @@ def diversity_simpson(
76
79
 
77
80
  Parameters
78
81
  ----------
79
- subset_mask: Optional[np.ndarray[bool]]
82
+ subset_mask: Optional[NDArray[np.bool_]]
80
83
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
81
84
 
82
85
  Notes
@@ -90,7 +93,7 @@ def diversity_simpson(
90
93
 
91
94
  Returns
92
95
  -------
93
- np.ndarray
96
+ NDArray
94
97
  Diversity index per column of X
95
98
 
96
99
  See Also
@@ -116,6 +119,7 @@ def diversity_simpson(
116
119
  DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
117
120
 
118
121
 
122
+ @set_metadata("dataeval.metrics")
119
123
  def diversity(
120
124
  class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
121
125
  ) -> DiversityOutput:
@@ -155,6 +159,7 @@ def diversity(
155
159
  return DiversityOutput(diversity_index)
156
160
 
157
161
 
162
+ @set_metadata("dataeval.metrics")
158
163
  def diversity_classwise(
159
164
  class_labels: Sequence[int], metadata: List[Dict], method: Literal["shannon", "simpson"] = "simpson"
160
165
  ) -> DiversityOutput:
@@ -1,48 +1,39 @@
1
1
  import warnings
2
- from typing import Dict, Mapping, NamedTuple, Optional, Tuple
2
+ from dataclasses import dataclass
3
+ from typing import Dict, Generic, Mapping, Optional, Tuple, TypeVar
3
4
 
4
5
  import numpy as np
5
6
  from numpy.typing import ArrayLike, NDArray
6
7
  from scipy.stats import chi2_contingency, chisquare
7
8
 
8
9
  from dataeval._internal.interop import to_numpy
10
+ from dataeval._internal.output import OutputMetadata, set_metadata
9
11
 
10
-
11
- class ParityOutput(NamedTuple):
12
- """
13
- Attributes
14
- ----------
15
- score : np.float64
16
- chi-squared value of the test
17
- p_value : np.float64
18
- p-value of the test
19
- """
20
-
21
- score: np.float64
22
- p_value: np.float64
12
+ TData = TypeVar("TData", np.float64, NDArray[np.float64])
23
13
 
24
14
 
25
- class ParityMetadataOutput(NamedTuple):
15
+ @dataclass(frozen=True)
16
+ class ParityOutput(Generic[TData], OutputMetadata):
26
17
  """
27
18
  Attributes
28
19
  ----------
29
- scores : NDArray[np.float64]
30
- chi-squared values of the test
31
- p_values : NDArray[np.float64]
32
- p-values of the test
20
+ score : np.float64 | NDArray[np.float64]
21
+ chi-squared score(s) of the test
22
+ p_value : np.float64 | NDArray[np.float64]
23
+ p-value(s) of the test
33
24
  """
34
25
 
35
- score: NDArray[np.float64]
36
- p_value: NDArray[np.float64]
26
+ score: TData
27
+ p_value: TData
37
28
 
38
29
 
39
- def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
30
+ def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str) -> NDArray:
40
31
  """
41
32
  Digitizes a list of values into a given number of bins.
42
33
 
43
34
  Parameters
44
35
  ----------
45
- continuous_values: np.ndarray
36
+ continuous_values: NDArray
46
37
  The values to be digitized.
47
38
  bins: int
48
39
  The number of bins for the discrete values that continuous_values will be digitized into.
@@ -51,7 +42,7 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
51
42
 
52
43
  Returns
53
44
  -------
54
- np.ndarray
45
+ NDArray
55
46
  The digitized values
56
47
 
57
48
  """
@@ -69,14 +60,14 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
69
60
 
70
61
 
71
62
  def format_discretize_factors(
72
- data_factors: dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
73
- ) -> Tuple[dict, np.ndarray]:
63
+ data_factors: Dict[str, NDArray], continuous_factor_bincounts: Dict[str, int]
64
+ ) -> Tuple[Dict[str, NDArray], NDArray]:
74
65
  """
75
66
  Sets up the internal list of metadata factors.
76
67
 
77
68
  Parameters
78
69
  ----------
79
- data_factors: Dict[str, np.ndarray]
70
+ data_factors: Dict[str, NDArray]
80
71
  The dataset factors, which are per-image attributes including class label and metadata.
81
72
  Each key of dataset_factors is a factor, whose value is the per-image factor values.
82
73
  continuous_factor_bincounts : Dict[str, int]
@@ -87,11 +78,10 @@ def format_discretize_factors(
87
78
 
88
79
  Returns
89
80
  -------
90
- Dict[str, np.ndarray]
91
- Intrinsic per-image metadata information with the formatting that input data_factors uses.
92
- Each key is a metadata factor, whose value is the discrete per-image factor values.
93
- np.ndarray
94
- Per-image labels, whose ith element is the label for the ith element of the dataset.
81
+ Tuple[Dict[str, NDArray], NDArray]
82
+ - Intrinsic per-image metadata information with the formatting that input data_factors uses.
83
+ Each key is a metadata factor, whose value is the discrete per-image factor values.
84
+ - Per-image labels, whose ith element is the label for the ith element of the dataset.
95
85
  """
96
86
  invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
97
87
  if invalid_keys:
@@ -123,7 +113,7 @@ def format_discretize_factors(
123
113
  return metadata_factors, labels
124
114
 
125
115
 
126
- def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
116
+ def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
127
117
  exp_sum = np.sum(expected_dist)
128
118
  obs_sum = np.sum(observed_dist)
129
119
 
@@ -141,14 +131,14 @@ def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray
141
131
  return expected_dist
142
132
 
143
133
 
144
- def validate_dist(label_dist: np.ndarray, label_name: str):
134
+ def validate_dist(label_dist: NDArray, label_name: str):
145
135
  """
146
136
  Verifies that the given label distribution has labels and checks if
147
137
  any labels have frequencies less than 5.
148
138
 
149
139
  Parameters
150
140
  ----------
151
- label_dist : np.ndarray
141
+ label_dist : NDArray
152
142
  Array representing label distributions
153
143
 
154
144
  Raises
@@ -166,18 +156,14 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
166
156
  " dataset have frequencies less than 5. This may lead"
167
157
  " to invalid chi-squared evaluation."
168
158
  )
169
- warnings.warn(
170
- f"Labels {np.where(label_dist<5)[0]} in {label_name}"
171
- " dataset have frequencies less than 5. This may lead"
172
- " to invalid chi-squared evaluation."
173
- )
174
159
 
175
160
 
161
+ @set_metadata("dataeval.metrics")
176
162
  def parity(
177
163
  expected_labels: ArrayLike,
178
164
  observed_labels: ArrayLike,
179
165
  num_classes: Optional[int] = None,
180
- ) -> ParityOutput:
166
+ ) -> ParityOutput[np.float64]:
181
167
  """
182
168
  Perform a one-way chi-squared test between observation frequencies and expected frequencies that
183
169
  tests the null hypothesis that the observed data has the expected frequencies.
@@ -236,10 +222,11 @@ def parity(
236
222
  return ParityOutput(cs, p)
237
223
 
238
224
 
225
+ @set_metadata("dataeval.metrics")
239
226
  def parity_metadata(
240
227
  data_factors: Mapping[str, ArrayLike],
241
228
  continuous_factor_bincounts: Optional[Dict[str, int]] = None,
242
- ) -> ParityMetadataOutput:
229
+ ) -> ParityOutput[NDArray[np.float64]]:
243
230
  """
244
231
  Evaluates the statistical independence of metadata factors from class labels.
245
232
  This performs a chi-square test, which provides a score and a p-value for
@@ -306,4 +293,4 @@ def parity_metadata(
306
293
  chi_scores[i] = chi2
307
294
  p_values[i] = p
308
295
 
309
- return ParityMetadataOutput(chi_scores, p_values)
296
+ return ParityOutput(chi_scores, p_values)