dataeval-0.64.0-py3-none-any.whl → dataeval-0.66.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (65)
  1. dataeval/__init__.py +13 -9
  2. dataeval/_internal/detectors/clusterer.py +63 -49
  3. dataeval/_internal/detectors/drift/base.py +248 -51
  4. dataeval/_internal/detectors/drift/cvm.py +28 -26
  5. dataeval/_internal/detectors/drift/ks.py +31 -28
  6. dataeval/_internal/detectors/drift/mmd.py +62 -42
  7. dataeval/_internal/detectors/drift/torch.py +69 -60
  8. dataeval/_internal/detectors/drift/uncertainty.py +32 -32
  9. dataeval/_internal/detectors/duplicates.py +67 -31
  10. dataeval/_internal/detectors/ood/ae.py +15 -29
  11. dataeval/_internal/detectors/ood/aegmm.py +33 -27
  12. dataeval/_internal/detectors/ood/base.py +86 -47
  13. dataeval/_internal/detectors/ood/llr.py +34 -31
  14. dataeval/_internal/detectors/ood/vae.py +32 -31
  15. dataeval/_internal/detectors/ood/vaegmm.py +34 -28
  16. dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
  17. dataeval/_internal/flags.py +44 -21
  18. dataeval/_internal/interop.py +5 -3
  19. dataeval/_internal/metrics/balance.py +42 -5
  20. dataeval/_internal/metrics/ber.py +11 -8
  21. dataeval/_internal/metrics/coverage.py +15 -8
  22. dataeval/_internal/metrics/divergence.py +41 -7
  23. dataeval/_internal/metrics/diversity.py +57 -19
  24. dataeval/_internal/metrics/parity.py +141 -66
  25. dataeval/_internal/metrics/stats.py +330 -313
  26. dataeval/_internal/metrics/uap.py +33 -4
  27. dataeval/_internal/metrics/utils.py +79 -40
  28. dataeval/_internal/models/pytorch/autoencoder.py +127 -22
  29. dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
  30. dataeval/_internal/models/tensorflow/gmm.py +4 -2
  31. dataeval/_internal/models/tensorflow/losses.py +17 -13
  32. dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
  33. dataeval/_internal/models/tensorflow/trainer.py +10 -7
  34. dataeval/_internal/models/tensorflow/utils.py +23 -20
  35. dataeval/_internal/output.py +85 -0
  36. dataeval/_internal/utils.py +5 -3
  37. dataeval/_internal/workflows/sufficiency.py +122 -121
  38. dataeval/detectors/__init__.py +6 -25
  39. dataeval/detectors/drift/__init__.py +16 -0
  40. dataeval/detectors/drift/kernels/__init__.py +6 -0
  41. dataeval/detectors/drift/updates/__init__.py +3 -0
  42. dataeval/detectors/linters/__init__.py +5 -0
  43. dataeval/detectors/ood/__init__.py +11 -0
  44. dataeval/flags/__init__.py +2 -2
  45. dataeval/metrics/__init__.py +2 -26
  46. dataeval/metrics/bias/__init__.py +14 -0
  47. dataeval/metrics/estimators/__init__.py +9 -0
  48. dataeval/metrics/stats/__init__.py +6 -0
  49. dataeval/tensorflow/__init__.py +3 -0
  50. dataeval/tensorflow/loss/__init__.py +3 -0
  51. dataeval/tensorflow/models/__init__.py +5 -0
  52. dataeval/tensorflow/recon/__init__.py +3 -0
  53. dataeval/torch/__init__.py +3 -0
  54. dataeval/{models/torch → torch/models}/__init__.py +1 -2
  55. dataeval/torch/trainer/__init__.py +3 -0
  56. dataeval/utils/__init__.py +3 -6
  57. dataeval/workflows/__init__.py +2 -4
  58. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
  59. dataeval-0.66.0.dist-info/RECORD +72 -0
  60. dataeval/_internal/metrics/base.py +0 -10
  61. dataeval/models/__init__.py +0 -15
  62. dataeval/models/tensorflow/__init__.py +0 -6
  63. dataeval-0.64.0.dist-info/RECORD +0 -60
  64. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
  65. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
@@ -1,48 +1,41 @@
1
+ from __future__ import annotations
2
+
1
3
  import warnings
2
- from typing import Dict, Mapping, NamedTuple, Optional, Tuple
4
+ from dataclasses import dataclass
5
+ from typing import Generic, Mapping, TypeVar
3
6
 
4
7
  import numpy as np
5
8
  from numpy.typing import ArrayLike, NDArray
6
9
  from scipy.stats import chi2_contingency, chisquare
7
10
 
8
11
  from dataeval._internal.interop import to_numpy
12
+ from dataeval._internal.output import OutputMetadata, set_metadata
9
13
 
10
-
11
- class ParityOutput(NamedTuple):
12
- """
13
- Attributes
14
- ----------
15
- score : np.float64
16
- chi-squared value of the test
17
- p_value : np.float64
18
- p-value of the test
19
- """
20
-
21
- score: np.float64
22
- p_value: np.float64
14
+ TData = TypeVar("TData", np.float64, NDArray[np.float64])
23
15
 
24
16
 
25
- class ParityMetadataOutput(NamedTuple):
17
+ @dataclass(frozen=True)
18
+ class ParityOutput(Generic[TData], OutputMetadata):
26
19
  """
27
20
  Attributes
28
21
  ----------
29
- scores : NDArray[np.float64]
30
- chi-squared values of the test
31
- p_values : NDArray[np.float64]
32
- p-values of the test
22
+ score : np.float64 | NDArray[np.float64]
23
+ chi-squared score(s) of the test
24
+ p_value : np.float64 | NDArray[np.float64]
25
+ p-value(s) of the test
33
26
  """
34
27
 
35
- score: NDArray[np.float64]
36
- p_value: NDArray[np.float64]
28
+ score: TData
29
+ p_value: TData
37
30
 
38
31
 
39
- def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name: str):
32
+ def digitize_factor_bins(continuous_values: NDArray, bins: int, factor_name: str) -> NDArray:
40
33
  """
41
34
  Digitizes a list of values into a given number of bins.
42
35
 
43
36
  Parameters
44
37
  ----------
45
- continuous_values: np.ndarray
38
+ continuous_values: NDArray
46
39
  The values to be digitized.
47
40
  bins: int
48
41
  The number of bins for the discrete values that continuous_values will be digitized into.
@@ -51,10 +44,10 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
51
44
 
52
45
  Returns
53
46
  -------
54
- np.ndarray
47
+ NDArray
55
48
  The digitized values
56
-
57
49
  """
50
+
58
51
  if not np.all([np.issubdtype(type(n), np.number) for n in continuous_values]):
59
52
  raise TypeError(
60
53
  f"Encountered a non-numeric value for factor {factor_name}, but the factor"
@@ -69,14 +62,14 @@ def digitize_factor_bins(continuous_values: np.ndarray, bins: int, factor_name:
69
62
 
70
63
 
71
64
  def format_discretize_factors(
72
- data_factors: dict[str, np.ndarray], continuous_factor_bincounts: Dict[str, int]
73
- ) -> Tuple[dict, np.ndarray]:
65
+ data_factors: dict[str, NDArray], continuous_factor_bincounts: dict[str, int]
66
+ ) -> tuple[dict[str, NDArray], NDArray]:
74
67
  """
75
68
  Sets up the internal list of metadata factors.
76
69
 
77
70
  Parameters
78
71
  ----------
79
- data_factors: Dict[str, np.ndarray]
72
+ data_factors: Dict[str, NDArray]
80
73
  The dataset factors, which are per-image attributes including class label and metadata.
81
74
  Each key of dataset_factors is a factor, whose value is the per-image factor values.
82
75
  continuous_factor_bincounts : Dict[str, int]
@@ -87,12 +80,12 @@ def format_discretize_factors(
87
80
 
88
81
  Returns
89
82
  -------
90
- Dict[str, np.ndarray]
91
- Intrinsic per-image metadata information with the formatting that input data_factors uses.
92
- Each key is a metadata factor, whose value is the discrete per-image factor values.
93
- np.ndarray
94
- Per-image labels, whose ith element is the label for the ith element of the dataset.
83
+ Tuple[Dict[str, NDArray], NDArray]
84
+ - Intrinsic per-image metadata information with the formatting that input data_factors uses.
85
+ Each key is a metadata factor, whose value is the discrete per-image factor values.
86
+ - Per-image labels, whose ith element is the label for the ith element of the dataset.
95
87
  """
88
+
96
89
  invalid_keys = set(continuous_factor_bincounts.keys()) - set(data_factors.keys())
97
90
  if invalid_keys:
98
91
  raise KeyError(
@@ -123,7 +116,36 @@ def format_discretize_factors(
123
116
  return metadata_factors, labels
124
117
 
125
118
 
126
- def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray) -> np.ndarray:
119
+ def normalize_expected_dist(expected_dist: NDArray, observed_dist: NDArray) -> NDArray:
120
+ """
121
+ Normalize the expected label distribution to match the total number of labels in the observed distribution.
122
+
123
+ This function adjusts the expected distribution so that its sum equals the sum of the observed distribution.
124
+ If the expected distribution is all zeros, an error is raised.
125
+
126
+ Parameters
127
+ ----------
128
+ expected_dist : np.ndarray
129
+ The expected label distribution. This array represents the anticipated distribution of labels.
130
+ observed_dist : np.ndarray
131
+ The observed label distribution. This array represents the actual distribution of labels in the dataset.
132
+
133
+ Returns
134
+ -------
135
+ np.ndarray
136
+ The normalized expected distribution, scaled to have the same sum as the observed distribution.
137
+
138
+ Raises
139
+ ------
140
+ ValueError
141
+ If the expected distribution is all zeros.
142
+
143
+ Notes
144
+ -----
145
+ The function ensures that the total number of labels in the expected distribution matches the total
146
+ number of labels in the observed distribution by scaling the expected distribution.
147
+ """
148
+
127
149
  exp_sum = np.sum(expected_dist)
128
150
  obs_sum = np.sum(observed_dist)
129
151
 
@@ -141,14 +163,14 @@ def normalize_expected_dist(expected_dist: np.ndarray, observed_dist: np.ndarray
141
163
  return expected_dist
142
164
 
143
165
 
144
- def validate_dist(label_dist: np.ndarray, label_name: str):
166
+ def validate_dist(label_dist: NDArray, label_name: str):
145
167
  """
146
168
  Verifies that the given label distribution has labels and checks if
147
169
  any labels have frequencies less than 5.
148
170
 
149
171
  Parameters
150
172
  ----------
151
- label_dist : np.ndarray
173
+ label_dist : NDArray
152
174
  Array representing label distributions
153
175
 
154
176
  Raises
@@ -158,6 +180,7 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
158
180
  Warning
159
181
  If any elements of label_dist are less than 5
160
182
  """
183
+
161
184
  if not len(label_dist):
162
185
  raise ValueError(f"No labels found in the {label_name} dataset")
163
186
  if np.any(label_dist < 5):
@@ -166,24 +189,20 @@ def validate_dist(label_dist: np.ndarray, label_name: str):
166
189
  " dataset have frequencies less than 5. This may lead"
167
190
  " to invalid chi-squared evaluation."
168
191
  )
169
- warnings.warn(
170
- f"Labels {np.where(label_dist<5)[0]} in {label_name}"
171
- " dataset have frequencies less than 5. This may lead"
172
- " to invalid chi-squared evaluation."
173
- )
174
192
 
175
193
 
176
- def parity(
194
+ @set_metadata("dataeval.metrics")
195
+ def label_parity(
177
196
  expected_labels: ArrayLike,
178
197
  observed_labels: ArrayLike,
179
- num_classes: Optional[int] = None,
180
- ) -> ParityOutput:
198
+ num_classes: int | None = None,
199
+ ) -> ParityOutput[np.float64]:
181
200
  """
182
- Perform a one-way chi-squared test between observation frequencies and expected frequencies that
183
- tests the null hypothesis that the observed data has the expected frequencies.
201
+ Calculate the chi-square statistic to assess the parity between expected and observed label distributions.
184
202
 
185
- This function acts as an interface to the scipy.stats.chisquare method, which is documented at
186
- https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
203
+ This function computes the frequency distribution of classes in both expected and observed labels, normalizes
204
+ the expected distribution to match the total number of observed labels, and then calculates the chi-square
205
+ statistic to determine if there is a significant difference between the two distributions.
187
206
 
188
207
  Parameters
189
208
  ----------
@@ -191,9 +210,9 @@ def parity(
191
210
  List of class labels in the expected dataset
192
211
  observed_labels : ArrayLike
193
212
  List of class labels in the observed dataset
194
- num_classes : Optional[int]
195
- The number of unique classes in the datasets. If this is not specified, it will
196
- be inferred from the set of unique labels in expected_labels and observed_labels
213
+ num_classes : int | None, default None
214
+ The number of unique classes in the datasets. If not provided, the function will infer it
215
+ from the set of unique labels in expected_labels and observed_labels
197
216
 
198
217
  Returns
199
218
  -------
@@ -203,8 +222,31 @@ def parity(
203
222
  Raises
204
223
  ------
205
224
  ValueError
206
- If x is empty
225
+ If expected label distribution is empty, is all zeros, or if there is a mismatch in the number
226
+ of unique classes between the observed and expected distributions.
227
+
228
+
229
+ Notes
230
+ -----
231
+ - Providing ``num_classes`` can be helpful if there are classes with zero instances in one of the distributions.
232
+ - The function first validates the observed distribution and normalizes the expected distribution so that it
233
+ has the same total number of labels as the observed distribution.
234
+ - It then performs a chi-square test to determine if there is a statistically significant difference between
235
+ the observed and expected label distributions.
236
+ - This function acts as an interface to the scipy.stats.chisquare method, which is documented at
237
+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chisquare.html
238
+
239
+
240
+ Examples
241
+ --------
242
+ Randomly creating some label distributions using ``np.random.default_rng``
243
+
244
+ >>> expected_labels = np_random_gen.choice([0, 1, 2, 3, 4], (100))
245
+ >>> observed_labels = np_random_gen.choice([2, 3, 0, 4, 1], (100))
246
+ >>> label_parity(expected_labels, observed_labels)
247
+ ParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
207
248
  """
249
+
208
250
  # Calculate
209
251
  if not num_classes:
210
252
  num_classes = 0
@@ -236,27 +278,28 @@ def parity(
236
278
  return ParityOutput(cs, p)
237
279
 
238
280
 
239
- def parity_metadata(
281
+ @set_metadata("dataeval.metrics")
282
+ def parity(
240
283
  data_factors: Mapping[str, ArrayLike],
241
- continuous_factor_bincounts: Optional[Dict[str, int]] = None,
242
- ) -> ParityMetadataOutput:
284
+ continuous_factor_bincounts: dict[str, int] | None = None,
285
+ ) -> ParityOutput[NDArray[np.float64]]:
243
286
  """
244
- Evaluates the statistical independence of metadata factors from class labels.
245
- This performs a chi-square test, which provides a score and a p-value for
246
- statistical independence between each pair of a metadata factor and a class label.
247
- A high score with a low p-value suggests that a metadata factor is strongly
248
- correlated with a class label.
287
+ Calculate chi-square statistics to assess the relationship between multiple factors and class labels.
288
+
289
+ This function computes the chi-square statistic for each metadata factor to determine if there is
290
+ a significant relationship between the factor values and class labels. The function handles both categorical
291
+ and discretized continuous factors.
249
292
 
250
293
  Parameters
251
294
  ----------
252
295
  data_factors: Mapping[str, ArrayLike]
253
296
  The dataset factors, which are per-image attributes including class label and metadata.
254
297
  Each key of dataset_factors is a factor, whose value is the per-image factor values.
255
- continuous_factor_bincounts : Optional[Dict[str, int]], default None
256
- The factors in data_factors that have continuous values and the array of bin counts to
257
- discretize values into. All factors are treated as having discrete values unless they
258
- are specified as keys in this dictionary. Each element of this array must occur as a key
259
- in data_factors.
298
+ continuous_factor_bincounts : Dict[str, int] | None, default None
299
+ A dictionary specifying the number of bins for discretizing the continuous factors.
300
+ The keys should correspond to the names of continuous factors in `data_factors`,
301
+ and the values should be the number of bins to use for discretization.
302
+ If not provided, no discretization is applied.
260
303
 
261
304
  Returns
262
305
  -------
@@ -264,7 +307,39 @@ def parity_metadata(
264
307
  Arrays of length (num_factors) whose (i)th element corresponds to the
265
308
  chi-square score and p-value for the relationship between factor i and
266
309
  the class labels in the dataset.
310
+
311
+ Raises
312
+ ------
313
+ Warning
314
+ If any cell in the contingency matrix has a value between 0 and 5, a warning is issued because this can
315
+ lead to inaccurate chi-square calculations. It is recommended to ensure that each label co-occurs with
316
+ factor values either 0 times or at least 5 times. Alternatively, continuous-valued factors can be digitized
317
+ into fewer bins.
318
+
319
+ Notes
320
+ -----
321
+ - Each key of the ``continuous_factor_bincounts`` dictionary must occur as a key in data_factors.
322
+ - A high score with a low p-value suggests that a metadata factor is strongly correlated with a class label.
323
+ - The function creates a contingency matrix for each factor, where each entry represents the frequency of a
324
+ specific factor value co-occurring with a particular class label.
325
+ - Rows containing only zeros in the contingency matrix are removed before performing the chi-square test
326
+ to prevent errors in the calculation.
327
+
328
+ Examples
329
+ --------
330
+ Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
331
+
332
+ >>> data_factors = {
333
+ ... "age": np_random_gen.choice([25, 30, 35, 45], (100)),
334
+ ... "income": np_random_gen.choice([50000, 65000, 80000], (100)),
335
+ ... "gender": np_random_gen.choice(["M", "F"], (100)),
336
+ ... "class": np_random_gen.choice([0, 1, 2], (100)),
337
+ ... }
338
+ >>> continuous_factor_bincounts = {"age": 4, "income": 3}
339
+ >>> parity(data_factors, continuous_factor_bincounts)
340
+ ParityOutput(score=array([2.82329785, 1.60625584, 1.38377236]), p_value=array([0.83067563, 0.80766733, 0.5006309 ]))
267
341
  """
342
+
268
343
  data_factors_np = {k: to_numpy(v) for k, v in data_factors.items()}
269
344
  continuous_factor_bincounts = continuous_factor_bincounts if continuous_factor_bincounts else {}
270
345
 
@@ -306,4 +381,4 @@ def parity_metadata(
306
381
  chi_scores[i] = chi2
307
382
  p_values[i] = p
308
383
 
309
- return ParityMetadataOutput(chi_scores, p_values)
384
+ return ParityOutput(chi_scores, p_values)