dataeval 0.72.2__py3-none-any.whl → 0.73.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/linters/clusterer.py +1 -1
  6. dataeval/detectors/ood/__init__.py +1 -1
  7. dataeval/detectors/ood/ae.py +14 -6
  8. dataeval/detectors/ood/aegmm.py +14 -6
  9. dataeval/detectors/ood/base.py +9 -3
  10. dataeval/detectors/ood/llr.py +22 -16
  11. dataeval/detectors/ood/vae.py +14 -6
  12. dataeval/detectors/ood/vaegmm.py +14 -6
  13. dataeval/interop.py +9 -7
  14. dataeval/metrics/bias/balance.py +50 -44
  15. dataeval/metrics/bias/coverage.py +38 -6
  16. dataeval/metrics/bias/diversity.py +117 -65
  17. dataeval/metrics/bias/metadata.py +225 -60
  18. dataeval/metrics/bias/parity.py +68 -54
  19. dataeval/utils/__init__.py +4 -3
  20. dataeval/utils/lazy.py +26 -0
  21. dataeval/utils/metadata.py +258 -0
  22. dataeval/utils/shared.py +1 -1
  23. dataeval/utils/split_dataset.py +12 -6
  24. dataeval/utils/tensorflow/_internal/gmm.py +8 -2
  25. dataeval/utils/tensorflow/_internal/loss.py +20 -11
  26. dataeval/utils/tensorflow/_internal/{pixelcnn.py → models.py} +371 -77
  27. dataeval/utils/tensorflow/_internal/trainer.py +12 -5
  28. dataeval/utils/tensorflow/_internal/utils.py +70 -71
  29. dataeval/utils/torch/datasets.py +2 -2
  30. dataeval/workflows/__init__.py +1 -1
  31. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/METADATA +3 -3
  32. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/RECORD +34 -33
  33. dataeval/utils/tensorflow/_internal/autoencoder.py +0 -316
  34. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/LICENSE.txt +0 -0
  35. {dataeval-0.72.2.dist-info → dataeval-0.73.1.dist-info}/WHEEL +0 -0
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = ["CoverageOutput", "coverage"]
4
4
 
5
+ import contextlib
5
6
  import math
6
7
  from dataclasses import dataclass
7
8
  from typing import Literal
@@ -11,9 +12,13 @@ from numpy.typing import ArrayLike, NDArray
11
12
  from scipy.spatial.distance import pdist, squareform
12
13
 
13
14
  from dataeval.interop import to_numpy
15
+ from dataeval.metrics.bias.metadata import coverage_plot
14
16
  from dataeval.output import OutputMetadata, set_metadata
15
17
  from dataeval.utils.shared import flatten
16
18
 
19
+ with contextlib.suppress(ImportError):
20
+ from matplotlib.figure import Figure
21
+
17
22
 
18
23
  @dataclass(frozen=True)
19
24
  class CoverageOutput(OutputMetadata):
@@ -22,9 +27,9 @@ class CoverageOutput(OutputMetadata):
22
27
 
23
28
  Attributes
24
29
  ----------
25
- indices : NDArray
30
+ indices : NDArray[np.intp]
26
31
  Array of uncovered indices
27
- radii : NDArray
32
+ radii : NDArray[np.float64]
28
33
  Array of critical value radii
29
34
  critical_value : float
30
35
  Radius for :term:`coverage<Coverage>`
@@ -34,13 +39,40 @@ class CoverageOutput(OutputMetadata):
34
39
  radii: NDArray[np.float64]
35
40
  critical_value: float
36
41
 
42
+ def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
43
+ """
44
+ Plot the top k images together for visualization
45
+
46
+ Parameters
47
+ ----------
48
+ images : ArrayLike
49
+ Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
50
+ top_k : int, default 6
51
+ Number of images to plot (plotting assumes groups of 3)
52
+
53
+ Returns
54
+ -------
55
+ matplotlib.figure.Figure
56
+ """
57
+ # Determine which images to plot
58
+ highest_uncovered_indices = self.indices[:top_k]
59
+
60
+ # Grab the images
61
+ images = to_numpy(images)
62
+ selected_images = images[highest_uncovered_indices]
63
+
64
+ # Plot the images
65
+ fig = coverage_plot(selected_images, top_k)
66
+
67
+ return fig
68
+
37
69
 
38
70
  @set_metadata()
39
71
  def coverage(
40
72
  embeddings: ArrayLike,
41
73
  radius_type: Literal["adaptive", "naive"] = "adaptive",
42
74
  k: int = 20,
43
- percent: np.float64 = np.float64(0.01),
75
+ percent: float = 0.01,
44
76
  ) -> CoverageOutput:
45
77
  """
46
78
  Function for evaluating :term:`coverage<Coverage>` and identifying images/samples that are in undercovered regions.
@@ -50,12 +82,12 @@ def coverage(
50
82
  embeddings : ArrayLike, shape - (N, P)
51
83
  A dataset in an ArrayLike format.
52
84
  Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
53
- radius_type : Literal["adaptive", "naive"], default "adaptive"
85
+ radius_type : {"adaptive", "naive"}, default "adaptive"
54
86
  The function used to determine radius.
55
- k: int, default 20
87
+ k : int, default 20
56
88
  Number of observations required in order to be covered.
57
89
  [1] suggests that a minimum of 20-50 samples is necessary.
58
- percent: np.float64, default np.float(0.01)
90
+ percent : float, default 0.01
59
91
  Percent of observations to be considered uncovered. Only applies to adaptive radius.
60
92
 
61
93
  Returns
@@ -2,16 +2,28 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = ["DiversityOutput", "diversity"]
4
4
 
5
+ import contextlib
5
6
  from dataclasses import dataclass
6
7
  from typing import Any, Literal, Mapping
7
8
 
8
9
  import numpy as np
9
10
  from numpy.typing import ArrayLike, NDArray
10
11
 
11
- from dataeval.metrics.bias.metadata import entropy, get_counts, get_num_bins, heatmap, preprocess_metadata
12
+ from dataeval.metrics.bias.metadata import (
13
+ CLASS_LABEL,
14
+ diversity_bar_plot,
15
+ entropy,
16
+ get_counts,
17
+ get_num_bins,
18
+ heatmap,
19
+ preprocess_metadata,
20
+ )
12
21
  from dataeval.output import OutputMetadata, set_metadata
13
22
  from dataeval.utils.shared import get_method
14
23
 
24
+ with contextlib.suppress(ImportError):
25
+ from matplotlib.figure import Figure
26
+
15
27
 
16
28
  @dataclass(frozen=True)
17
29
  class DiversityOutput(OutputMetadata):
@@ -24,50 +36,63 @@ class DiversityOutput(OutputMetadata):
24
36
  :term:`Diversity` index for classes and factors
25
37
  classwise : NDArray[np.float64]
26
38
  Classwise diversity index [n_class x n_factor]
27
- class_list: NDArray[np.int64]
39
+ class_list : NDArray[np.int64]
28
40
  Class labels for each value in the dataset
29
- metadata_names: list[str]
41
+ metadata_names : list[str]
30
42
  Names of each metadata factor
31
43
  """
32
44
 
33
45
  diversity_index: NDArray[np.float64]
34
46
  classwise: NDArray[np.float64]
35
-
36
- class_list: NDArray[np.int64]
47
+ class_list: NDArray[Any]
37
48
  metadata_names: list[str]
38
49
 
39
- method: Literal["shannon", "simpson"]
40
-
41
- def plot(self, row_labels: NDArray[Any] | None = None, col_labels: NDArray[Any] | None = None) -> None:
50
+ def plot(
51
+ self,
52
+ row_labels: ArrayLike | list[Any] | None = None,
53
+ col_labels: ArrayLike | list[Any] | None = None,
54
+ plot_classwise: bool = False,
55
+ ) -> Figure:
42
56
  """
43
57
  Plot a heatmap of diversity information
44
58
 
45
59
  Parameters
46
60
  ----------
47
- row_labels: NDArray | None, default None
48
- Array containing the labels for rows in the histogram
49
- col_labels: NDArray | None, default None
50
- Array containing the labels for columns in the histogram
61
+ row_labels : ArrayLike or None, default None
62
+ List/Array containing the labels for rows in the heatmap
63
+ col_labels : ArrayLike or None, default None
64
+ List/Array containing the labels for columns in the heatmap
65
+ plot_classwise : bool, default False
66
+ Whether to plot per-class diversity (heatmap) instead of global diversity (bar plot)
51
67
  """
52
- if row_labels is None:
53
- row_labels = np.unique(self.class_list)
54
- if col_labels is None:
55
- col_labels = np.array(self.metadata_names)
68
+ if plot_classwise:
69
+ if row_labels is None:
70
+ row_labels = self.class_list
71
+ if col_labels is None:
72
+ col_labels = self.metadata_names
73
+
74
+ fig = heatmap(
75
+ self.classwise,
76
+ row_labels,
77
+ col_labels,
78
+ xlabel="Factors",
79
+ ylabel="Class",
80
+ cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
81
+ )
56
82
 
57
- heatmap(
58
- self.classwise,
59
- row_labels,
60
- col_labels,
61
- xlabel="Factors",
62
- ylabel="Class",
63
- cbarlabel=f"Normalized {self.method.title()} Index",
64
- )
83
+ else:
84
+ # Creating label array for the bar plot axis
85
+ heat_labels = np.concatenate((["class"], self.metadata_names))
86
+
87
+ fig = diversity_bar_plot(heat_labels, self.diversity_index)
88
+
89
+ return fig
65
90
 
66
91
 
67
92
  def diversity_shannon(
68
93
  data: NDArray[Any],
69
94
  names: list[str],
70
- is_categorical: list[bool],
95
+ continuous_factor_bincounts: Mapping[str, int] | None = None,
71
96
  subset_mask: NDArray[np.bool_] | None = None,
72
97
  ) -> NDArray[np.float64]:
73
98
  """
@@ -81,14 +106,16 @@ def diversity_shannon(
81
106
 
82
107
  Parameters
83
108
  ----------
84
- data: NDArray
109
+ data : NDArray
85
110
  Array containing numerical values for metadata factors
86
- names: list[str]
111
+ names : list[str]
87
112
  Names of metadata factors -- keys of the metadata dictionary
88
- is_categorical: list[bool]
89
- List of flags to identify whether variables are categorical (True) or
90
- continuous (False)
91
- subset_mask: NDArray[np.bool_] | None
113
+ continuous_factor_bincounts : Mapping[str, int] or None, default None
114
+ The factors in names that have continuous values, mapped to the number of bins to
115
+ discretize values into. All factors are treated as having discrete values unless they
116
+ are specified as keys in this dictionary. Each key of this dictionary must also appear
117
+ in names.
118
+ subset_mask : NDArray[np.bool_] or None, default None
92
119
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
93
120
 
94
121
  Note
@@ -97,18 +124,32 @@ def diversity_shannon(
97
124
 
98
125
  Returns
99
126
  -------
100
- diversity_index: NDArray
127
+ diversity_index : NDArray[np.float64]
101
128
  Diversity index per column of X
102
129
 
103
130
  See Also
104
131
  --------
105
132
  numpy.histogram
106
133
  """
134
+ hist_cache = {}
107
135
 
108
136
  # entropy computed using global auto bins so that we can properly normalize
109
- ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
137
+ ent_unnormalized = entropy(
138
+ data,
139
+ names,
140
+ continuous_factor_bincounts,
141
+ normalized=False,
142
+ subset_mask=subset_mask,
143
+ hist_cache=hist_cache,
144
+ )
110
145
  # normalize by global counts rather than classwise counts
111
- num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
146
+ num_bins = get_num_bins(
147
+ data,
148
+ names,
149
+ continuous_factor_bincounts=continuous_factor_bincounts,
150
+ subset_mask=subset_mask,
151
+ hist_cache=hist_cache,
152
+ )
112
153
  ent_norm = np.empty(ent_unnormalized.shape)
113
154
  ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
114
155
  ent_norm[num_bins == 1] = 0
@@ -118,7 +159,7 @@ def diversity_shannon(
118
159
  def diversity_simpson(
119
160
  data: NDArray[Any],
120
161
  names: list[str],
121
- is_categorical: list[bool],
162
+ continuous_factor_bincounts: Mapping[str, int] | None = None,
122
163
  subset_mask: NDArray[np.bool_] | None = None,
123
164
  ) -> NDArray[np.float64]:
124
165
  """
@@ -132,14 +173,16 @@ def diversity_simpson(
132
173
 
133
174
  Parameters
134
175
  ----------
135
- data: NDArray
176
+ data : NDArray
136
177
  Array containing numerical values for metadata factors
137
- names: list[str]
178
+ names : list[str]
138
179
  Names of metadata factors -- keys of the metadata dictionary
139
- is_categorical: list[bool]
140
- List of flags to identify whether variables are categorical (True) or
141
- continuous (False)
142
- subset_mask: NDArray[np.bool_] | None
180
+ continuous_factor_bincounts : Mapping[str, int] or None, default None
181
+ The factors in names that have continuous values, mapped to the number of bins to
182
+ discretize values into. All factors are treated as having discrete values unless they
183
+ are specified as keys in this dictionary. Each key of this dictionary must also appear
184
+ in names.
185
+ subset_mask : NDArray[np.bool_] or None, default None
143
186
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
144
187
 
145
188
  Note
@@ -150,35 +193,39 @@ def diversity_simpson(
150
193
 
151
194
  Returns
152
195
  -------
153
- NDArray
196
+ diversity_index : NDArray[np.float64]
154
197
  Diversity index per column of X
155
198
 
156
199
  See Also
157
200
  --------
158
201
  numpy.histogram
159
202
  """
203
+ hist_cache = {}
160
204
 
161
- hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
205
+ hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
162
206
  # normalize by global counts, not classwise counts
163
- num_bins = get_num_bins(data, names, is_categorical)
207
+ num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
164
208
 
165
209
  ev_index = np.empty(len(names))
166
210
  # loop over columns for convenience
167
211
  for col, cnts in enumerate(hist_counts.values()):
168
212
  # relative frequencies
169
- p_i = cnts / cnts.sum()
213
+ p_i = cnts / np.sum(cnts)
170
214
  # inverse Simpson index normalized by (number of bins)
171
- s_0 = 1 / np.sum(p_i**2) / num_bins[col]
215
+ s_0 = 1 / np.sum(p_i**2) # / num_bins[col]
172
216
  if num_bins[col] == 1:
173
217
  ev_index[col] = 0
174
218
  else:
175
- ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
219
+ ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
176
220
  return ev_index
177
221
 
178
222
 
179
223
  @set_metadata()
180
224
  def diversity(
181
- class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
225
+ class_labels: ArrayLike,
226
+ metadata: Mapping[str, ArrayLike],
227
+ continuous_factor_bincounts: Mapping[str, int] | None = None,
228
+ method: Literal["simpson", "shannon"] = "simpson",
182
229
  ) -> DiversityOutput:
183
230
  """
184
231
  Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -191,11 +238,16 @@ def diversity(
191
238
 
192
239
  Parameters
193
240
  ----------
194
- class_labels: ArrayLike
241
+ class_labels : ArrayLike
195
242
  List of class labels for each image
196
- metadata: Mapping[str, ArrayLike]
243
+ metadata : Mapping[str, ArrayLike]
197
244
  Dict of list of metadata factors for each image
198
- method: Literal["shannon", "simpson"], default "simpson"
245
+ continuous_factor_bincounts : Mapping[str, int] or None, default None
246
+ The factors in metadata that have continuous values, mapped to the number of bins to
247
+ discretize values into. All factors are treated as having discrete values unless they
248
+ are specified as keys in this dictionary. Each key of this dictionary must also appear
249
+ in metadata.
250
+ method : {"simpson", "shannon"}, default "simpson"
199
251
  Indicates which diversity index should be computed
200
252
 
201
253
  Note
@@ -214,34 +266,34 @@ def diversity(
214
266
  -------
215
267
  Compute Simpson diversity index of metadata and class labels
216
268
 
217
- >>> div_simp = diversity(class_labels, metadata, method="simpson")
269
+ >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
218
270
  >>> div_simp.diversity_index
219
- array([0.18103448, 0.18103448, 0.88636364])
271
+ array([0.72413793, 0.72413793, 0.88636364])
220
272
 
221
273
  >>> div_simp.classwise
222
- array([[0.17241379, 0.39473684],
223
- [0.2 , 0.2 ]])
274
+ array([[0.68965517, 0.69230769],
275
+ [0.8 , 1. ]])
224
276
 
225
277
  Compute Shannon diversity index of metadata and class labels
226
278
 
227
- >>> div_shan = diversity(class_labels, metadata, method="shannon")
279
+ >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
228
280
  >>> div_shan.diversity_index
229
- array([0.37955133, 0.37955133, 0.96748876])
281
+ array([0.8812909 , 0.8812909 , 0.96748876])
230
282
 
231
283
  >>> div_shan.classwise
232
- array([[0.43156028, 0.83224889],
233
- [0.57938016, 0.57938016]])
284
+ array([[0.86312057, 0.91651644],
285
+ [0.91829583, 1. ]])
234
286
 
235
287
  See Also
236
288
  --------
237
289
  numpy.histogram
238
290
  """
239
291
  diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
240
- data, names, is_categorical = preprocess_metadata(class_labels, metadata)
241
- diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
292
+ data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
293
+ diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
242
294
 
243
- class_idx = names.index("class_label")
244
- class_lbl = np.array(data[:, class_idx], dtype=int)
295
+ class_idx = names.index(CLASS_LABEL)
296
+ class_lbl = data[:, class_idx]
245
297
 
246
298
  u_classes = np.unique(class_lbl)
247
299
  num_factors = len(names)
@@ -249,7 +301,7 @@ def diversity(
249
301
  diversity[:] = np.nan
250
302
  for idx, cls in enumerate(u_classes):
251
303
  subset_mask = class_lbl == cls
252
- diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
304
+ diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
253
305
  div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
254
306
 
255
- return DiversityOutput(diversity_index, div_no_class, class_lbl, list(metadata.keys()), method)
307
+ return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))