dataeval 0.73.1__py3-none-any.whl → 0.74.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. dataeval/__init__.py +3 -9
  2. dataeval/detectors/__init__.py +2 -10
  3. dataeval/detectors/drift/base.py +3 -3
  4. dataeval/detectors/drift/mmd.py +1 -1
  5. dataeval/detectors/drift/torch.py +1 -101
  6. dataeval/detectors/linters/clusterer.py +3 -3
  7. dataeval/detectors/linters/duplicates.py +4 -4
  8. dataeval/detectors/linters/outliers.py +4 -4
  9. dataeval/detectors/ood/__init__.py +9 -9
  10. dataeval/detectors/ood/{ae.py → ae_torch.py} +22 -27
  11. dataeval/detectors/ood/base.py +63 -113
  12. dataeval/detectors/ood/base_torch.py +109 -0
  13. dataeval/detectors/ood/metadata_ks_compare.py +52 -14
  14. dataeval/interop.py +1 -1
  15. dataeval/metrics/bias/__init__.py +3 -0
  16. dataeval/metrics/bias/balance.py +73 -70
  17. dataeval/metrics/bias/coverage.py +4 -4
  18. dataeval/metrics/bias/diversity.py +67 -136
  19. dataeval/metrics/bias/metadata_preprocessing.py +285 -0
  20. dataeval/metrics/bias/metadata_utils.py +229 -0
  21. dataeval/metrics/bias/parity.py +51 -161
  22. dataeval/metrics/estimators/ber.py +3 -3
  23. dataeval/metrics/estimators/divergence.py +3 -3
  24. dataeval/metrics/estimators/uap.py +3 -3
  25. dataeval/metrics/stats/base.py +2 -2
  26. dataeval/metrics/stats/boxratiostats.py +1 -1
  27. dataeval/metrics/stats/datasetstats.py +6 -6
  28. dataeval/metrics/stats/dimensionstats.py +1 -1
  29. dataeval/metrics/stats/hashstats.py +1 -1
  30. dataeval/metrics/stats/labelstats.py +3 -3
  31. dataeval/metrics/stats/pixelstats.py +1 -1
  32. dataeval/metrics/stats/visualstats.py +1 -1
  33. dataeval/output.py +77 -53
  34. dataeval/utils/__init__.py +1 -7
  35. dataeval/utils/gmm.py +26 -0
  36. dataeval/utils/metadata.py +29 -9
  37. dataeval/utils/torch/gmm.py +98 -0
  38. dataeval/utils/torch/models.py +192 -0
  39. dataeval/utils/torch/trainer.py +84 -5
  40. dataeval/utils/torch/utils.py +107 -1
  41. dataeval/workflows/sufficiency.py +4 -4
  42. {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/METADATA +3 -9
  43. dataeval-0.74.1.dist-info/RECORD +65 -0
  44. dataeval/detectors/ood/aegmm.py +0 -66
  45. dataeval/detectors/ood/llr.py +0 -302
  46. dataeval/detectors/ood/vae.py +0 -97
  47. dataeval/detectors/ood/vaegmm.py +0 -75
  48. dataeval/metrics/bias/metadata.py +0 -440
  49. dataeval/utils/lazy.py +0 -26
  50. dataeval/utils/tensorflow/__init__.py +0 -19
  51. dataeval/utils/tensorflow/_internal/gmm.py +0 -123
  52. dataeval/utils/tensorflow/_internal/loss.py +0 -121
  53. dataeval/utils/tensorflow/_internal/models.py +0 -1394
  54. dataeval/utils/tensorflow/_internal/trainer.py +0 -114
  55. dataeval/utils/tensorflow/_internal/utils.py +0 -256
  56. dataeval/utils/tensorflow/loss/__init__.py +0 -11
  57. dataeval-0.73.1.dist-info/RECORD +0 -73
  58. {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/LICENSE.txt +0 -0
  59. {dataeval-0.73.1.dist-info → dataeval-0.74.1.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/diversity.py

@@ -4,21 +4,15 @@ __all__ = ["DiversityOutput", "diversity"]
 
 import contextlib
 from dataclasses import dataclass
-from typing import Any, Literal, Mapping
+from typing import Any, Literal
 
 import numpy as np
+import scipy as sp
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.bias.metadata import (
-    CLASS_LABEL,
-    diversity_bar_plot,
-    entropy,
-    get_counts,
-    get_num_bins,
-    heatmap,
-    preprocess_metadata,
-)
-from dataeval.output import OutputMetadata, set_metadata
+from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
+from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
+from dataeval.output import Output, set_metadata
 from dataeval.utils.shared import get_method
 
 with contextlib.suppress(ImportError):
@@ -26,31 +20,31 @@ with contextlib.suppress(ImportError):
 
 
 @dataclass(frozen=True)
-class DiversityOutput(OutputMetadata):
+class DiversityOutput(Output):
     """
     Output class for :func:`diversity` :term:`bias<Bias>` metric
 
     Attributes
     ----------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         :term:`Diversity` index for classes and factors
-    classwise : NDArray[np.float64]
+    classwise : NDArray[np.double]
         Classwise diversity index [n_class x n_factor]
-    class_list : NDArray[np.int64]
-        Class labels for each value in the dataset
-    metadata_names : list[str]
+    factor_names : list[str]
         Names of each metadata factor
+    class_list : NDArray[Any]
+        Class labels for each value in the dataset
     """
 
-    diversity_index: NDArray[np.float64]
-    classwise: NDArray[np.float64]
+    diversity_index: NDArray[np.double]
+    classwise: NDArray[np.double]
+    factor_names: list[str]
     class_list: NDArray[Any]
-    metadata_names: list[str]
 
     def plot(
         self,
-        row_labels: ArrayLike | list[Any] | None = None,
-        col_labels: ArrayLike | list[Any] | None = None,
+        row_labels: ArrayLike | None = None,
+        col_labels: ArrayLike | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -69,7 +63,7 @@ class DiversityOutput(OutputMetadata):
             if row_labels is None:
                 row_labels = self.class_list
             if col_labels is None:
-                col_labels = self.metadata_names
+                col_labels = self.factor_names
 
             fig = heatmap(
                 self.classwise,
@@ -82,7 +76,7 @@
 
         else:
             # Creating label array for heat map axes
-            heat_labels = np.concatenate((["class"], self.metadata_names))
+            heat_labels = np.concatenate((["class"], self.factor_names))
 
             fig = diversity_bar_plot(heat_labels, self.diversity_index)
 
@@ -90,11 +84,9 @@
 
 
 def diversity_shannon(
-    data: NDArray[Any],
-    names: list[str],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
     """
     Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
     histogram binning, for continuous variables.
@@ -106,62 +98,31 @@ def diversity_shannon(
 
     Parameters
     ----------
-    data : NDArray
-        Array containing numerical values for metadata factors
-    names : list[str]
-        Names of metadata factors -- keys of the metadata dictionary
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in names that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in names.
-    subset_mask : NDArray[np.bool_] or None, default None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-    Note
-    ----
-    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
     Returns
     -------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         Diversity index per column of X
 
     See Also
     --------
-    numpy.histogram
+    scipy.stats.entropy
     """
-    hist_cache = {}
-
-    # entropy computed using global auto bins so that we can properly normalize
-    ent_unnormalized = entropy(
-        data,
-        names,
-        continuous_factor_bincounts,
-        normalized=False,
-        subset_mask=subset_mask,
-        hist_cache=hist_cache,
-    )
-    # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(
-        data,
-        names,
-        continuous_factor_bincounts=continuous_factor_bincounts,
-        subset_mask=subset_mask,
-        hist_cache=hist_cache,
-    )
-    ent_norm = np.empty(ent_unnormalized.shape)
-    ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
+    raw_entropy = sp.stats.entropy(counts, axis=0)
+    ent_norm = np.empty(raw_entropy.shape)
+    ent_norm[num_bins != 1] = raw_entropy[num_bins != 1] / np.log(num_bins[num_bins != 1])
     ent_norm[num_bins == 1] = 0
     return ent_norm
 
 
 def diversity_simpson(
-    data: NDArray[Any],
-    names: list[str],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
     """
     Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
     histogram binning, for continuous variables.
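Note on the rewritten diversity_shannon above: it now takes precomputed bin counts rather than raw metadata. scipy normalizes each column of counts to a probability vector, so Shannon entropy is taken directly over the counts and rescaled by log(num_bins) into an evenness score on [0, 1]. A minimal sketch of that path (the count values are illustrative, not from the package tests):

import numpy as np
import scipy as sp

counts = np.array([[10, 25],
                   [10, 5],
                   [10, 0]])            # 3 bins x 2 factors
num_bins = np.array([3, 2])             # bins actually populated per factor

raw = sp.stats.entropy(counts, axis=0)  # per-column Shannon entropy (nats)
ent_norm = np.empty(raw.shape)
ent_norm[num_bins != 1] = raw[num_bins != 1] / np.log(num_bins[num_bins != 1])
ent_norm[num_bins == 1] = 0             # a single-bin factor has zero diversity
print(ent_norm)                         # ~[1.0, 0.65]: uniform vs. skewed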
@@ -173,58 +134,38 @@ def diversity_simpson(
 
     Parameters
     ----------
-    data : NDArray
-        Array containing numerical values for metadata factors
-    names : list[str]
-        Names of metadata factors -- keys of the metadata dictionary
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in names that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in names.
-    subset_mask : NDArray[np.bool_] or None, default None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
     Note
     ----
-    For continuous variables, histogram bins are chosen automatically. See
-    numpy.histogram for details.
     If there is only one category, the diversity index takes a value of 0.
 
     Returns
     -------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         Diversity index per column of X
-
-    See Also
-    --------
-    numpy.histogram
     """
-    hist_cache = {}
-
-    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
-    # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)
-
-    ev_index = np.empty(len(names))
+    ev_index = np.empty(counts.shape[1])
     # loop over columns for convenience
-    for col, cnts in enumerate(hist_counts.values()):
+    for col, cnts in enumerate(counts.T):
         # relative frequencies
         p_i = cnts / np.sum(cnts)
-        # inverse Simpson index normalized by (number of bins)
-        s_0 = 1 / np.sum(p_i**2)  # / num_bins[col]
+        # inverse Simpson index
+        s_0 = 1 / np.sum(p_i**2)
         if num_bins[col] == 1:
             ev_index[col] = 0
         else:
+            # normalized by number of bins
            ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
     return ev_index
 
 
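The inverse Simpson index s_0 = 1 / sum(p_i**2) computed above is the effective number of equally common bins, so (s_0 - 1) / (num_bins - 1) rescales it onto [0, 1]: 0 when a factor occupies a single bin, 1 when counts are uniform. A quick worked check (illustrative values only):

import numpy as np

cnts = np.array([10, 10, 10])            # bin counts for one factor
p_i = cnts / np.sum(cnts)                # relative frequencies, 1/3 each
s_0 = 1 / np.sum(p_i**2)                 # inverse Simpson index -> 3.0
evenness = (s_0 - 1) / (cnts.size - 1)   # -> 1.0 for the uniform case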
-@set_metadata()
+@set_metadata
 def diversity(
-    class_labels: ArrayLike,
-    metadata: Mapping[str, ArrayLike],
-    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    metadata: MetadataOutput,
     method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
@@ -238,23 +179,13 @@ def diversity(
 
     Parameters
     ----------
-    class_labels : ArrayLike
-        List of class labels for each image
-    metadata : Mapping[str, ArrayLike]
-        Dict of list of metadata factors for each image
-    continuous_factor_bincounts : Mapping[str, int] or None, default None
-        The factors in metadata that have continuous values and the array of bin counts to
-        discretize values into. All factors are treated as having discrete values unless they
-        are specified as keys in this dictionary. Each element of this array must occur as a key
-        in metadata.
-    method : {"simpson", "shannon"}, default "simpson"
-        Indicates which diversity index should be computed
+    metadata : MetadataOutput
+        Output after running `metadata_preprocessing`
 
     Note
     ----
-    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-    - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
+    - If there is only one category, the diversity index takes a value of 0.
 
     Returns
     -------
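The q=1 remark in the Note refers to the Hill-number family behind both indices; for context (standard diversity-index math, not text from the package):

$$ {}^{q}D = \Big(\sum_{i=1}^{N} p_i^{\,q}\Big)^{1/(1-q)}, \qquad \lim_{q \to 1} {}^{q}D = \exp\Big(-\sum_{i=1}^{N} p_i \ln p_i\Big) $$

Here q = 2 gives the inverse Simpson index 1/Σ p_i² used by diversity_simpson, while the q → 1 limit is the exponentiated Shannon entropy that diversity_shannon normalizes.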
@@ -266,42 +197,42 @@
     -------
     Compute Simpson diversity index of metadata and class labels
 
-    >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
+    >>> div_simp = diversity(metadata, method="simpson")
     >>> div_simp.diversity_index
-    array([0.72413793, 0.72413793, 0.88636364])
+    array([0.72413793, 0.88636364, 0.72413793])
 
     >>> div_simp.classwise
-    array([[0.68965517, 0.69230769],
-           [0.8       , 1.        ]])
+    array([[0.69230769, 0.68965517],
+           [0.5       , 0.8       ]])
 
     Compute Shannon diversity index of metadata and class labels
 
-    >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
+    >>> div_shan = diversity(metadata, method="shannon")
     >>> div_shan.diversity_index
-    array([0.8812909 , 0.8812909 , 0.96748876])
+    array([0.8812909 , 0.96748876, 0.8812909 ])
 
     >>> div_shan.classwise
-    array([[0.86312057, 0.91651644],
-           [0.91829583, 1.        ]])
+    array([[0.91651644, 0.86312057],
+           [0.68260619, 0.91829583]])
 
     See Also
     --------
-    numpy.histogram
+    scipy.stats.entropy
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
-    diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
+    discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
+    cnts = get_counts(discretized_data)
+    num_bins = np.bincount(np.nonzero(cnts)[1])
+    diversity_index = diversity_fn(cnts, num_bins)
 
-    class_idx = names.index(CLASS_LABEL)
-    class_lbl = data[:, class_idx]
+    class_lbl = metadata.class_labels
 
     u_classes = np.unique(class_lbl)
-    num_factors = len(names)
-    diversity = np.empty((len(u_classes), num_factors))
-    diversity[:] = np.nan
+    num_factors = len(metadata.discrete_factor_names)
+    classwise_div = np.full((len(u_classes), num_factors), np.nan)
     for idx, cls in enumerate(u_classes):
         subset_mask = class_lbl == cls
-        diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
-    div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
+        cls_cnts = get_counts(metadata.discrete_data[subset_mask], min_num_bins=cnts.shape[0])
+        classwise_div[idx, :] = diversity_fn(cls_cnts, num_bins[1:])
 
-    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))
+    return DiversityOutput(diversity_index, classwise_div, metadata.discrete_factor_names, metadata.class_names)
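Taken together, the hunks above change the public call pattern: diversity no longer takes raw labels, metadata, and bincounts, but a single MetadataOutput produced by the new preprocessing step. A hedged migration sketch (the raw_metadata and class_labels inputs and the "altitude" factor are hypothetical placeholders; the call shapes follow the docstrings in this diff):

from dataeval.metrics.bias.diversity import diversity
from dataeval.metrics.bias.metadata_preprocessing import metadata_preprocessing

# 0.73.x: div = diversity(class_labels, metadata, continuous_factor_bincounts)
# 0.74.x: bin and merge the raw metadata first, then pass the output through.
md = metadata_preprocessing(
    raw_metadata,                            # iterable of per-image metadata dicts
    class_labels,                            # label array, or a key into the dicts
    continuous_factor_bins={"altitude": 5},  # hypothetical continuous factor
)
div = diversity(md, method="simpson")
fig = div.plot(plot_classwise=True)          # heatmap of the classwise indices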
dataeval/metrics/bias/metadata_preprocessing.py (new file)

@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+__all__ = ["MetadataOutput", "metadata_preprocessing"]
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Iterable, Literal, Mapping, TypeVar
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.stats import wasserstein_distance as wd
+
+from dataeval.interop import as_numpy, to_numpy
+from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import merge_metadata
+
+TNum = TypeVar("TNum", int, float)
+DISCRETE_MIN_WD = 0.054
+CONTINUOUS_MIN_SAMPLE_SIZE = 20
+
+
+@dataclass(frozen=True)
+class MetadataOutput(Output):
+    """
+    Output class for :func:`metadata_binning` function
+
+    Attributes
+    ----------
+    discrete_factor_names : list[str]
+        List containing factor names for the original data that was discrete and the binned continuous data
+    discrete_data : NDArray[np.int]
+        Array containing values for the original data that was discrete and the binned continuous data
+    continuous_factor_names : list[str]
+        List containing factor names for the original continuous data
+    continuous_data : NDArray[np.int or np.double] | None
+        Array containing values for the original continuous data or None if there was no continuous data
+    class_labels : NDArray[np.int]
+        Numerical class labels for the images/objects
+    class_names : NDArray[Any]
+        Array of unique class names (for use with plotting)
+    total_num_factors : int
+        Sum of discrete_factor_names and continuous_factor_names plus 1 for class
+    """
+
+    discrete_factor_names: list[str]
+    discrete_data: NDArray[np.int_]
+    continuous_factor_names: list[str]
+    continuous_data: NDArray[np.int_ | np.double] | None
+    class_labels: NDArray[np.int_]
+    class_names: NDArray[Any]
+    total_num_factors: int
+
+
+@set_metadata
+def metadata_preprocessing(
+    raw_metadata: Iterable[Mapping[str, Any]],
+    class_labels: ArrayLike | str,
+    continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
+    auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
+    exclude: Iterable[str] | None = None,
+) -> MetadataOutput:
+    """
+    Restructures the metadata to be in the correct format for the bias functions.
+
+    This identifies whether the incoming metadata is discrete or continuous,
+    and whether the data is already binned or still needs binning.
+    It accepts a list of dictionaries containing the per image metadata and
+    automatically adjusts for multiple targets in an image.
+
+    Parameters
+    ----------
+    raw_metadata : Iterable[Mapping[str, Any]]
+        Iterable collection of metadata dictionaries to flatten and merge.
+    class_labels : ArrayLike or string or None
+        If arraylike, expects the labels for each image (image classification) or each object (object detection).
+        If the labels are included in the metadata dictionary, pass in the key value.
+    continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
+        User provided dictionary specifying how to bin the continuous metadata factors
+    auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
+        Method by which the function will automatically bin continuous metadata factors. It is recommended
+        that the user provide the bins through the `continuous_factor_bins`.
+    exclude : Iterable[str] or None, default None
+        User provided collection of metadata keys to exclude when processing metadata.
+
+    Returns
+    -------
+    MetadataOutput
+        Output class containing the binned metadata
+    """
+    # Transform metadata into single, flattened dictionary
+    metadata, image_repeats = merge_metadata(raw_metadata)
+
+    # Drop any excluded metadata keys
+    if exclude:
+        for k in list(metadata):
+            if k in exclude:
+                metadata.pop(k)
+
+    # Get the class label array in numeric form
+    class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
+    if class_array.ndim > 1:
+        raise ValueError(
+            f"Got class labels with {class_array.ndim}-dimensional "
+            f"shape {class_array.shape}, but expected a 1-dimensional array."
+        )
+    if not np.issubdtype(class_array.dtype, np.int_):
+        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
+    else:
+        numerical_labels = class_array
+        unique_classes = np.unique(class_array)
+
+    # Bin according to user supplied bins
+    continuous_metadata = {}
+    discrete_metadata = {}
+    if continuous_factor_bins is not None and continuous_factor_bins != {}:
+        invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
+        if invalid_keys:
+            raise KeyError(
+                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
+                "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
+                "or add corresponding entries to the `metadata` dictionary."
+            )
+        for factor, grouping in continuous_factor_bins.items():
+            discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
+            continuous_metadata[factor] = metadata[factor]
+
+    # Determine category of the rest of the keys
+    remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
+    for key in remaining_keys:
+        data = to_numpy(metadata[key])
+        if np.issubdtype(data.dtype, np.number):
+            result = _is_continuous(data, image_repeats)
+            if result:
+                continuous_metadata[key] = data
+            unique_samples, ordinal_data = np.unique(data, return_inverse=True)
+            if unique_samples.size <= np.max([20, data.size * 0.01]):
+                discrete_metadata[key] = ordinal_data
+            else:
+                warnings.warn(
+                    f"A user defined binning was not provided for {key}. "
+                    f"Using the {auto_bin_method} method to discretize the data. "
+                    "It is recommended that the user rerun and supply the desired "
+                    "bins using the continuous_factor_bins parameter.",
+                    UserWarning,
+                )
+                discrete_metadata[key] = _binning_function(data, auto_bin_method)
+        else:
+            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
+
+    # splitting out the dictionaries into the keys and values
+    discrete_factor_names = list(discrete_metadata.keys())
+    discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
+    continuous_factor_names = list(continuous_metadata.keys())
+    continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
+    total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
+
+    return MetadataOutput(
+        discrete_factor_names,
+        discrete_data,
+        continuous_factor_names,
+        continuous_data,
+        numerical_labels,
+        unique_classes,
+        total_num_factors,
+    )
+
+
+def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
+    """
+    Digitizes a list of values into a given number of bins.
+
+    Parameters
+    ----------
+    data : list | NDArray
+        The values to be digitized.
+    binning : int | list[tuple[TNum, TNum]]
+        The number of bins for the discrete values that data will be digitized into.
+
+    Returns
+    -------
+    NDArray[np.intp]
+        The digitized values
+    """
+
+    if not np.all([np.issubdtype(type(n), np.number) for n in data]):
+        raise TypeError(
+            "Encountered a data value with non-numeric type when digitizing a factor. "
+            "Ensure all occurrences of continuous factors are numeric types."
+        )
+    if type(binning) is int:
+        _, bin_edges = np.histogram(data, bins=binning)
+        bin_edges[-1] = np.inf
+        bin_edges[0] = -np.inf
+    else:
+        bin_edges = binning
+    return np.digitize(data, bin_edges)
+
+
+def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
+    """
+    Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
+    """
+    if bin_method == "clusters":
+        # bin_edges = _binning_by_clusters(data)
+        warnings.warn(
+            "Binning by clusters is currently unavailable until changes to the clustering function go through.",
+            UserWarning,
+        )
+        bin_method = "uniform_width"
+
+    if bin_method != "clusters":
+        counts, bin_edges = np.histogram(data, bins="auto")
+        n_bins = counts.size
+        if counts[counts > 0].min() < 10:
+            for _ in range(20):
+                n_bins -= 1
+                counts, bin_edges = np.histogram(data, bins=n_bins)
+                if counts[counts > 0].min() >= 10 or n_bins < 2:
+                    break
+
+        if bin_method == "uniform_count":
+            quantiles = np.linspace(0, 100, n_bins + 1)
+            bin_edges = np.asarray(np.percentile(data, quantiles))
+
+    bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
+    bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
+    return np.digitize(data, bin_edges)  # type: ignore
+
+
+def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
+    """
+    Determines whether the data is continuous or discrete using the Wasserstein distance.
+
+    Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
+    a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
+    we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
+    neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
+    ignored, depending on context). These normalized locations will be much more uniformly distributed
+    for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
+    Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
+
+    The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
+    how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
+    least 20 points, and furthermore at least half as many points as there are discrete values, we can
+    reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
+    measured from a uniform distribution is greater or less than 0.054, respectively.
+    """
+    # Check if the metadata is image specific
+    _, data_indicies_unsorted = np.unique(data, return_index=True)
+    if data_indicies_unsorted.size == image_indicies.size:
+        data_indicies = np.sort(data_indicies_unsorted)
+        if (data_indicies == image_indicies).all():
+            data = data[data_indicies]
+
+    # OLD METHOD
+    # uvals = np.unique(data)
+    # pct_unique = uvals.size / data.size
+    # return pct_unique < threshold
+
+    n_examples = len(data)
+
+    if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
+        warnings.warn(
+            f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
+        )
+        return False
+
+    # Require at least 3 unique values before bothering with NNN
+    xu = np.unique(data, axis=None)
+    if xu.size < 3:
+        return False
+
+    Xs = np.sort(data)
+
+    X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
+
+    dx = np.zeros(n_examples - 2)  # no dx at end points
+    gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
+    dx[np.logical_not(gtz)] = 0.0
+
+    dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
+
+    shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
+
+    return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
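The _is_continuous docstring above describes the Normalized Near Neighbor (NNN) test; here is a standalone sketch of the same math, trimmed of the image-repeat handling (the threshold is copied from the constants above, and the sample data is invented for illustration):

import numpy as np
from scipy.stats import wasserstein_distance as wd

DISCRETE_MIN_WD = 0.054  # same cutoff as metadata_preprocessing.py

def looks_continuous(data):
    xs = np.sort(np.asarray(data, dtype=float))
    x0, x1 = xs[:-2], xs[2:]                # left/right neighbors of interior points
    dx = np.zeros(xs.size - 2)
    gtz = (x1 - x0) > 0                     # duplicates would divide by zero
    dx[gtz] = (xs[1:-1] - x0)[gtz] / (x1 - x0)[gtz]  # NNN samples on [0, 1]
    shift = wd(dx, np.linspace(0, 1, dx.size))       # distance from uniform
    return shift < DISCRETE_MIN_WD

rng = np.random.default_rng(0)
print(looks_continuous(rng.normal(size=200)))          # smooth draw: expect True
print(looks_continuous(rng.integers(0, 5, size=200)))  # 5 discrete values: expect False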