dataeval 0.73.0__py3-none-any.whl → 0.74.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dataeval/__init__.py +3 -3
  2. dataeval/detectors/__init__.py +1 -1
  3. dataeval/detectors/drift/__init__.py +1 -1
  4. dataeval/detectors/drift/base.py +2 -2
  5. dataeval/detectors/drift/torch.py +1 -101
  6. dataeval/detectors/linters/clusterer.py +1 -1
  7. dataeval/detectors/ood/__init__.py +11 -4
  8. dataeval/detectors/ood/ae.py +2 -1
  9. dataeval/detectors/ood/ae_torch.py +70 -0
  10. dataeval/detectors/ood/aegmm.py +4 -3
  11. dataeval/detectors/ood/base.py +58 -108
  12. dataeval/detectors/ood/base_tf.py +109 -0
  13. dataeval/detectors/ood/base_torch.py +109 -0
  14. dataeval/detectors/ood/llr.py +2 -2
  15. dataeval/detectors/ood/metadata_ks_compare.py +53 -14
  16. dataeval/detectors/ood/vae.py +3 -2
  17. dataeval/detectors/ood/vaegmm.py +5 -4
  18. dataeval/metrics/bias/__init__.py +3 -0
  19. dataeval/metrics/bias/balance.py +77 -64
  20. dataeval/metrics/bias/coverage.py +12 -12
  21. dataeval/metrics/bias/diversity.py +74 -114
  22. dataeval/metrics/bias/metadata_preprocessing.py +285 -0
  23. dataeval/metrics/bias/metadata_utils.py +229 -0
  24. dataeval/metrics/bias/parity.py +54 -158
  25. dataeval/utils/__init__.py +2 -2
  26. dataeval/utils/gmm.py +26 -0
  27. dataeval/utils/metadata.py +29 -9
  28. dataeval/utils/shared.py +1 -1
  29. dataeval/utils/split_dataset.py +12 -6
  30. dataeval/utils/tensorflow/_internal/gmm.py +4 -24
  31. dataeval/utils/torch/datasets.py +2 -2
  32. dataeval/utils/torch/gmm.py +98 -0
  33. dataeval/utils/torch/models.py +192 -0
  34. dataeval/utils/torch/trainer.py +84 -5
  35. dataeval/utils/torch/utils.py +107 -1
  36. dataeval/workflows/__init__.py +1 -1
  37. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/METADATA +1 -2
  38. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/RECORD +40 -34
  39. dataeval/metrics/bias/metadata.py +0 -358
  40. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/LICENSE.txt +0 -0
  41. {dataeval-0.73.0.dist-info → dataeval-0.74.0.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/diversity.py

@@ -4,19 +4,14 @@ __all__ = ["DiversityOutput", "diversity"]
 
 import contextlib
 from dataclasses import dataclass
-from typing import Any, Literal, Mapping
+from typing import Any, Literal
 
 import numpy as np
+import scipy as sp
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.bias.metadata import (
-    diversity_bar_plot,
-    entropy,
-    get_counts,
-    get_num_bins,
-    heatmap,
-    preprocess_metadata,
-)
+from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
+from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
 from dataeval.output import OutputMetadata, set_metadata
 from dataeval.utils.shared import get_method
 
@@ -31,26 +26,25 @@ class DiversityOutput(OutputMetadata):
 
     Attributes
     ----------
-    diversity_index : NDArray[np.float64]
+    diversity_index : NDArray[np.double]
         :term:`Diversity` index for classes and factors
-    classwise : NDArray[np.float64]
+    classwise : NDArray[np.double]
         Classwise diversity index [n_class x n_factor]
-    class_list: NDArray[np.int64]
-        Class labels for each value in the dataset
-    metadata_names: list[str]
+    factor_names : list[str]
         Names of each metadata factor
+    class_list : NDArray[Any]
+        Class labels for each value in the dataset
     """
 
-    diversity_index: NDArray[np.float64]
-    classwise: NDArray[np.float64]
+    diversity_index: NDArray[np.double]
+    classwise: NDArray[np.double]
+    factor_names: list[str]
     class_list: NDArray[Any]
-    metadata_names: list[str]
-    method: Literal["shannon", "simpson"]
 
     def plot(
         self,
-        row_labels: list[Any] | NDArray[Any] | None = None,
-        col_labels: list[Any] | NDArray[Any] | None = None,
+        row_labels: ArrayLike | None = None,
+        col_labels: ArrayLike | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -58,9 +52,9 @@ class DiversityOutput(OutputMetadata):
 
         Parameters
         ----------
-        row_labels : ArrayLike | None, default None
+        row_labels : ArrayLike or None, default None
            List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike | None, default None
+        col_labels : ArrayLike or None, default None
            List/Array containing the labels for columns in the histogram
        plot_classwise : bool, default False
            Whether to plot per-class balance instead of global balance
@@ -69,7 +63,7 @@ class DiversityOutput(OutputMetadata):
            if row_labels is None:
                row_labels = self.class_list
            if col_labels is None:
-                col_labels = self.metadata_names
+                col_labels = self.factor_names
 
            fig = heatmap(
                self.classwise,
@@ -77,12 +71,12 @@ class DiversityOutput(OutputMetadata):
                col_labels,
                xlabel="Factors",
                ylabel="Class",
-                cbarlabel=f"Normalized {self.method.title()} Index",
+                cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
            )
 
        else:
            # Creating label array for heat map axes
-            heat_labels = np.concatenate((["class"], self.metadata_names))
+            heat_labels = np.concatenate((["class"], self.factor_names))
 
            fig = diversity_bar_plot(heat_labels, self.diversity_index)
 
@@ -90,11 +84,9 @@
 
 
 def diversity_shannon(
-    data: NDArray[Any],
-    names: list[str],
-    is_categorical: list[bool],
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
    """
    Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
    histogram binning, for continuous variables.
@@ -106,46 +98,31 @@ def diversity_shannon(
 
    Parameters
    ----------
-    data: NDArray
-        Array containing numerical values for metadata factors
-    names: list[str]
-        Names of metadata factors -- keys of the metadata dictionary
-    is_categorical: list[bool]
-        List of flags to identify whether variables are categorical (True) or
-        continuous (False)
-    subset_mask: NDArray[np.bool_] | None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-    Note
-    ----
-    For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
    Returns
    -------
-    diversity_index: NDArray
+    diversity_index : NDArray[np.double]
        Diversity index per column of X
 
    See Also
    --------
-    numpy.histogram
+    scipy.stats.entropy
    """
-
-    # entropy computed using global auto bins so that we can properly normalize
-    ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
-    # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
-    ent_norm = np.empty(ent_unnormalized.shape)
-    ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
+    raw_entropy = sp.stats.entropy(counts, axis=0)
+    ent_norm = np.empty(raw_entropy.shape)
+    ent_norm[num_bins != 1] = raw_entropy[num_bins != 1] / np.log(num_bins[num_bins != 1])
    ent_norm[num_bins == 1] = 0
    return ent_norm
 
 
 def diversity_simpson(
-    data: NDArray[Any],
-    names: list[str],
-    is_categorical: list[bool],
-    subset_mask: NDArray[np.bool_] | None = None,
-) -> NDArray[np.float64]:
+    counts: NDArray[np.int_],
+    num_bins: NDArray[np.int_],
+) -> NDArray[np.double]:
    """
    Compute :term:`diversity<Diversity>` for discrete/categorical variables and, through standard
    histogram binning, for continuous variables.
@@ -157,53 +134,39 @@ def diversity_simpson(
 
    Parameters
    ----------
-    data: NDArray
-        Array containing numerical values for metadata factors
-    names: list[str]
-        Names of metadata factors -- keys of the metadata dictionary
-    is_categorical: list[bool]
-        List of flags to identify whether variables are categorical (True) or
-        continuous (False)
-    subset_mask: NDArray[np.bool_] | None
-        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+    counts : NDArray[np.int_]
+        Array containing bin counts for each factor
+    num_bins : NDArray[np.int_]
+        Number of bins with values for each factor
 
    Note
    ----
-    For continuous variables, histogram bins are chosen automatically. See
-    numpy.histogram for details.
    If there is only one category, the diversity index takes a value of 0.
 
    Returns
    -------
-    NDArray
+    diversity_index : NDArray[np.double]
        Diversity index per column of X
-
-    See Also
-    --------
-    numpy.histogram
    """
-
-    hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
-    # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names, is_categorical)
-
-    ev_index = np.empty(len(names))
+    ev_index = np.empty(counts.shape[1])
    # loop over columns for convenience
-    for col, cnts in enumerate(hist_counts.values()):
+    for col, cnts in enumerate(counts.T):
        # relative frequencies
-        p_i = cnts / cnts.sum()
-        # inverse Simpson index normalized by (number of bins)
-        s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+        p_i = cnts / np.sum(cnts)
+        # inverse Simpson index
+        s_0 = 1 / np.sum(p_i**2)
        if num_bins[col] == 1:
            ev_index[col] = 0
        else:
-            ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
+            # normalized by number of bins
+            ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
    return ev_index
 
 
 @set_metadata()
 def diversity(
-    class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
+    metadata: MetadataOutput,
+    method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
    """
    Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -216,18 +179,13 @@ def diversity(
 
    Parameters
    ----------
-    class_labels: ArrayLike
-        List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
-        Dict of list of metadata factors for each image
-    method: Literal["shannon", "simpson"], default "simpson"
-        Indicates which diversity index should be computed
+    metadata : MetadataOutput
+        Output after running `metadata_preprocessing`
 
    Note
    ----
-    - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
    - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-    - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
+    - If there is only one category, the diversity index takes a value of 0.
 
    Returns
    -------
@@ -239,40 +197,42 @@
    -------
    Compute Simpson diversity index of metadata and class labels
 
-    >>> div_simp = diversity(class_labels, metadata, method="simpson")
+    >>> div_simp = diversity(metadata, method="simpson")
    >>> div_simp.diversity_index
-    array([0.18103448, 0.18103448, 0.88636364])
+    array([0.72413793, 0.88636364, 0.72413793])
 
    >>> div_simp.classwise
-    array([[0.17241379, 0.39473684],
-           [0.2       , 0.2       ]])
+    array([[0.69230769, 0.68965517],
+           [0.5       , 0.8       ]])
 
    Compute Shannon diversity index of metadata and class labels
 
-    >>> div_shan = diversity(class_labels, metadata, method="shannon")
+    >>> div_shan = diversity(metadata, method="shannon")
    >>> div_shan.diversity_index
-    array([0.37955133, 0.37955133, 0.96748876])
+    array([0.8812909 , 0.96748876, 0.8812909 ])
 
    >>> div_shan.classwise
-    array([[0.43156028, 0.83224889],
-           [0.57938016, 0.57938016]])
+    array([[0.91651644, 0.86312057],
+           [0.68260619, 0.91829583]])
 
    See Also
    --------
-    numpy.histogram
+    scipy.stats.entropy
    """
    diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
-    diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
-
-    class_idx = names.index("class_label")
-    u_classes = np.unique(data[:, class_idx])
-    num_factors = len(names)
-    diversity = np.empty((len(u_classes), num_factors))
-    diversity[:] = np.nan
+    discretized_data = np.hstack((metadata.class_labels[:, np.newaxis], metadata.discrete_data))
+    cnts = get_counts(discretized_data)
+    num_bins = np.bincount(np.nonzero(cnts)[1])
+    diversity_index = diversity_fn(cnts, num_bins)
+
+    class_lbl = metadata.class_labels
+
+    u_classes = np.unique(class_lbl)
+    num_factors = len(metadata.discrete_factor_names)
+    classwise_div = np.full((len(u_classes), num_factors), np.nan)
    for idx, cls in enumerate(u_classes):
-        subset_mask = data[:, class_idx] == cls
-        diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
-    div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
+        subset_mask = class_lbl == cls
+        cls_cnts = get_counts(metadata.discrete_data[subset_mask], min_num_bins=cnts.shape[0])
+        classwise_div[idx, :] = diversity_fn(cls_cnts, num_bins[1:])
 
-    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()), method)
+    return DiversityOutput(diversity_index, classwise_div, metadata.discrete_factor_names, metadata.class_names)
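The rewritten diversity_shannon and diversity_simpson above now operate on a precomputed bin-count matrix rather than raw metadata: Shannon entropy is normalized by the log of the number of populated bins per factor, and the inverse Simpson index is rescaled to [0, 1] as (1/sum(p_i**2) - 1) / (num_bins - 1). The following standalone sketch mirrors those two normalizations on a toy count matrix; the variable names and values are illustrative and not part of dataeval.

import numpy as np
from scipy.stats import entropy

# toy bin-count matrix: one row per bin, one column per metadata factor
counts = np.array([[4, 9],
                   [4, 1],
                   [2, 0]])
num_bins = np.count_nonzero(counts, axis=0)  # populated bins per factor -> [3, 2]

# Shannon: per-factor entropy divided by log(populated bins); defined as 0 when only one bin is used
shannon = np.where(num_bins > 1, entropy(counts, axis=0) / np.log(np.maximum(num_bins, 2)), 0.0)

# Simpson: inverse Simpson index rescaled by the number of populated bins
p_i = counts / counts.sum(axis=0)
simpson = np.where(num_bins > 1, (1.0 / np.sum(p_i**2, axis=0) - 1.0) / np.maximum(num_bins - 1, 1), 0.0)

print(shannon, simpson)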
dataeval/metrics/bias/metadata_preprocessing.py

@@ -0,0 +1,285 @@
+from __future__ import annotations
+
+__all__ = ["MetadataOutput", "metadata_preprocessing"]
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Iterable, Literal, Mapping, TypeVar
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+from scipy.stats import wasserstein_distance as wd
+
+from dataeval.interop import as_numpy, to_numpy
+from dataeval.output import OutputMetadata, set_metadata
+from dataeval.utils.metadata import merge_metadata
+
+TNum = TypeVar("TNum", int, float)
+DISCRETE_MIN_WD = 0.054
+CONTINUOUS_MIN_SAMPLE_SIZE = 20
+
+
+@dataclass(frozen=True)
+class MetadataOutput(OutputMetadata):
+    """
+    Output class for :func:`metadata_binning` function
+
+    Attributes
+    ----------
+    discrete_factor_names : list[str]
+        List containing factor names for the original data that was discrete and the binned continuous data
+    discrete_data : NDArray[np.int]
+        Array containing values for the original data that was discrete and the binned continuous data
+    continuous_factor_names : list[str]
+        List containing factor names for the original continuous data
+    continuous_data : NDArray[np.int or np.double] | None
+        Array containing values for the original continuous data or None if there was no continuous data
+    class_labels : NDArray[np.int]
+        Numerical class labels for the images/objects
+    class_names : NDArray[Any]
+        Array of unique class names (for use with plotting)
+    total_num_factors : int
+        Sum of discrete_factor_names and continuous_factor_names plus 1 for class
+    """
+
+    discrete_factor_names: list[str]
+    discrete_data: NDArray[np.int_]
+    continuous_factor_names: list[str]
+    continuous_data: NDArray[np.int_ | np.double] | None
+    class_labels: NDArray[np.int_]
+    class_names: NDArray[Any]
+    total_num_factors: int
+
+
+@set_metadata()
+def metadata_preprocessing(
+    raw_metadata: Iterable[Mapping[str, Any]],
+    class_labels: ArrayLike | str,
+    continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
+    auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
+    exclude: Iterable[str] | None = None,
+) -> MetadataOutput:
+    """
+    Restructures the metadata to be in the correct format for the bias functions.
+
+    This identifies whether the incoming metadata is discrete or continuous,
+    and whether the data is already binned or still needs binning.
+    It accepts a list of dictionaries containing the per image metadata and
+    automatically adjusts for multiple targets in an image.
+
+    Parameters
+    ----------
+    raw_metadata : Iterable[Mapping[str, Any]]
+        Iterable collection of metadata dictionaries to flatten and merge.
+    class_labels : ArrayLike or string or None
+        If arraylike, expects the labels for each image (image classification) or each object (object detection).
+        If the labels are included in the metadata dictionary, pass in the key value.
+    continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
+        User provided dictionary specifying how to bin the continuous metadata factors
+    auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
+        Method by which the function will automatically bin continuous metadata factors. It is recommended
+        that the user provide the bins through the `continuous_factor_bins`.
+    exclude : Iterable[str] or None, default None
+        User provided collection of metadata keys to exclude when processing metadata.
+
+    Returns
+    -------
+    MetadataOutput
+        Output class containing the binned metadata
+    """
+    # Transform metadata into single, flattened dictionary
+    metadata, image_repeats = merge_metadata(raw_metadata)
+
+    # Drop any excluded metadata keys
+    if exclude:
+        for k in list(metadata):
+            if k in exclude:
+                metadata.pop(k)
+
+    # Get the class label array in numeric form
+    class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
+    if class_array.ndim > 1:
+        raise ValueError(
+            f"Got class labels with {class_array.ndim}-dimensional "
+            f"shape {class_array.shape}, but expected a 1-dimensional array."
+        )
+    if not np.issubdtype(class_array.dtype, np.int_):
+        unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
+    else:
+        numerical_labels = class_array
+        unique_classes = np.unique(class_array)
+
+    # Bin according to user supplied bins
+    continuous_metadata = {}
+    discrete_metadata = {}
+    if continuous_factor_bins is not None and continuous_factor_bins != {}:
+        invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
+        if invalid_keys:
+            raise KeyError(
+                f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
+                "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
+                "or add corresponding entries to the `metadata` dictionary."
+            )
+        for factor, grouping in continuous_factor_bins.items():
+            discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
+            continuous_metadata[factor] = metadata[factor]
+
+    # Determine category of the rest of the keys
+    remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
+    for key in remaining_keys:
+        data = to_numpy(metadata[key])
+        if np.issubdtype(data.dtype, np.number):
+            result = _is_continuous(data, image_repeats)
+            if result:
+                continuous_metadata[key] = data
+            unique_samples, ordinal_data = np.unique(data, return_inverse=True)
+            if unique_samples.size <= np.max([20, data.size * 0.01]):
+                discrete_metadata[key] = ordinal_data
+            else:
+                warnings.warn(
+                    f"A user defined binning was not provided for {key}. "
+                    f"Using the {auto_bin_method} method to discretize the data. "
+                    "It is recommended that the user rerun and supply the desired "
+                    "bins using the continuous_factor_bins parameter.",
+                    UserWarning,
+                )
+                discrete_metadata[key] = _binning_function(data, auto_bin_method)
+        else:
+            _, discrete_metadata[key] = np.unique(data, return_inverse=True)
+
+    # splitting out the dictionaries into the keys and values
+    discrete_factor_names = list(discrete_metadata.keys())
+    discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
+    continuous_factor_names = list(continuous_metadata.keys())
+    continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
+    total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
+
+    return MetadataOutput(
+        discrete_factor_names,
+        discrete_data,
+        continuous_factor_names,
+        continuous_data,
+        numerical_labels,
+        unique_classes,
+        total_num_factors,
+    )
+
+
+def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
+    """
+    Digitizes a list of values into a given number of bins.
+
+    Parameters
+    ----------
+    data : list | NDArray
+        The values to be digitized.
+    binning : int | list[tuple[TNum, TNum]]
+        The number of bins for the discrete values that data will be digitized into.
+
+    Returns
+    -------
+    NDArray[np.intp]
+        The digitized values
+    """
+
+    if not np.all([np.issubdtype(type(n), np.number) for n in data]):
+        raise TypeError(
+            "Encountered a data value with non-numeric type when digitizing a factor. "
+            "Ensure all occurrences of continuous factors are numeric types."
+        )
+    if type(binning) is int:
+        _, bin_edges = np.histogram(data, bins=binning)
+        bin_edges[-1] = np.inf
+        bin_edges[0] = -np.inf
+    else:
+        bin_edges = binning
+    return np.digitize(data, bin_edges)
+
+
+def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
+    """
+    Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
+    """
+    if bin_method == "clusters":
+        # bin_edges = _binning_by_clusters(data)
+        warnings.warn(
+            "Binning by clusters is currently unavailable until changes to the clustering function go through.",
+            UserWarning,
+        )
+        bin_method = "uniform_width"
+
+    if bin_method != "clusters":
+        counts, bin_edges = np.histogram(data, bins="auto")
+        n_bins = counts.size
+        if counts[counts > 0].min() < 10:
+            for _ in range(20):
+                n_bins -= 1
+                counts, bin_edges = np.histogram(data, bins=n_bins)
+                if counts[counts > 0].min() >= 10 or n_bins < 2:
+                    break
+
+        if bin_method == "uniform_count":
+            quantiles = np.linspace(0, 100, n_bins + 1)
+            bin_edges = np.asarray(np.percentile(data, quantiles))
+
+    bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
+    bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
+    return np.digitize(data, bin_edges)  # type: ignore
+
+
+def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
+    """
+    Determines whether the data is continuous or discrete using the Wasserstein distance.
+
+    Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
+    a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
+    we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
+    neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
+    ignored, depending on context). These normalized locations will be much more uniformly distributed
+    for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
+    Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
+
+    The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
+    how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
+    least 20 points, and furthermore at least half as many points as there are discrete values, we can
+    reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
+    measured from a uniform distribution is greater or less than 0.054, respectively.
+    """
+    # Check if the metadata is image specific
+    _, data_indicies_unsorted = np.unique(data, return_index=True)
+    if data_indicies_unsorted.size == image_indicies.size:
+        data_indicies = np.sort(data_indicies_unsorted)
+        if (data_indicies == image_indicies).all():
+            data = data[data_indicies]
+
+    # OLD METHOD
+    # uvals = np.unique(data)
+    # pct_unique = uvals.size / data.size
+    # return pct_unique < threshold
+
+    n_examples = len(data)
+
+    if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
+        warnings.warn(
+            f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
+        )
+        return False
+
+    # Require at least 3 unique values before bothering with NNN
+    xu = np.unique(data, axis=None)
+    if xu.size < 3:
+        return False
+
+    Xs = np.sort(data)
+
+    X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
+
+    dx = np.zeros(n_examples - 2)  # no dx at end points
+    gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
+    dx[np.logical_not(gtz)] = 0.0
+
+    dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
+
+    shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
+
+    return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
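For reference, the _is_continuous helper above classifies a factor as continuous when its normalized-near-neighbor (NNN) distribution is close to uniform on [0, 1], as measured by the Wasserstein distance against the 0.054 threshold. Below is a self-contained sketch of that test on toy data; the function name looks_continuous and the sample sizes are illustrative and not part of the dataeval API.

import numpy as np
from scipy.stats import wasserstein_distance

def looks_continuous(x, threshold=0.054):
    """Return True when the NNN distribution of x is close to uniform on [0, 1]."""
    xs = np.sort(np.asarray(x, dtype=float))
    if xs.size < 20 or np.unique(xs).size < 3:
        return False  # too few samples or unique values to judge
    left, right = xs[:-2], xs[2:]                 # neighbors of each interior point
    span = right - left
    nnn = np.zeros(xs.size - 2)                   # duplicates keep a 0 entry
    ok = span > 0
    nnn[ok] = (xs[1:-1] - left)[ok] / span[ok]    # position of each point within its neighbor interval
    return wasserstein_distance(nnn, np.linspace(0, 1, nnn.size)) < threshold

rng = np.random.default_rng(0)
print(looks_continuous(rng.normal(size=500)))     # continuous draw -> typically True
print(looks_continuous(rng.integers(0, 5, 500)))  # handful of discrete values -> typically False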