dataeval 0.74.1__py3-none-any.whl → 0.75.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. dataeval/__init__.py +33 -10
  2. dataeval/detectors/__init__.py +2 -2
  3. dataeval/detectors/drift/__init__.py +14 -12
  4. dataeval/detectors/drift/base.py +1 -1
  5. dataeval/detectors/drift/cvm.py +1 -1
  6. dataeval/detectors/drift/ks.py +1 -1
  7. dataeval/detectors/drift/mmd.py +6 -5
  8. dataeval/detectors/drift/torch.py +12 -12
  9. dataeval/detectors/drift/uncertainty.py +3 -2
  10. dataeval/detectors/linters/__init__.py +4 -4
  11. dataeval/detectors/linters/clusterer.py +2 -7
  12. dataeval/detectors/linters/duplicates.py +6 -10
  13. dataeval/detectors/linters/outliers.py +4 -2
  14. dataeval/detectors/ood/__init__.py +3 -10
  15. dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
  16. dataeval/detectors/ood/base.py +64 -161
  17. dataeval/detectors/ood/metadata_ks_compare.py +34 -42
  18. dataeval/detectors/ood/metadata_least_likely.py +3 -3
  19. dataeval/detectors/ood/metadata_ood_mi.py +6 -5
  20. dataeval/detectors/ood/mixin.py +146 -0
  21. dataeval/detectors/ood/output.py +63 -0
  22. dataeval/interop.py +16 -3
  23. dataeval/log.py +18 -0
  24. dataeval/metrics/__init__.py +2 -2
  25. dataeval/metrics/bias/__init__.py +9 -12
  26. dataeval/metrics/bias/balance.py +10 -8
  27. dataeval/metrics/bias/coverage.py +52 -4
  28. dataeval/metrics/bias/diversity.py +42 -14
  29. dataeval/metrics/bias/parity.py +15 -12
  30. dataeval/metrics/estimators/__init__.py +2 -2
  31. dataeval/metrics/estimators/ber.py +3 -1
  32. dataeval/metrics/estimators/divergence.py +1 -1
  33. dataeval/metrics/estimators/uap.py +1 -1
  34. dataeval/metrics/stats/__init__.py +18 -18
  35. dataeval/metrics/stats/base.py +4 -4
  36. dataeval/metrics/stats/boxratiostats.py +8 -9
  37. dataeval/metrics/stats/datasetstats.py +10 -14
  38. dataeval/metrics/stats/dimensionstats.py +4 -4
  39. dataeval/metrics/stats/hashstats.py +12 -8
  40. dataeval/metrics/stats/labelstats.py +5 -5
  41. dataeval/metrics/stats/pixelstats.py +4 -9
  42. dataeval/metrics/stats/visualstats.py +4 -9
  43. dataeval/output.py +1 -1
  44. dataeval/utils/__init__.py +4 -13
  45. dataeval/utils/dataset/__init__.py +7 -0
  46. dataeval/utils/{torch → dataset}/datasets.py +2 -0
  47. dataeval/utils/dataset/read.py +63 -0
  48. dataeval/utils/dataset/split.py +527 -0
  49. dataeval/utils/image.py +2 -2
  50. dataeval/utils/metadata.py +310 -5
  51. dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +1 -104
  52. dataeval/utils/torch/__init__.py +2 -17
  53. dataeval/utils/torch/gmm.py +29 -6
  54. dataeval/utils/torch/{utils.py → internal.py} +82 -58
  55. dataeval/utils/torch/models.py +10 -8
  56. dataeval/utils/torch/trainer.py +6 -85
  57. dataeval/workflows/__init__.py +2 -5
  58. dataeval/workflows/sufficiency.py +16 -6
  59. dataeval-0.75.0.dist-info/METADATA +136 -0
  60. dataeval-0.75.0.dist-info/RECORD +67 -0
  61. dataeval/detectors/ood/base_torch.py +0 -109
  62. dataeval/metrics/bias/metadata_preprocessing.py +0 -285
  63. dataeval/utils/gmm.py +0 -26
  64. dataeval/utils/split_dataset.py +0 -492
  65. dataeval-0.74.1.dist-info/METADATA +0 -120
  66. dataeval-0.74.1.dist-info/RECORD +0 -65
  67. {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/LICENSE.txt +0 -0
  68. {dataeval-0.74.1.dist-info → dataeval-0.75.0.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/metadata_preprocessing.py DELETED
@@ -1,285 +0,0 @@
- from __future__ import annotations
-
- __all__ = ["MetadataOutput", "metadata_preprocessing"]
-
- import warnings
- from dataclasses import dataclass
- from typing import Any, Iterable, Literal, Mapping, TypeVar
-
- import numpy as np
- from numpy.typing import ArrayLike, NDArray
- from scipy.stats import wasserstein_distance as wd
-
- from dataeval.interop import as_numpy, to_numpy
- from dataeval.output import Output, set_metadata
- from dataeval.utils.metadata import merge_metadata
-
- TNum = TypeVar("TNum", int, float)
- DISCRETE_MIN_WD = 0.054
- CONTINUOUS_MIN_SAMPLE_SIZE = 20
-
-
- @dataclass(frozen=True)
- class MetadataOutput(Output):
-     """
-     Output class for :func:`metadata_binning` function
-
-     Attributes
-     ----------
-     discrete_factor_names : list[str]
-         List containing factor names for the original data that was discrete and the binned continuous data
-     discrete_data : NDArray[np.int]
-         Array containing values for the original data that was discrete and the binned continuous data
-     continuous_factor_names : list[str]
-         List containing factor names for the original continuous data
-     continuous_data : NDArray[np.int or np.double] | None
-         Array containing values for the original continuous data or None if there was no continuous data
-     class_labels : NDArray[np.int]
-         Numerical class labels for the images/objects
-     class_names : NDArray[Any]
-         Array of unique class names (for use with plotting)
-     total_num_factors : int
-         Sum of discrete_factor_names and continuous_factor_names plus 1 for class
-     """
-
-     discrete_factor_names: list[str]
-     discrete_data: NDArray[np.int_]
-     continuous_factor_names: list[str]
-     continuous_data: NDArray[np.int_ | np.double] | None
-     class_labels: NDArray[np.int_]
-     class_names: NDArray[Any]
-     total_num_factors: int
-
-
- @set_metadata
- def metadata_preprocessing(
-     raw_metadata: Iterable[Mapping[str, Any]],
-     class_labels: ArrayLike | str,
-     continuous_factor_bins: Mapping[str, int | list[tuple[TNum, TNum]]] | None = None,
-     auto_bin_method: Literal["uniform_width", "uniform_count", "clusters"] = "uniform_width",
-     exclude: Iterable[str] | None = None,
- ) -> MetadataOutput:
-     """
-     Restructures the metadata to be in the correct format for the bias functions.
-
-     This identifies whether the incoming metadata is discrete or continuous,
-     and whether the data is already binned or still needs binning.
-     It accepts a list of dictionaries containing the per image metadata and
-     automatically adjusts for multiple targets in an image.
-
-     Parameters
-     ----------
-     raw_metadata : Iterable[Mapping[str, Any]]
-         Iterable collection of metadata dictionaries to flatten and merge.
-     class_labels : ArrayLike or string or None
-         If arraylike, expects the labels for each image (image classification) or each object (object detection).
-         If the labels are included in the metadata dictionary, pass in the key value.
-     continuous_factor_bins : Mapping[str, int] or Mapping[str, list[tuple[TNum, TNum]]] or None, default None
-         User provided dictionary specifying how to bin the continuous metadata factors
-     auto_bin_method : "uniform_width" or "uniform_count" or "clusters", default "uniform_width"
-         Method by which the function will automatically bin continuous metadata factors. It is recommended
-         that the user provide the bins through the `continuous_factor_bins`.
-     exclude : Iterable[str] or None, default None
-         User provided collection of metadata keys to exclude when processing metadata.
-
-     Returns
-     -------
-     MetadataOutput
-         Output class containing the binned metadata
-     """
-     # Transform metadata into single, flattened dictionary
-     metadata, image_repeats = merge_metadata(raw_metadata)
-
-     # Drop any excluded metadata keys
-     if exclude:
-         for k in list(metadata):
-             if k in exclude:
-                 metadata.pop(k)
-
-     # Get the class label array in numeric form
-     class_array = as_numpy(metadata.pop(class_labels)) if isinstance(class_labels, str) else as_numpy(class_labels)
-     if class_array.ndim > 1:
-         raise ValueError(
-             f"Got class labels with {class_array.ndim}-dimensional "
-             f"shape {class_array.shape}, but expected a 1-dimensional array."
-         )
-     if not np.issubdtype(class_array.dtype, np.int_):
-         unique_classes, numerical_labels = np.unique(class_array, return_inverse=True)
-     else:
-         numerical_labels = class_array
-         unique_classes = np.unique(class_array)
-
-     # Bin according to user supplied bins
-     continuous_metadata = {}
-     discrete_metadata = {}
-     if continuous_factor_bins is not None and continuous_factor_bins != {}:
-         invalid_keys = set(continuous_factor_bins.keys()) - set(metadata.keys())
-         if invalid_keys:
-             raise KeyError(
-                 f"The keys - {invalid_keys} - are present in the `continuous_factor_bins` dictionary "
-                 "but are not keys in the `metadata` dictionary. Delete these keys from `continuous_factor_bins` "
-                 "or add corresponding entries to the `metadata` dictionary."
-             )
-         for factor, grouping in continuous_factor_bins.items():
-             discrete_metadata[factor] = _user_defined_bin(metadata[factor], grouping)
-             continuous_metadata[factor] = metadata[factor]
-
-     # Determine category of the rest of the keys
-     remaining_keys = set(metadata.keys()) - set(continuous_metadata.keys())
-     for key in remaining_keys:
-         data = to_numpy(metadata[key])
-         if np.issubdtype(data.dtype, np.number):
-             result = _is_continuous(data, image_repeats)
-             if result:
-                 continuous_metadata[key] = data
-             unique_samples, ordinal_data = np.unique(data, return_inverse=True)
-             if unique_samples.size <= np.max([20, data.size * 0.01]):
-                 discrete_metadata[key] = ordinal_data
-             else:
-                 warnings.warn(
-                     f"A user defined binning was not provided for {key}. "
-                     f"Using the {auto_bin_method} method to discretize the data. "
-                     "It is recommended that the user rerun and supply the desired "
-                     "bins using the continuous_factor_bins parameter.",
-                     UserWarning,
-                 )
-                 discrete_metadata[key] = _binning_function(data, auto_bin_method)
-         else:
-             _, discrete_metadata[key] = np.unique(data, return_inverse=True)
-
-     # splitting out the dictionaries into the keys and values
-     discrete_factor_names = list(discrete_metadata.keys())
-     discrete_data = np.stack(list(discrete_metadata.values()), axis=-1)
-     continuous_factor_names = list(continuous_metadata.keys())
-     continuous_data = np.stack(list(continuous_metadata.values()), axis=-1) if continuous_metadata else None
-     total_num_factors = len(discrete_factor_names + continuous_factor_names) + 1
-
-     return MetadataOutput(
-         discrete_factor_names,
-         discrete_data,
-         continuous_factor_names,
-         continuous_data,
-         numerical_labels,
-         unique_classes,
-         total_num_factors,
-     )
-
-
- def _user_defined_bin(data: list[Any] | NDArray[Any], binning: int | list[tuple[TNum, TNum]]) -> NDArray[np.intp]:
-     """
-     Digitizes a list of values into a given number of bins.
-
-     Parameters
-     ----------
-     data : list | NDArray
-         The values to be digitized.
-     binning : int | list[tuple[TNum, TNum]]
-         The number of bins for the discrete values that data will be digitized into.
-
-     Returns
-     -------
-     NDArray[np.intp]
-         The digitized values
-     """
-
-     if not np.all([np.issubdtype(type(n), np.number) for n in data]):
-         raise TypeError(
-             "Encountered a data value with non-numeric type when digitizing a factor. "
-             "Ensure all occurrences of continuous factors are numeric types."
-         )
-     if type(binning) is int:
-         _, bin_edges = np.histogram(data, bins=binning)
-         bin_edges[-1] = np.inf
-         bin_edges[0] = -np.inf
-     else:
-         bin_edges = binning
-     return np.digitize(data, bin_edges)
-
-
- def _binning_function(data: NDArray[Any], bin_method: str) -> NDArray[np.int_]:
-     """
-     Bins continuous data through either equal width bins, equal amounts in each bin, or by clusters.
-     """
-     if bin_method == "clusters":
-         # bin_edges = _binning_by_clusters(data)
-         warnings.warn(
-             "Binning by clusters is currently unavailable until changes to the clustering function go through.",
-             UserWarning,
-         )
-         bin_method = "uniform_width"
-
-     if bin_method != "clusters":
-         counts, bin_edges = np.histogram(data, bins="auto")
-         n_bins = counts.size
-         if counts[counts > 0].min() < 10:
-             for _ in range(20):
-                 n_bins -= 1
-                 counts, bin_edges = np.histogram(data, bins=n_bins)
-                 if counts[counts > 0].min() >= 10 or n_bins < 2:
-                     break
-
-         if bin_method == "uniform_count":
-             quantiles = np.linspace(0, 100, n_bins + 1)
-             bin_edges = np.asarray(np.percentile(data, quantiles))
-
-     bin_edges[0] = -np.inf  # type: ignore # until the clusters speed up is merged
-     bin_edges[-1] = np.inf  # type: ignore # and the _binning_by_clusters can be uncommented
-     return np.digitize(data, bin_edges)  # type: ignore
-
-
- def _is_continuous(data: NDArray[np.number], image_indicies: NDArray[np.number]) -> bool:
-     """
-     Determines whether the data is continuous or discrete using the Wasserstein distance.
-
-     Given a 1D sample, we consider the intervals between adjacent points. For a continuous distribution,
-     a point is equally likely to lie anywhere in the interval bounded by its two neighbors. Furthermore,
-     we can put all "between neighbor" locations on the same scale of 0 to 1 by subtracting the smaller
-     neighbor and dividing out the length of the interval. (Duplicates are either assigned to zero or
-     ignored, depending on context). These normalized locations will be much more uniformly distributed
-     for continuous data than for discrete, and this gives us a way to distinguish them. Call this the
-     Normalized Near Neighbor distribution (NNN), defined on the interval [0,1].
-
-     The Wasserstein distance is available in scipy.stats.wasserstein_distance. We can use it to measure
-     how close the NNN is to a uniform distribution over [0,1]. We found that as long as a sample has at
-     least 20 points, and furthermore at least half as many points as there are discrete values, we can
-     reliably distinguish discrete from continuous samples by testing that the Wasserstein distance
-     measured from a uniform distribution is greater or less than 0.054, respectively.
-     """
-     # Check if the metadata is image specific
-     _, data_indicies_unsorted = np.unique(data, return_index=True)
-     if data_indicies_unsorted.size == image_indicies.size:
-         data_indicies = np.sort(data_indicies_unsorted)
-         if (data_indicies == image_indicies).all():
-             data = data[data_indicies]
-
-     # OLD METHOD
-     # uvals = np.unique(data)
-     # pct_unique = uvals.size / data.size
-     # return pct_unique < threshold
-
-     n_examples = len(data)
-
-     if n_examples < CONTINUOUS_MIN_SAMPLE_SIZE:
-         warnings.warn(
-             f"All samples look discrete with so few data points (< {CONTINUOUS_MIN_SAMPLE_SIZE})", UserWarning
-         )
-         return False
-
-     # Require at least 3 unique values before bothering with NNN
-     xu = np.unique(data, axis=None)
-     if xu.size < 3:
-         return False
-
-     Xs = np.sort(data)
-
-     X0, X1 = Xs[0:-2], Xs[2:]  # left and right neighbors
-
-     dx = np.zeros(n_examples - 2)  # no dx at end points
-     gtz = (X1 - X0) > 0  # check for dups; dx will be zero for them
-     dx[np.logical_not(gtz)] = 0.0
-
-     dx[gtz] = (Xs[1:-1] - X0)[gtz] / (X1 - X0)[gtz]  # the core idea: dx is NNN samples.
-
-     shift = wd(dx, np.linspace(0, 1, dx.size))  # how far is dx from uniform, for this feature?
-
-     return shift < DISCRETE_MIN_WD  # if NNN is close enough to uniform, consider the sample continuous.
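
Note: the `_is_continuous` helper removed above classifies a metadata factor as continuous by building its Normalized Near Neighbor (NNN) distribution and measuring its Wasserstein distance from a uniform distribution on [0, 1]. Below is a rough standalone sketch of that heuristic, not a dataeval API: `looks_continuous` is a hypothetical name, while the 0.054 threshold and 20-sample minimum are the constants from the deleted module.

import numpy as np
from scipy.stats import wasserstein_distance

DISCRETE_MIN_WD = 0.054          # threshold used by the removed _is_continuous
CONTINUOUS_MIN_SAMPLE_SIZE = 20  # minimum sample size used by the removed module

def looks_continuous(values) -> bool:
    # Sort the sample and map each interior point onto [0, 1] by its position
    # between its left and right neighbors -- these are the "NNN" samples.
    xs = np.sort(np.asarray(values, dtype=float))
    if xs.size < CONTINUOUS_MIN_SAMPLE_SIZE or np.unique(xs).size < 3:
        return False
    left, right = xs[:-2], xs[2:]
    span = right - left
    nnn = np.zeros(xs.size - 2)
    ok = span > 0  # duplicate neighbors give zero-width intervals; leave those at 0
    nnn[ok] = (xs[1:-1] - left)[ok] / span[ok]
    # Continuous data yields NNN values close to uniform on [0, 1], so a small
    # Wasserstein distance from a uniform grid means "continuous".
    return wasserstein_distance(nnn, np.linspace(0, 1, nnn.size)) < DISCRETE_MIN_WD

# Typically a 200-point Gaussian sample comes out True, while
# np.repeat(np.arange(5), 40) (five repeated values) comes out False.

The file list above shows dataeval/utils/metadata.py gaining roughly 300 lines in this release, which is likely where this preprocessing now lives.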
dataeval/utils/gmm.py DELETED
@@ -1,26 +0,0 @@
- from dataclasses import dataclass
- from typing import Generic, TypeVar
-
- TGMMData = TypeVar("TGMMData")
-
-
- @dataclass
- class GaussianMixtureModelParams(Generic[TGMMData]):
-     """
-     phi : TGMMData
-         Mixture component distribution weights.
-     mu : TGMMData
-         Mixture means.
-     cov : TGMMData
-         Mixture covariance.
-     L : TGMMData
-         Cholesky decomposition of `cov`.
-     log_det_cov : TGMMData
-         Log of the determinant of `cov`.
-     """
-
-     phi: TGMMData
-     mu: TGMMData
-     cov: TGMMData
-     L: TGMMData
-     log_det_cov: TGMMData
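
The deleted `GaussianMixtureModelParams` was a plain generic container for fitted GMM parameters; the file list above shows the torch-side module dataeval/utils/torch/gmm.py changing (+29 -6) in the same release. As a loose illustration only (a renamed stand-in with made-up toy values, not package code), populating such a container with NumPy arrays for a 2-component, 3-feature mixture could look like this:

from dataclasses import dataclass
from typing import Generic, TypeVar

import numpy as np

T = TypeVar("T")

@dataclass
class GMMParams(Generic[T]):  # stand-in for the removed GaussianMixtureModelParams
    phi: T           # mixture component weights
    mu: T            # component means
    cov: T           # component covariance matrices
    L: T             # Cholesky factors of cov
    log_det_cov: T   # log-determinants of cov

k, d = 2, 3                      # two components, three features
cov = np.stack([np.eye(d)] * k)  # identity covariances as a toy example
params = GMMParams(
    phi=np.full(k, 1 / k),
    mu=np.zeros((k, d)),
    cov=cov,
    L=np.linalg.cholesky(cov),              # batched Cholesky over the leading axis
    log_det_cov=np.linalg.slogdet(cov)[1],  # log-determinant per component
)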