dataeval 0.69.4__py3-none-any.whl → 0.70.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. dataeval/__init__.py +8 -8
  2. dataeval/_internal/datasets.py +235 -131
  3. dataeval/_internal/detectors/clusterer.py +2 -0
  4. dataeval/_internal/detectors/drift/base.py +7 -8
  5. dataeval/_internal/detectors/drift/mmd.py +4 -4
  6. dataeval/_internal/detectors/duplicates.py +64 -45
  7. dataeval/_internal/detectors/merged_stats.py +23 -54
  8. dataeval/_internal/detectors/ood/ae.py +8 -6
  9. dataeval/_internal/detectors/ood/aegmm.py +6 -4
  10. dataeval/_internal/detectors/ood/base.py +12 -7
  11. dataeval/_internal/detectors/ood/llr.py +6 -4
  12. dataeval/_internal/detectors/ood/vae.py +5 -3
  13. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  14. dataeval/_internal/detectors/outliers.py +137 -63
  15. dataeval/_internal/interop.py +11 -7
  16. dataeval/_internal/metrics/balance.py +13 -11
  17. dataeval/_internal/metrics/ber.py +5 -3
  18. dataeval/_internal/metrics/coverage.py +4 -0
  19. dataeval/_internal/metrics/divergence.py +9 -5
  20. dataeval/_internal/metrics/diversity.py +14 -12
  21. dataeval/_internal/metrics/parity.py +32 -22
  22. dataeval/_internal/metrics/stats/base.py +231 -0
  23. dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  24. dataeval/_internal/metrics/stats/datasetstats.py +99 -0
  25. dataeval/_internal/metrics/stats/dimensionstats.py +113 -0
  26. dataeval/_internal/metrics/stats/hashstats.py +75 -0
  27. dataeval/_internal/metrics/stats/labelstats.py +125 -0
  28. dataeval/_internal/metrics/stats/pixelstats.py +119 -0
  29. dataeval/_internal/metrics/stats/visualstats.py +124 -0
  30. dataeval/_internal/metrics/uap.py +8 -4
  31. dataeval/_internal/metrics/utils.py +30 -15
  32. dataeval/_internal/models/pytorch/autoencoder.py +5 -5
  33. dataeval/_internal/models/tensorflow/pixelcnn.py +1 -4
  34. dataeval/_internal/output.py +3 -18
  35. dataeval/_internal/utils.py +11 -16
  36. dataeval/_internal/workflows/sufficiency.py +152 -151
  37. dataeval/detectors/__init__.py +4 -0
  38. dataeval/detectors/drift/__init__.py +8 -3
  39. dataeval/detectors/drift/kernels/__init__.py +4 -0
  40. dataeval/detectors/drift/updates/__init__.py +4 -0
  41. dataeval/detectors/linters/__init__.py +15 -4
  42. dataeval/detectors/ood/__init__.py +14 -2
  43. dataeval/metrics/__init__.py +5 -0
  44. dataeval/metrics/bias/__init__.py +13 -4
  45. dataeval/metrics/estimators/__init__.py +8 -8
  46. dataeval/metrics/stats/__init__.py +25 -3
  47. dataeval/utils/__init__.py +16 -3
  48. dataeval/utils/tensorflow/__init__.py +11 -0
  49. dataeval/utils/torch/__init__.py +12 -0
  50. dataeval/utils/torch/datasets/__init__.py +7 -0
  51. dataeval/workflows/__init__.py +6 -2
  52. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/METADATA +12 -4
  53. dataeval-0.70.1.dist-info/RECORD +80 -0
  54. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/WHEEL +1 -1
  55. dataeval/_internal/flags.py +0 -77
  56. dataeval/_internal/metrics/stats.py +0 -397
  57. dataeval/flags/__init__.py +0 -3
  58. dataeval/tensorflow/__init__.py +0 -3
  59. dataeval/torch/__init__.py +0 -3
  60. dataeval-0.69.4.dist-info/RECORD +0 -74
  61. /dataeval/{tensorflow → utils/tensorflow}/loss/__init__.py +0 -0
  62. /dataeval/{tensorflow → utils/tensorflow}/models/__init__.py +0 -0
  63. /dataeval/{tensorflow → utils/tensorflow}/recon/__init__.py +0 -0
  64. /dataeval/{torch → utils/torch}/models/__init__.py +0 -0
  65. /dataeval/{torch → utils/torch}/trainer/__init__.py +0 -0
  66. {dataeval-0.69.4.dist-info → dataeval-0.70.1.dist-info}/LICENSE.txt +0 -0
@@ -1,39 +1,47 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
- from typing import Iterable, Literal, Sequence, cast
5
- from warnings import warn
4
+ from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
6
5
 
7
6
  import numpy as np
8
7
  from numpy.typing import ArrayLike, NDArray
9
8
 
10
9
  from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
11
- from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
12
- from dataeval._internal.metrics.stats import StatsOutput, imagestats
10
+ from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
11
+ from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
12
+ from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
13
+ from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
14
+ from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
13
15
  from dataeval._internal.output import OutputMetadata, set_metadata
14
16
 
15
17
  IndexIssueMap = dict[int, dict[str, float]]
16
- DatasetIndexIssueMap = dict[int, IndexIssueMap]
17
- """
18
- Mapping of image indices to a dictionary of issue types and calculated values
19
- """
18
+ OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
19
+ TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
20
20
 
21
21
 
22
22
  @dataclass(frozen=True)
23
- class OutliersOutput(OutputMetadata):
23
+ class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
24
24
  """
25
+ Output class for :class:`Outliers` lint detector
26
+
25
27
  Attributes
26
28
  ----------
27
- issues : dict[int, dict[str, float]] | dict[int, dict[int, dict[str, float]]]
29
+ issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
28
30
  Indices of image outliers with their associated issue type and calculated values.
29
31
 
30
32
  - For a single dataset, a dictionary containing the indices of outliers and
31
33
  a dictionary showing the issues and calculated values for the given index.
32
- - For multiple datasets, a map of dataset indices to the indices of outliers
33
- and their associated issues and calculated values.
34
+ - For multiple stats outputs, a list of dictionaries containing the indices of
35
+ outliers and their associated issues and calculated values.
34
36
  """
35
37
 
36
- issues: IndexIssueMap | DatasetIndexIssueMap
38
+ issues: TIndexIssueMap
39
+
40
+ def __len__(self):
41
+ if isinstance(self.issues, dict):
42
+ return len(self.issues)
43
+ else:
44
+ return sum(len(d) for d in self.issues)
37
45
 
38
46
 
39
47
  def _get_outlier_mask(
@@ -43,7 +51,7 @@ def _get_outlier_mask(
43
51
  threshold = threshold if threshold else 3.0
44
52
  std = np.std(values)
45
53
  abs_diff = np.abs(values - np.mean(values))
46
- return (abs_diff / std) > threshold
54
+ return std != 0 and (abs_diff / std) > threshold
47
55
  elif method == "modzscore":
48
56
  threshold = threshold if threshold else 3.5
49
57
  abs_diff = np.abs(values - np.median(values))
@@ -65,9 +73,6 @@ class Outliers:
65
73
 
66
74
  Parameters
67
75
  ----------
68
- flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
69
- Metric(s) to calculate for each image - calculates all metrics if None
70
- Only supports ImageStat.ALL_STATS
71
76
  outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
72
77
  Statistical method used to identify outliers
73
78
  outlier_threshold : float, optional - default None
@@ -76,15 +81,15 @@ class Outliers:
76
81
 
77
82
  Attributes
78
83
  ----------
79
- stats : dict[str, Any]
80
- Dictionary to hold the value of each metric for each image
84
+ stats : tuple[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
85
+ Various stats output classes that hold the value of each metric for each image
81
86
 
82
87
  See Also
83
88
  --------
84
89
  Duplicates
85
90
 
86
- Notes
87
- ------
91
+ Note
92
+ ----
88
93
  There are 3 different statistical methods:
89
94
 
90
95
  - zscore
@@ -109,52 +114,61 @@ class Outliers:
109
114
 
110
115
  >>> outliers = Outliers()
111
116
 
112
- Specifying specific metrics to analyze:
113
-
114
- >>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)
115
-
116
117
  Specifying an outlier method:
117
118
 
118
119
  >>> outliers = Outliers(outlier_method="iqr")
119
120
 
120
121
  Specifying an outlier method and threshold:
121
122
 
122
- >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=2.75)
123
+ >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
123
124
  """
124
125
 
125
126
  def __init__(
126
127
  self,
127
- flags: ImageStat = ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS,
128
+ use_dimension: bool = True,
129
+ use_pixel: bool = True,
130
+ use_visual: bool = True,
128
131
  outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
129
132
  outlier_threshold: float | None = None,
130
133
  ):
131
- verify_supported(flags, ImageStat.ALL_STATS)
132
- self.flags = flags
134
+ self.stats: DatasetStatsOutput
135
+ self.use_dimension = use_dimension
136
+ self.use_pixel = use_pixel
137
+ self.use_visual = use_visual
133
138
  self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
134
139
  self.outlier_threshold = outlier_threshold
135
140
 
136
- def _get_outliers(self) -> dict:
137
- flagged_images = {}
138
- stats_dict = self.stats.dict()
139
- supported = to_distinct(ImageStat.ALL_STATS)
140
- for stat, values in stats_dict.items():
141
- if stat in supported.values() and values.ndim == 1 and np.std(values) != 0:
142
- mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
141
+ def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
142
+ flagged_images: dict[int, dict[str, float]] = {}
143
+ for stat, values in stats.items():
144
+ if stat in (SOURCE_INDEX, BOX_COUNT):
145
+ continue
146
+ if values.ndim == 1:
147
+ mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
143
148
  indices = np.flatnonzero(mask)
144
149
  for i, value in zip(indices, values[mask]):
145
- flagged_images.setdefault(i, {}).update({stat: np.round(value, 2)})
150
+ flagged_images.setdefault(i, {}).update({stat: value})
146
151
 
147
152
  return dict(sorted(flagged_images.items()))
148
153
 
149
- @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
150
- def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> OutliersOutput:
154
+ @overload
155
+ def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
156
+
157
+ @overload
158
+ def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
159
+
160
+ @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
161
+ def from_stats(
162
+ self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
163
+ ) -> OutliersOutput:
151
164
  """
152
165
  Returns indices of outliers with the issues identified for each
153
166
 
154
167
  Parameters
155
168
  ----------
156
- data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput | Sequence[StatsOutput]
157
- A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
169
+ stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
170
+ The output(s) from a dimensionstats, pixelstats, or visualstats metric
171
+ analysis or an aggregate DatasetStatsOutput
158
172
 
159
173
  Returns
160
174
  -------
@@ -162,36 +176,96 @@ class Outliers:
162
176
  Output class containing the indices of outliers and a dictionary showing
163
177
  the issues and calculated values for the given index.
164
178
 
179
+ See Also
180
+ --------
181
+ dimensionstats
182
+ pixelstats
183
+ visualstats
184
+
165
185
  Example
166
186
  -------
167
187
  Evaluate the dataset:
168
188
 
169
- >>> outliers.evaluate(images)
170
- OutliersOutput(issues={10: {'blurriness': 1.26, 'contrast': 1.06, 'zeros': 0.05}, 12: {'blurriness': 1.51, 'contrast': 1.06, 'zeros': 0.05}})
189
+ >>> results = outliers.from_stats([stats1, stats2])
190
+ >>> len(results)
191
+ 2
192
+ >>> results.issues[0]
193
+ {10: {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}}
194
+ >>> results.issues[1]
195
+ {}
171
196
  """ # noqa: E501
172
- stats, dataset_steps = combine_stats(data)
173
-
174
- if isinstance(stats, StatsOutput):
175
- selected_flags = set(to_distinct(self.flags).values())
176
- provided = set(stats.dict())
177
- missing = selected_flags - provided
178
- if missing:
179
- warn(
180
- f"StatsOutput provided {provided} and is missing {missing} \
181
- from the selected stat flags: {selected_flags}."
197
+ if isinstance(stats, DatasetStatsOutput):
198
+ outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
199
+ return OutliersOutput(outliers)
200
+
201
+ if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
202
+ return OutliersOutput(self._get_outliers(stats.dict()))
203
+
204
+ if not isinstance(stats, Sequence):
205
+ raise TypeError(
206
+ "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
207
+ )
208
+
209
+ stats_map: dict[type, list[int]] = {}
210
+ for i, stats_output in enumerate(stats):
211
+ if not isinstance(
212
+ stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
213
+ ):
214
+ raise TypeError(
215
+ "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
182
216
  )
183
- self.stats = stats
184
- else:
185
- self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
186
-
187
- outliers = self._get_outliers()
217
+ stats_map.setdefault(type(stats_output), []).append(i)
188
218
 
189
- # split up results from combined dataset into individual dataset buckets
190
- if dataset_steps:
191
- out_dict = {}
219
+ output_list: list[dict[int, dict[str, float]]] = [{} for _ in stats]
220
+ for _, indices in stats_map.items():
221
+ substats, dataset_steps = combine_stats([stats[i] for i in indices])
222
+ outliers = self._get_outliers(substats.dict())
192
223
  for idx, issue in outliers.items():
193
224
  k, v = get_dataset_step_from_idx(idx, dataset_steps)
194
- out_dict.setdefault(k, {})[v] = issue
195
- outliers = out_dict
225
+ output_list[indices[k]][v] = issue
226
+
227
+ return OutliersOutput(output_list)
228
+
229
+ @set_metadata(
230
+ "dataeval.detectors",
231
+ [
232
+ "use_dimension",
233
+ "use_pixel",
234
+ "use_visual",
235
+ "outlier_method",
236
+ "outlier_threshold",
237
+ ],
238
+ )
239
+ def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
240
+ """
241
+ Returns indices of outliers with the issues identified for each
242
+
243
+ Parameters
244
+ ----------
245
+ data : Iterable[ArrayLike], shape - (C, H, W)
246
+ A dataset of images in an ArrayLike format
196
247
 
248
+ Returns
249
+ -------
250
+ OutliersOutput
251
+ Output class containing the indices of outliers and a dictionary showing
252
+ the issues and calculated values for the given index.
253
+
254
+ Example
255
+ -------
256
+ Evaluate the dataset:
257
+
258
+ >>> results = outliers.evaluate(images)
259
+ >>> list(results.issues)
260
+ [10, 12]
261
+ >>> results.issues[10]
262
+ {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
263
+ """
264
+ self.stats = datasetstats(
265
+ images=data,
266
+ use_dimension=self.use_dimension,
267
+ use_pixel=self.use_pixel,
268
+ use_visual=self.use_visual,
269
+ )
270
+ outliers = self._get_outliers({k: v for o in self.stats.outputs() for k, v in o.dict().items()})
197
271
  return OutliersOutput(outliers)
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from importlib import import_module
4
- from typing import Iterable
4
+ from typing import Any, Iterable, Iterator
5
5
 
6
6
  import numpy as np
7
7
  from numpy.typing import ArrayLike, NDArray
@@ -22,24 +22,28 @@ def try_import(module_name):
22
22
  return module
23
23
 
24
24
 
25
- def to_numpy(array: ArrayLike | None) -> NDArray:
25
+ def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
26
+ return to_numpy(array, copy=False)
27
+
28
+
29
+ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
26
30
  if array is None:
27
31
  return np.ndarray([])
28
32
 
29
33
  if isinstance(array, np.ndarray):
30
- return array
34
+ return array.copy() if copy else array
31
35
 
32
36
  tf = try_import("tensorflow")
33
37
  if tf and tf.is_tensor(array):
34
- return array.numpy() # type: ignore
38
+ return array.numpy().copy() if copy else array.numpy() # type: ignore
35
39
 
36
40
  torch = try_import("torch")
37
41
  if torch and isinstance(array, torch.Tensor):
38
- return array.detach().cpu().numpy() # type: ignore
42
+ return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy() # type: ignore
39
43
 
40
- return np.asarray(array)
44
+ return np.array(array, copy=copy)
41
45
 
42
46
 
43
- def to_numpy_iter(iterable: Iterable[ArrayLike]):
47
+ def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
44
48
  for array in iterable:
45
49
  yield to_numpy(array)
@@ -2,10 +2,10 @@ from __future__ import annotations
2
2
 
3
3
  import warnings
4
4
  from dataclasses import dataclass
5
- from typing import Sequence
5
+ from typing import Mapping
6
6
 
7
7
  import numpy as np
8
- from numpy.typing import NDArray
8
+ from numpy.typing import ArrayLike, NDArray
9
9
  from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
10
10
 
11
11
  from dataeval._internal.metrics.utils import entropy, preprocess_metadata
@@ -15,6 +15,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
15
15
  @dataclass(frozen=True)
16
16
  class BalanceOutput(OutputMetadata):
17
17
  """
18
+ Output class for :func:`balance` bias metric
19
+
18
20
  Attributes
19
21
  ----------
20
22
  balance : NDArray[np.float64]
@@ -51,16 +53,16 @@ def validate_num_neighbors(num_neighbors: int) -> int:
51
53
 
52
54
 
53
55
  @set_metadata("dataeval.metrics")
54
- def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: int = 5) -> BalanceOutput:
56
+ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
55
57
  """
56
58
  Mutual information (MI) between factors (class label, metadata, label/image properties)
57
59
 
58
60
  Parameters
59
61
  ----------
60
- class_labels: Sequence[int]
62
+ class_labels: ArrayLike
61
63
  List of class labels for each image
62
- metadata: List[Dict]
63
- List of metadata factors for each image
64
+ metadata: Mapping[str, ArrayLike]
65
+ Dict of lists of metadata factors for each image
64
66
  num_neighbors: int, default 5
65
67
  Number of nearest neighbors to use for computing MI between discrete
66
68
  and continuous variables.
@@ -71,8 +73,8 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
71
73
  (num_factors+1) x (num_factors+1) estimate of mutual information
72
74
  between num_factors metadata factors and class label. Symmetry is enforced.
73
75
 
74
- Notes
75
- -----
76
+ Note
77
+ ----
76
78
  We use `mutual_info_classif` from sklearn since class label is categorical.
77
79
  `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
78
80
  seed. MI is computed differently for categorical and continuous variables, and
@@ -90,9 +92,9 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
90
92
  Return intra/interfactor balance (mutual information)
91
93
 
92
94
  >>> bal.factors
93
- array([[0.99999843, 0.03510422, 0.09725766],
94
- [0.03510422, 0.08433558, 0.15621459],
95
- [0.09725766, 0.15621459, 0.99999856]])
95
+ array([[0.99999843, 0.04133555, 0.09725766],
96
+ [0.04133555, 0.08433558, 0.1301489 ],
97
+ [0.09725766, 0.1301489 , 0.99999856]])
96
98
 
97
99
  Return classwise balance (mutual information) of factors with individual class_labels
98
100
 
@@ -17,7 +17,7 @@ from numpy.typing import ArrayLike, NDArray
17
17
  from scipy.sparse import coo_matrix
18
18
  from scipy.stats import mode
19
19
 
20
- from dataeval._internal.interop import to_numpy
20
+ from dataeval._internal.interop import as_numpy
21
21
  from dataeval._internal.metrics.utils import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
22
22
  from dataeval._internal.output import OutputMetadata, set_metadata
23
23
 
@@ -25,6 +25,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
25
25
  @dataclass(frozen=True)
26
26
  class BEROutput(OutputMetadata):
27
27
  """
28
+ Output class for :func:`ber` estimator metric
29
+
28
30
  Attributes
29
31
  ----------
30
32
  ber : float
@@ -145,7 +147,7 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
145
147
  BEROutput(ber=0.04, ber_lower=0.020416847668728033)
146
148
  """
147
149
  ber_fn = get_method(BER_FN_MAP, method)
148
- X = to_numpy(images)
149
- y = to_numpy(labels)
150
+ X = as_numpy(images)
151
+ y = as_numpy(labels)
150
152
  upper, lower = ber_fn(X, y, k) if method == "KNN" else ber_fn(X, y)
151
153
  return BEROutput(upper, lower)
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import math
2
4
  from dataclasses import dataclass
3
5
  from typing import Literal
@@ -14,6 +16,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
14
16
  @dataclass(frozen=True)
15
17
  class CoverageOutput(OutputMetadata):
16
18
  """
19
+ Output class for :func:`coverage` bias metric
20
+
17
21
  Attributes
18
22
  ----------
19
23
  indices : NDArray
@@ -3,13 +3,15 @@ This module contains the implementation of HP Divergence
3
3
  using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
4
4
  """
5
5
 
6
+ from __future__ import annotations
7
+
6
8
  from dataclasses import dataclass
7
9
  from typing import Literal
8
10
 
9
11
  import numpy as np
10
12
  from numpy.typing import ArrayLike, NDArray
11
13
 
12
- from dataeval._internal.interop import to_numpy
14
+ from dataeval._internal.interop import as_numpy
13
15
  from dataeval._internal.metrics.utils import compute_neighbors, get_method, minimum_spanning_tree
14
16
  from dataeval._internal.output import OutputMetadata, set_metadata
15
17
 
@@ -17,6 +19,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
17
19
  @dataclass(frozen=True)
18
20
  class DivergenceOutput(OutputMetadata):
19
21
  """
22
+ Output class for :func:`divergence` estimator metric
23
+
20
24
  Attributes
21
25
  ----------
22
26
  divergence : float
@@ -96,8 +100,8 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
96
100
  DivergenceOutput
97
101
  The divergence value (0.0..1.0) and the number of differing edges between the datasets
98
102
 
99
- Notes
100
- -----
103
+ Note
104
+ ----
101
105
  The divergence value indicates how similar the 2 datasets are
102
106
  with 0 indicating approximately identical data distributions.
103
107
 
@@ -123,8 +127,8 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
123
127
  DivergenceOutput(divergence=0.28, errors=36.0)
124
128
  """
125
129
  div_fn = get_method(DIVERGENCE_FN_MAP, method)
126
- a = to_numpy(data_a)
127
- b = to_numpy(data_b)
130
+ a = as_numpy(data_a)
131
+ b = as_numpy(data_b)
128
132
  N = a.shape[0]
129
133
  M = b.shape[0]
130
134
 
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from dataclasses import dataclass
4
- from typing import Literal, Sequence
4
+ from typing import Literal, Mapping
5
5
 
6
6
  import numpy as np
7
- from numpy.typing import NDArray
7
+ from numpy.typing import ArrayLike, NDArray
8
8
 
9
9
  from dataeval._internal.metrics.utils import entropy, get_counts, get_method, get_num_bins, preprocess_metadata
10
10
  from dataeval._internal.output import OutputMetadata, set_metadata
@@ -13,6 +13,8 @@ from dataeval._internal.output import OutputMetadata, set_metadata
13
13
  @dataclass(frozen=True)
14
14
  class DiversityOutput(OutputMetadata):
15
15
  """
16
+ Output class for :func:`diversity` bias metric
17
+
16
18
  Attributes
17
19
  ----------
18
20
  diversity_index : NDArray[np.float64]
@@ -52,8 +54,8 @@ def diversity_shannon(
52
54
  subset_mask: NDArray[np.bool_] | None
53
55
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
54
56
 
55
- Notes
56
- -----
57
+ Note
58
+ ----
57
59
  For continuous variables, histogram bins are chosen automatically. See `numpy.histogram` for details.
58
60
 
59
61
  Returns
@@ -103,8 +105,8 @@ def diversity_simpson(
103
105
  subset_mask: NDArray[np.bool_] | None
104
106
  Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
105
107
 
106
- Notes
107
- -----
108
+ Note
109
+ ----
108
110
  For continuous variables, histogram bins are chosen automatically. See
109
111
  numpy.histogram for details.
110
112
  If there is only one category, the diversity index takes a value of 0.
@@ -142,7 +144,7 @@ DIVERSITY_FN_MAP = {"simpson": diversity_simpson, "shannon": diversity_shannon}
142
144
 
143
145
  @set_metadata("dataeval.metrics")
144
146
  def diversity(
145
- class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
147
+ class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
146
148
  ) -> DiversityOutput:
147
149
  """
148
150
  Compute diversity and classwise diversity for discrete/categorical variables and, through standard
@@ -155,15 +157,15 @@ def diversity(
155
157
 
156
158
  Parameters
157
159
  ----------
158
- class_labels: Sequence[int]
160
+ class_labels: ArrayLike
159
161
  List of class labels for each image
160
- metadata: List[Dict]
161
- List of metadata factors for each image
162
+ metadata: Mapping[str, ArrayLike]
163
+ Dict of list of metadata factors for each image
162
164
  method: Literal["shannon", "simpson"], default "simpson"
163
165
  Indicates which diversity index should be computed
164
166
 
165
- Notes
166
- -----
167
+ Note
168
+ ----
167
169
  - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
168
170
  - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
169
171
  - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.