dataeval-0.86.9-py3-none-any.whl → dataeval-0.87.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. dataeval/__init__.py +1 -1
  2. dataeval/_version.py +2 -2
  3. dataeval/config.py +4 -19
  4. dataeval/data/_metadata.py +56 -27
  5. dataeval/data/_split.py +1 -1
  6. dataeval/data/selections/_classbalance.py +4 -3
  7. dataeval/data/selections/_classfilter.py +5 -5
  8. dataeval/data/selections/_indices.py +2 -2
  9. dataeval/data/selections/_prioritize.py +249 -29
  10. dataeval/data/selections/_reverse.py +1 -1
  11. dataeval/data/selections/_shuffle.py +2 -2
  12. dataeval/detectors/ood/__init__.py +2 -1
  13. dataeval/detectors/ood/base.py +38 -1
  14. dataeval/detectors/ood/knn.py +95 -0
  15. dataeval/metrics/bias/_balance.py +28 -21
  16. dataeval/metrics/bias/_diversity.py +4 -4
  17. dataeval/metrics/bias/_parity.py +2 -2
  18. dataeval/metrics/stats/_hashstats.py +19 -2
  19. dataeval/outputs/_workflows.py +20 -7
  20. dataeval/typing.py +14 -2
  21. dataeval/utils/__init__.py +2 -2
  22. dataeval/utils/_bin.py +7 -6
  23. dataeval/utils/data/__init__.py +2 -0
  24. dataeval/utils/data/_dataset.py +13 -6
  25. dataeval/utils/data/_validate.py +169 -0
  26. {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/METADATA +5 -17
  27. {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/RECORD +29 -39
  28. dataeval/utils/datasets/__init__.py +0 -21
  29. dataeval/utils/datasets/_antiuav.py +0 -189
  30. dataeval/utils/datasets/_base.py +0 -266
  31. dataeval/utils/datasets/_cifar10.py +0 -201
  32. dataeval/utils/datasets/_fileio.py +0 -142
  33. dataeval/utils/datasets/_milco.py +0 -197
  34. dataeval/utils/datasets/_mixin.py +0 -54
  35. dataeval/utils/datasets/_mnist.py +0 -202
  36. dataeval/utils/datasets/_seadrone.py +0 -512
  37. dataeval/utils/datasets/_ships.py +0 -144
  38. dataeval/utils/datasets/_types.py +0 -48
  39. dataeval/utils/datasets/_voc.py +0 -583
  40. {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/WHEEL +0 -0
  41. /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.87.0.dist-info/licenses/LICENSE +0 -0
dataeval/__init__.py CHANGED
@@ -9,7 +9,7 @@ from __future__ import annotations
9
9
 
10
10
  try:
11
11
  from ._version import __version__
12
- except ImportError:
12
+ except ImportError: # pragma: no cover
13
13
  __version__ = "unknown"
14
14
 
15
15
  # Strongly type for pyright
dataeval/_version.py CHANGED
@@ -17,5 +17,5 @@ __version__: str
17
17
  __version_tuple__: VERSION_TUPLE
18
18
  version_tuple: VERSION_TUPLE
19
19
 
20
- __version__ = version = '0.86.9'
21
- __version_tuple__ = version_tuple = (0, 86, 9)
20
+ __version__ = version = '0.87.0'
21
+ __version_tuple__ = version_tuple = (0, 87, 0)
dataeval/config.py CHANGED
@@ -4,19 +4,15 @@ Global configuration settings for DataEval.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes", "DeviceLike"]
7
+ __all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"]
8
8
 
9
- import sys
10
- from typing import Any, Union
11
-
12
- if sys.version_info >= (3, 10):
13
- from typing import TypeAlias
14
- else:
15
- from typing_extensions import TypeAlias
9
+ from typing import Any
16
10
 
17
11
  import numpy as np
18
12
  import torch
19
13
 
14
+ from dataeval.typing import DeviceLike
15
+
20
16
  ### GLOBALS ###
21
17
 
22
18
  _device: torch.device | None = None
@@ -27,17 +23,6 @@ _seed: int | None = None
27
23
 
28
24
  EPSILON = 1e-12
29
25
 
30
- ### TYPES ###
31
-
32
- DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
33
- """
34
- Type alias for types that are acceptable for specifying a torch.device.
35
-
36
- See Also
37
- --------
38
- `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
39
- """
40
-
41
26
  ### FUNCS ###
42
27
 
43
28
 
dataeval/data/_metadata.py CHANGED
@@ -16,18 +16,31 @@ from dataeval.typing import (
16
16
  ObjectDetectionTarget,
17
17
  )
18
18
  from dataeval.utils._array import as_numpy
19
- from dataeval.utils._bin import bin_data, digitize_data
19
+ from dataeval.utils._bin import bin_data, digitize_data, is_continuous
20
20
  from dataeval.utils.data.metadata import merge
21
21
 
22
22
 
23
23
  def _binned(name: str) -> str:
24
- return f"{name}[]"
24
+ return f"{name}"
25
+
26
+
27
+ def _digitized(name: str) -> str:
28
+ return f"{name}#"
25
29
 
26
30
 
27
31
  @dataclass
28
32
  class FactorInfo:
29
- factor_type: Literal["categorical", "continuous", "discrete"] | None = None
30
- discretized_col: str | None = None
33
+ factor_type: Literal["categorical", "continuous", "discrete"]
34
+ is_binned: bool = False
35
+ is_digitized: bool = False
36
+
37
+
38
+ def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
39
+ if binned and info.is_binned:
40
+ return _binned(name)
41
+ if info.is_digitized:
42
+ return _digitized(name)
43
+ return name
31
44
 
32
45
 
33
46
  class Metadata:
@@ -60,7 +73,7 @@ class Metadata:
60
73
  self._class_labels: NDArray[np.intp]
61
74
  self._class_names: list[str]
62
75
  self._image_indices: NDArray[np.intp]
63
- self._factors: dict[str, FactorInfo]
76
+ self._factors: dict[str, FactorInfo | None]
64
77
  self._dropped_factors: dict[str, list[str]]
65
78
  self._dataframe: pl.DataFrame
66
79
  self._raw: Sequence[Mapping[str, Any]]
@@ -146,14 +159,27 @@ class Metadata:
146
159
  return self._dropped_factors
147
160
 
148
161
  @property
149
- def discretized_data(self) -> NDArray[np.int64]:
150
- """Factor data with continuous data discretized."""
162
+ def digitized_data(self) -> NDArray[np.int64]:
163
+ """Factor data with digitized categorical data."""
164
+ if not self.factor_names:
165
+ return np.array([], dtype=np.int64)
166
+
167
+ self._bin()
168
+ return (
169
+ self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
170
+ .to_numpy()
171
+ .astype(np.int64)
172
+ )
173
+
174
+ @property
175
+ def binned_data(self) -> NDArray[np.int64]:
176
+ """Factor data with binned continuous data."""
151
177
  if not self.factor_names:
152
178
  return np.array([], dtype=np.int64)
153
179
 
154
180
  self._bin()
155
181
  return (
156
- self.dataframe.select([info.discretized_col or name for name, info in self.factor_info.items()])
182
+ self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
157
183
  .to_numpy()
158
184
  .astype(np.int64)
159
185
  )
@@ -168,7 +194,7 @@ class Metadata:
168
194
  def factor_info(self) -> Mapping[str, FactorInfo]:
169
195
  """Factor types of the metadata."""
170
196
  self._bin()
171
- return dict(filter(self._filter, self._factors.items()))
197
+ return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
172
198
 
173
199
  @property
174
200
  def factor_data(self) -> NDArray[Any]:
@@ -194,7 +220,7 @@ class Metadata:
194
220
  @property
195
221
  def image_indices(self) -> NDArray[np.intp]:
196
222
  """Indices of images as a NumPy array."""
197
- self._bin()
223
+ self._structure()
198
224
  return self._image_indices
199
225
 
200
226
  @property
@@ -212,7 +238,7 @@ class Metadata:
212
238
  columns = self._dataframe.columns
213
239
  for col in (col for col in cols or columns if _binned(col) in columns):
214
240
  self._dataframe.drop_in_place(_binned(col))
215
- self._factors[col] = FactorInfo()
241
+ self._factors[col] = None
216
242
  self._is_binned = False
217
243
 
218
244
  def _structure(self) -> None:
@@ -277,7 +303,7 @@ class Metadata:
277
303
  self._class_labels = labels
278
304
  self._class_names = list(index2label.values())
279
305
  self._image_indices = target_dict["image_index"]
280
- self._factors = dict.fromkeys(factor_dict, FactorInfo())
306
+ self._factors = dict.fromkeys(factor_dict, None)
281
307
  self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
282
308
  self._dropped_factors = merged[1]
283
309
  self._is_structured = True
@@ -303,24 +329,25 @@ class Metadata:
303
329
  )
304
330
 
305
331
  column_set = set(df.columns)
306
- for col in (col for col in self.factor_names if _binned(col) not in column_set):
332
+ for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
307
333
  # Get data as numpy array for processing
308
334
  data = df[col].to_numpy()
309
- col_dz = _binned(col)
310
335
  if col in factor_bins:
311
336
  # User provided binning
312
337
  bins = factor_bins[col]
313
- df = df.with_columns(pl.Series(name=col_dz, values=digitize_data(data, bins).astype(np.int64)))
314
- factor_info[col] = FactorInfo("continuous", col_dz)
338
+ col_bn = _binned(col)
339
+ df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
340
+ factor_info[col] = FactorInfo("continuous", is_binned=True)
315
341
  else:
316
342
  # Check if data is numeric
317
- unique, ordinal = np.unique(data, return_inverse=True)
318
- if not np.issubdtype(data.dtype, np.number) or unique.size <= max(20, data.size * 0.01):
319
- # Non-numeric data or small number of unique values - convert to categorical
320
- df = df.with_columns(pl.Series(name=col_dz, values=ordinal.astype(np.int64)))
321
- factor_info[col] = FactorInfo("categorical", col_dz)
322
- elif data.dtype == float:
323
- # Many unique values - discretize by binning
343
+ _, ordinal = np.unique(data, return_inverse=True)
344
+ if not np.issubdtype(data.dtype, np.number):
345
+ # Non-numeric data - convert to categorical
346
+ col_dg = _digitized(col)
347
+ df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
348
+ factor_info[col] = FactorInfo("categorical", is_digitized=True)
349
+ elif is_continuous(data, self.image_indices):
350
+ # Continuous values - discretize by binning
324
351
  warnings.warn(
325
352
  f"A user defined binning was not provided for {col}. "
326
353
  f"Using the {self.auto_bin_method} method to discretize the data. "
@@ -330,10 +357,12 @@ class Metadata:
330
357
  )
331
358
  # Create binned version
332
359
  binned_data = bin_data(data, self.auto_bin_method)
333
- df = df.with_columns(pl.Series(name=col_dz, values=binned_data.astype(np.int64)))
334
- factor_info[col] = FactorInfo("continuous", col_dz)
360
+ col_bn = _binned(col)
361
+ df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
362
+ factor_info[col] = FactorInfo("continuous", is_binned=True)
335
363
  else:
336
- factor_info[col] = FactorInfo("discrete", col)
364
+ # Non-continuous values - treat as discrete
365
+ factor_info[col] = FactorInfo("discrete")
337
366
 
338
367
  # Store the results
339
368
  self._dataframe = df
@@ -367,7 +396,7 @@ class Metadata:
367
396
  for k, v in factors.items():
368
397
  data = as_numpy(v)[self.image_indices]
369
398
  new_columns.append(pl.Series(name=k, values=data))
370
- self._factors[k] = FactorInfo()
399
+ self._factors[k] = None
371
400
 
372
401
  if new_columns:
373
402
  self._dataframe = self.dataframe.with_columns(new_columns)
dataeval/data/_split.py CHANGED
@@ -208,7 +208,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
208
208
 
209
209
  split_set = set(split_on)
210
210
  indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
211
- binned_features = metadata.discretized_data[:, indices]
211
+ binned_features = metadata.binned_data[:, indices]
212
212
  return np.unique(binned_features, axis=0, return_inverse=True)[1]
213
213
 
214
214
 
dataeval/data/selections/_classbalance.py CHANGED
@@ -11,12 +11,13 @@ from dataeval.utils._array import as_numpy
11
11
 
12
12
  class ClassBalance(Selection[ImageClassificationDatum]):
13
13
  """
14
- Balance the dataset by class.
14
+ Select indices of a dataset that will equalize the occurrences of all classes.
15
15
 
16
16
  Note
17
17
  ----
18
- The total number of instances of each class will be equalized which may result
18
+ 1. The total number of instances of each class will be equalized which may result
19
19
  in a lower total number of instances than specified by the selection limit.
20
+ 2. This selection currently only supports classification tasks
20
21
  """
21
22
 
22
23
  stage = SelectionStage.FILTER
@@ -29,7 +30,7 @@ class ClassBalance(Selection[ImageClassificationDatum]):
29
30
  label = int(np.argmax(as_numpy(target)))
30
31
  else:
31
32
  # ObjectDetectionTarget and SegmentationTarget not supported yet
32
- raise TypeError("ClassFilter only supports classification targets as an array of confidence scores.")
33
+ raise TypeError("ClassBalance only supports classification targets as an array of class probabilities.")
33
34
  class_indices.setdefault(label, []).append(i)
34
35
 
35
36
  per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
dataeval/data/selections/_classfilter.py CHANGED
@@ -14,12 +14,12 @@ from dataeval.utils._array import as_numpy
14
14
 
15
15
  class ClassFilter(Selection[Any]):
16
16
  """
17
- Filter the dataset by class.
17
+ Select dataset indices based on class labels, keeping only those present in `classes`.
18
18
 
19
19
  Parameters
20
20
  ----------
21
21
  classes : Sequence[int]
22
- The classes to filter by.
22
+ The sequence of classes to keep.
23
23
  filter_detections : bool, default True
24
24
  Whether to filter detections from targets for object detection and segmentation datasets.
25
25
  """
@@ -41,16 +41,16 @@ class ClassFilter(Selection[Any]):
41
41
  if isinstance(target, Array):
42
42
  # Get the label for the image
43
43
  label = int(np.argmax(as_numpy(target)))
44
- # Check to see if the label is in the classes to filter for
44
+ # Check to see if the label is in the classes to keep
45
45
  if label in self.classes:
46
- # Include the image
46
+ # Include the image index
47
47
  selection.append(idx)
48
48
  elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
49
49
  # Get the set of labels from the target
50
50
  labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
51
51
  # Check to see if any labels are in the classes to filter for
52
52
  if labels.intersection(self.classes):
53
- # Include the image
53
+ # Include the image index
54
54
  selection.append(idx)
55
55
  # If we are filtering out other labels and there are other labels, add a subselection filter
56
56
  if self.filter_detections and labels.difference(self.classes):
dataeval/data/selections/_indices.py CHANGED
@@ -9,12 +9,12 @@ from dataeval.data._selection import Select, Selection, SelectionStage
9
9
 
10
10
  class Indices(Selection[Any]):
11
11
  """
12
- Selects specific indices from the dataset.
12
+ Selects only the given indices from the dataset.
13
13
 
14
14
  Parameters
15
15
  ----------
16
16
  indices : Sequence[int]
17
- The indices to select from the dataset.
17
+ The specific indices to select.
18
18
  """
19
19
 
20
20
  stage = SelectionStage.FILTER