dataeval 0.86.9__py3-none-any.whl → 0.87.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +1 -1
- dataeval/_version.py +2 -2
- dataeval/config.py +4 -19
- dataeval/data/_metadata.py +56 -27
- dataeval/data/_split.py +1 -1
- dataeval/data/selections/_classbalance.py +4 -3
- dataeval/data/selections/_classfilter.py +5 -5
- dataeval/data/selections/_indices.py +2 -2
- dataeval/data/selections/_prioritize.py +249 -29
- dataeval/data/selections/_reverse.py +1 -1
- dataeval/data/selections/_shuffle.py +2 -2
- dataeval/detectors/ood/__init__.py +2 -1
- dataeval/detectors/ood/base.py +38 -1
- dataeval/detectors/ood/knn.py +95 -0
- dataeval/metrics/bias/_balance.py +28 -21
- dataeval/metrics/bias/_diversity.py +4 -4
- dataeval/metrics/bias/_parity.py +2 -2
- dataeval/metrics/stats/_hashstats.py +19 -2
- dataeval/outputs/_workflows.py +20 -7
- dataeval/typing.py +14 -2
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_bin.py +7 -6
- dataeval/utils/data/__init__.py +2 -0
- dataeval/utils/data/_dataset.py +13 -6
- dataeval/utils/data/_validate.py +169 -0
- {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/METADATA +5 -17
- {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/RECORD +29 -39
- dataeval/utils/datasets/__init__.py +0 -21
- dataeval/utils/datasets/_antiuav.py +0 -189
- dataeval/utils/datasets/_base.py +0 -266
- dataeval/utils/datasets/_cifar10.py +0 -201
- dataeval/utils/datasets/_fileio.py +0 -142
- dataeval/utils/datasets/_milco.py +0 -197
- dataeval/utils/datasets/_mixin.py +0 -54
- dataeval/utils/datasets/_mnist.py +0 -202
- dataeval/utils/datasets/_seadrone.py +0 -512
- dataeval/utils/datasets/_ships.py +0 -144
- dataeval/utils/datasets/_types.py +0 -48
- dataeval/utils/datasets/_voc.py +0 -583
- {dataeval-0.86.9.dist-info → dataeval-0.87.0.dist-info}/WHEEL +0 -0
- /dataeval-0.86.9.dist-info/licenses/LICENSE.txt → /dataeval-0.87.0.dist-info/licenses/LICENSE +0 -0
dataeval/__init__.py
CHANGED
dataeval/_version.py
CHANGED
dataeval/config.py
CHANGED
@@ -4,19 +4,15 @@ Global configuration settings for DataEval.
|
|
4
4
|
|
5
5
|
from __future__ import annotations
|
6
6
|
|
7
|
-
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"
|
7
|
+
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes", "use_max_processes"]
|
8
8
|
|
9
|
-
import sys
|
10
|
-
from typing import Any, Union
|
11
|
-
|
12
|
-
if sys.version_info >= (3, 10):
|
13
|
-
from typing import TypeAlias
|
14
|
-
else:
|
15
|
-
from typing_extensions import TypeAlias
|
9
|
+
from typing import Any
|
16
10
|
|
17
11
|
import numpy as np
|
18
12
|
import torch
|
19
13
|
|
14
|
+
from dataeval.typing import DeviceLike
|
15
|
+
|
20
16
|
### GLOBALS ###
|
21
17
|
|
22
18
|
_device: torch.device | None = None
|
@@ -27,17 +23,6 @@ _seed: int | None = None
|
|
27
23
|
|
28
24
|
EPSILON = 1e-12
|
29
25
|
|
30
|
-
### TYPES ###
|
31
|
-
|
32
|
-
DeviceLike: TypeAlias = Union[int, str, tuple[str, int], torch.device]
|
33
|
-
"""
|
34
|
-
Type alias for types that are acceptable for specifying a torch.device.
|
35
|
-
|
36
|
-
See Also
|
37
|
-
--------
|
38
|
-
`torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
|
39
|
-
"""
|
40
|
-
|
41
26
|
### FUNCS ###
|
42
27
|
|
43
28
|
|
dataeval/data/_metadata.py
CHANGED
@@ -16,18 +16,31 @@ from dataeval.typing import (
|
|
16
16
|
ObjectDetectionTarget,
|
17
17
|
)
|
18
18
|
from dataeval.utils._array import as_numpy
|
19
|
-
from dataeval.utils._bin import bin_data, digitize_data
|
19
|
+
from dataeval.utils._bin import bin_data, digitize_data, is_continuous
|
20
20
|
from dataeval.utils.data.metadata import merge
|
21
21
|
|
22
22
|
|
23
23
|
def _binned(name: str) -> str:
|
24
|
-
return f"{name}
|
24
|
+
return f"{name}↕"
|
25
|
+
|
26
|
+
|
27
|
+
def _digitized(name: str) -> str:
|
28
|
+
return f"{name}#"
|
25
29
|
|
26
30
|
|
27
31
|
@dataclass
|
28
32
|
class FactorInfo:
|
29
|
-
factor_type: Literal["categorical", "continuous", "discrete"]
|
30
|
-
|
33
|
+
factor_type: Literal["categorical", "continuous", "discrete"]
|
34
|
+
is_binned: bool = False
|
35
|
+
is_digitized: bool = False
|
36
|
+
|
37
|
+
|
38
|
+
def _to_col(name: str, info: FactorInfo, binned: bool = True) -> str:
|
39
|
+
if binned and info.is_binned:
|
40
|
+
return _binned(name)
|
41
|
+
if info.is_digitized:
|
42
|
+
return _digitized(name)
|
43
|
+
return name
|
31
44
|
|
32
45
|
|
33
46
|
class Metadata:
|
@@ -60,7 +73,7 @@ class Metadata:
|
|
60
73
|
self._class_labels: NDArray[np.intp]
|
61
74
|
self._class_names: list[str]
|
62
75
|
self._image_indices: NDArray[np.intp]
|
63
|
-
self._factors: dict[str, FactorInfo]
|
76
|
+
self._factors: dict[str, FactorInfo | None]
|
64
77
|
self._dropped_factors: dict[str, list[str]]
|
65
78
|
self._dataframe: pl.DataFrame
|
66
79
|
self._raw: Sequence[Mapping[str, Any]]
|
@@ -146,14 +159,27 @@ class Metadata:
|
|
146
159
|
return self._dropped_factors
|
147
160
|
|
148
161
|
@property
|
149
|
-
def
|
150
|
-
"""Factor data with
|
162
|
+
def digitized_data(self) -> NDArray[np.int64]:
|
163
|
+
"""Factor data with digitized categorical data."""
|
164
|
+
if not self.factor_names:
|
165
|
+
return np.array([], dtype=np.int64)
|
166
|
+
|
167
|
+
self._bin()
|
168
|
+
return (
|
169
|
+
self.dataframe.select([_to_col(k, v, False) for k, v in self.factor_info.items()])
|
170
|
+
.to_numpy()
|
171
|
+
.astype(np.int64)
|
172
|
+
)
|
173
|
+
|
174
|
+
@property
|
175
|
+
def binned_data(self) -> NDArray[np.int64]:
|
176
|
+
"""Factor data with binned continuous data."""
|
151
177
|
if not self.factor_names:
|
152
178
|
return np.array([], dtype=np.int64)
|
153
179
|
|
154
180
|
self._bin()
|
155
181
|
return (
|
156
|
-
self.dataframe.select([
|
182
|
+
self.dataframe.select([_to_col(k, v, True) for k, v in self.factor_info.items()])
|
157
183
|
.to_numpy()
|
158
184
|
.astype(np.int64)
|
159
185
|
)
|
@@ -168,7 +194,7 @@ class Metadata:
|
|
168
194
|
def factor_info(self) -> Mapping[str, FactorInfo]:
|
169
195
|
"""Factor types of the metadata."""
|
170
196
|
self._bin()
|
171
|
-
return dict(filter(self._filter, self._factors.items()))
|
197
|
+
return dict(filter(self._filter, ((k, v) for k, v in self._factors.items() if v is not None)))
|
172
198
|
|
173
199
|
@property
|
174
200
|
def factor_data(self) -> NDArray[Any]:
|
@@ -194,7 +220,7 @@ class Metadata:
|
|
194
220
|
@property
|
195
221
|
def image_indices(self) -> NDArray[np.intp]:
|
196
222
|
"""Indices of images as a NumPy array."""
|
197
|
-
self.
|
223
|
+
self._structure()
|
198
224
|
return self._image_indices
|
199
225
|
|
200
226
|
@property
|
@@ -212,7 +238,7 @@ class Metadata:
|
|
212
238
|
columns = self._dataframe.columns
|
213
239
|
for col in (col for col in cols or columns if _binned(col) in columns):
|
214
240
|
self._dataframe.drop_in_place(_binned(col))
|
215
|
-
self._factors[col] =
|
241
|
+
self._factors[col] = None
|
216
242
|
self._is_binned = False
|
217
243
|
|
218
244
|
def _structure(self) -> None:
|
@@ -277,7 +303,7 @@ class Metadata:
|
|
277
303
|
self._class_labels = labels
|
278
304
|
self._class_names = list(index2label.values())
|
279
305
|
self._image_indices = target_dict["image_index"]
|
280
|
-
self._factors = dict.fromkeys(factor_dict,
|
306
|
+
self._factors = dict.fromkeys(factor_dict, None)
|
281
307
|
self._dataframe = pl.DataFrame({**target_dict, **factor_dict})
|
282
308
|
self._dropped_factors = merged[1]
|
283
309
|
self._is_structured = True
|
@@ -303,24 +329,25 @@ class Metadata:
|
|
303
329
|
)
|
304
330
|
|
305
331
|
column_set = set(df.columns)
|
306
|
-
for col in (col for col in self.factor_names if _binned(col)
|
332
|
+
for col in (col for col in self.factor_names if not {_binned(col), _digitized(col)} & column_set):
|
307
333
|
# Get data as numpy array for processing
|
308
334
|
data = df[col].to_numpy()
|
309
|
-
col_dz = _binned(col)
|
310
335
|
if col in factor_bins:
|
311
336
|
# User provided binning
|
312
337
|
bins = factor_bins[col]
|
313
|
-
|
314
|
-
|
338
|
+
col_bn = _binned(col)
|
339
|
+
df = df.with_columns(pl.Series(name=col_bn, values=digitize_data(data, bins).astype(np.int64)))
|
340
|
+
factor_info[col] = FactorInfo("continuous", is_binned=True)
|
315
341
|
else:
|
316
342
|
# Check if data is numeric
|
317
|
-
|
318
|
-
if not np.issubdtype(data.dtype, np.number)
|
319
|
-
# Non-numeric data
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
343
|
+
_, ordinal = np.unique(data, return_inverse=True)
|
344
|
+
if not np.issubdtype(data.dtype, np.number):
|
345
|
+
# Non-numeric data - convert to categorical
|
346
|
+
col_dg = _digitized(col)
|
347
|
+
df = df.with_columns(pl.Series(name=col_dg, values=ordinal.astype(np.int64)))
|
348
|
+
factor_info[col] = FactorInfo("categorical", is_digitized=True)
|
349
|
+
elif is_continuous(data, self.image_indices):
|
350
|
+
# Continuous values - discretize by binning
|
324
351
|
warnings.warn(
|
325
352
|
f"A user defined binning was not provided for {col}. "
|
326
353
|
f"Using the {self.auto_bin_method} method to discretize the data. "
|
@@ -330,10 +357,12 @@ class Metadata:
|
|
330
357
|
)
|
331
358
|
# Create binned version
|
332
359
|
binned_data = bin_data(data, self.auto_bin_method)
|
333
|
-
|
334
|
-
|
360
|
+
col_bn = _binned(col)
|
361
|
+
df = df.with_columns(pl.Series(name=col_bn, values=binned_data.astype(np.int64)))
|
362
|
+
factor_info[col] = FactorInfo("continuous", is_binned=True)
|
335
363
|
else:
|
336
|
-
|
364
|
+
# Non-continuous values - treat as discrete
|
365
|
+
factor_info[col] = FactorInfo("discrete")
|
337
366
|
|
338
367
|
# Store the results
|
339
368
|
self._dataframe = df
|
@@ -367,7 +396,7 @@ class Metadata:
|
|
367
396
|
for k, v in factors.items():
|
368
397
|
data = as_numpy(v)[self.image_indices]
|
369
398
|
new_columns.append(pl.Series(name=k, values=data))
|
370
|
-
self._factors[k] =
|
399
|
+
self._factors[k] = None
|
371
400
|
|
372
401
|
if new_columns:
|
373
402
|
self._dataframe = self.dataframe.with_columns(new_columns)
|
dataeval/data/_split.py
CHANGED
@@ -208,7 +208,7 @@ def get_groups(metadata: Metadata, split_on: Sequence[str] | None) -> NDArray[np
|
|
208
208
|
|
209
209
|
split_set = set(split_on)
|
210
210
|
indices = [i for i, name in enumerate(metadata.factor_names) if name in split_set]
|
211
|
-
binned_features = metadata.
|
211
|
+
binned_features = metadata.binned_data[:, indices]
|
212
212
|
return np.unique(binned_features, axis=0, return_inverse=True)[1]
|
213
213
|
|
214
214
|
|
dataeval/data/selections/_classbalance.py
CHANGED
@@ -11,12 +11,13 @@ from dataeval.utils._array import as_numpy
|
|
11
11
|
|
12
12
|
class ClassBalance(Selection[ImageClassificationDatum]):
|
13
13
|
"""
|
14
|
-
|
14
|
+
Select indices of a dataset that will equalize the occurrences of all classes.
|
15
15
|
|
16
16
|
Note
|
17
17
|
----
|
18
|
-
The total number of instances of each class will be equalized which may result
|
18
|
+
1. The total number of instances of each class will be equalized which may result
|
19
19
|
in a lower total number of instances than specified by the selection limit.
|
20
|
+
2. This selection currently only supports classification tasks
|
20
21
|
"""
|
21
22
|
|
22
23
|
stage = SelectionStage.FILTER
|
@@ -29,7 +30,7 @@ class ClassBalance(Selection[ImageClassificationDatum]):
|
|
29
30
|
label = int(np.argmax(as_numpy(target)))
|
30
31
|
else:
|
31
32
|
# ObjectDetectionTarget and SegmentationTarget not supported yet
|
32
|
-
raise TypeError("
|
33
|
+
raise TypeError("ClassBalance only supports classification targets as an array of class probabilities.")
|
33
34
|
class_indices.setdefault(label, []).append(i)
|
34
35
|
|
35
36
|
per_class_limit = min(min(len(c) for c in class_indices.values()), dataset._size_limit // len(class_indices))
|
dataeval/data/selections/_classfilter.py
CHANGED
@@ -14,12 +14,12 @@ from dataeval.utils._array import as_numpy
|
|
14
14
|
|
15
15
|
class ClassFilter(Selection[Any]):
|
16
16
|
"""
|
17
|
-
|
17
|
+
Select dataset indices based on class labels, keeping only those present in `classes`.
|
18
18
|
|
19
19
|
Parameters
|
20
20
|
----------
|
21
21
|
classes : Sequence[int]
|
22
|
-
The classes to
|
22
|
+
The sequence of classes to keep.
|
23
23
|
filter_detections : bool, default True
|
24
24
|
Whether to filter detections from targets for object detection and segmentation datasets.
|
25
25
|
"""
|
@@ -41,16 +41,16 @@ class ClassFilter(Selection[Any]):
|
|
41
41
|
if isinstance(target, Array):
|
42
42
|
# Get the label for the image
|
43
43
|
label = int(np.argmax(as_numpy(target)))
|
44
|
-
# Check to see if the label is in the classes to
|
44
|
+
# Check to see if the label is in the classes to keep
|
45
45
|
if label in self.classes:
|
46
|
-
# Include the image
|
46
|
+
# Include the image index
|
47
47
|
selection.append(idx)
|
48
48
|
elif isinstance(target, (ObjectDetectionTarget, SegmentationTarget)):
|
49
49
|
# Get the set of labels from the target
|
50
50
|
labels = set(target.labels if isinstance(target.labels, Iterable) else [target.labels])
|
51
51
|
# Check to see if any labels are in the classes to filter for
|
52
52
|
if labels.intersection(self.classes):
|
53
|
-
# Include the image
|
53
|
+
# Include the image index
|
54
54
|
selection.append(idx)
|
55
55
|
# If we are filtering out other labels and there are other labels, add a subselection filter
|
56
56
|
if self.filter_detections and labels.difference(self.classes):
|
dataeval/data/selections/_indices.py
CHANGED
@@ -9,12 +9,12 @@ from dataeval.data._selection import Select, Selection, SelectionStage
|
|
9
9
|
|
10
10
|
class Indices(Selection[Any]):
|
11
11
|
"""
|
12
|
-
Selects
|
12
|
+
Selects only the given indices from the dataset.
|
13
13
|
|
14
14
|
Parameters
|
15
15
|
----------
|
16
16
|
indices : Sequence[int]
|
17
|
-
The indices to select
|
17
|
+
The specific indices to select.
|
18
18
|
"""
|
19
19
|
|
20
20
|
stage = SelectionStage.FILTER
|