dataeval 0.74.1__py3-none-any.whl → 0.74.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +21 -2
- dataeval/interop.py +14 -2
- dataeval/logging.py +16 -0
- dataeval/output.py +1 -1
- dataeval/utils/split_dataset.py +306 -279
- {dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/METADATA +1 -1
- {dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/RECORD +9 -8
- {dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/WHEEL +0 -0
dataeval/__init__.py CHANGED

@@ -1,7 +1,26 @@
-__version__ = "0.74.1"
+__version__ = "0.74.2"

+import logging
 from importlib.util import find_spec

+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+
+def log_stderr(level: int = logging.DEBUG) -> None:
+    """
+    Helper for quickly adding a StreamHandler to the logger. Useful for
+    debugging.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+    logger.addHandler(handler)
+    logger.setLevel(level)
+    logger.debug("Added a stderr logging handler to logger: %s", __name__)
+
+
 _IS_TORCH_AVAILABLE = find_spec("torch") is not None
 _IS_TORCHVISION_AVAILABLE = find_spec("torchvision") is not None

@@ -9,7 +28,7 @@ del find_spec

 from dataeval import detectors, metrics  # noqa: E402

-__all__ = ["detectors", "metrics"]
+__all__ = ["log_stderr", "detectors", "metrics"]

 if _IS_TORCH_AVAILABLE:
     from dataeval import utils, workflows
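The new log_stderr helper simply attaches a formatted StreamHandler to the package logger and lowers its level. A minimal usage sketch; the INFO level here is an illustrative choice, not a default from this release:

import logging

import dataeval

# Route dataeval's log records at INFO and above to stderr while debugging
dataeval.log_stderr(logging.INFO)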
dataeval/interop.py CHANGED

@@ -1,23 +1,31 @@
 from __future__ import annotations

+from types import ModuleType
+
+from dataeval.logging import LogMessage
+
 __all__ = ["as_numpy", "to_numpy", "to_numpy_iter"]

+import logging
 from importlib import import_module
 from typing import Any, Iterable, Iterator

 import numpy as np
 from numpy.typing import ArrayLike, NDArray

+_logger = logging.getLogger(__name__)
+
 _MODULE_CACHE = {}


-def _try_import(module_name):
+def _try_import(module_name) -> ModuleType | None:
     if module_name in _MODULE_CACHE:
         return _MODULE_CACHE[module_name]

     try:
         module = import_module(module_name)
     except ImportError:  # pragma: no cover - covered by test_mindeps.py
+        _logger.log(logging.INFO, f"Unable to import {module_name}.")
         module = None

     _MODULE_CACHE[module_name] = module
@@ -40,12 +48,16 @@ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
     if array.__class__.__module__.startswith("tensorflow"):
         tf = _try_import("tensorflow")
         if tf and tf.is_tensor(array):
+            _logger.log(logging.INFO, "Converting Tensorflow array to NumPy array.")
             return array.numpy().copy() if copy else array.numpy()  # type: ignore

     if array.__class__.__module__.startswith("torch"):
         torch = _try_import("torch")
         if torch and isinstance(array, torch.Tensor):
-            return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
+            _logger.log(logging.INFO, "Converting PyTorch array to NumPy array.")
+            numpy = array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore
+            _logger.log(logging.DEBUG, LogMessage(lambda: f"{str(array)} -> {str(numpy)}"))
+            return numpy

     return np.array(array) if copy else np.asarray(array)

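With these changes to_numpy logs each framework conversion at INFO and defers the costly before/after repr to a DEBUG-level LogMessage. A sketch of the conversion path, assuming torch is installed alongside dataeval:

import numpy as np
import torch

from dataeval.interop import to_numpy

tensor = torch.arange(4, dtype=torch.float32)

copied = to_numpy(tensor)              # detach -> cpu -> numpy, copied by default
shared = to_numpy(tensor, copy=False)  # no copy; may share memory with the CPU tensor

assert isinstance(copied, np.ndarray) and isinstance(shared, np.ndarray)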
dataeval/logging.py ADDED

@@ -0,0 +1,16 @@
+from typing import Callable
+
+
+class LogMessage:
+    """
+    Deferred message callback for logging expensive messages.
+    """
+
+    def __init__(self, fn: Callable[..., str]):
+        self._fn = fn
+        self._str = None
+
+    def __str__(self) -> str:
+        if self._str is None:
+            self._str = self._fn()
+        return self._str
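LogMessage wraps a callable and only builds (and caches) the string when the logging machinery actually formats the record. A minimal sketch of the intended pattern; expensive_summary is a hypothetical stand-in for costly formatting work:

import logging

from dataeval.logging import LogMessage

logger = logging.getLogger("dataeval")


def expensive_summary() -> str:
    # stand-in for costly work such as stringifying a large array
    return "large intermediate result"


# The callable runs only if a handler actually formats this DEBUG record
logger.debug(LogMessage(expensive_summary))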
dataeval/output.py CHANGED

@@ -65,7 +65,7 @@ R = TypeVar("R", bound=Output)


 def set_metadata(fn: Callable[P, R] | None = None, *, state: list[str] | None = None) -> Callable[P, R]:
-    """Decorator to stamp
+    """Decorator to stamp Output classes with runtime metadata"""

     if fn is None:
         return partial(set_metadata, state=state)  # type: ignore
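set_metadata is applied bare to functions that return an Output subclass, as the split_dataset change below shows. A minimal sketch of that pattern; ExampleOutput and example_metric are hypothetical names used for illustration only:

from dataclasses import dataclass

from dataeval.output import Output, set_metadata


@dataclass(frozen=True)
class ExampleOutput(Output):
    # hypothetical Output subclass holding a single result field
    value: float


@set_metadata
def example_metric(x: float) -> ExampleOutput:
    return ExampleOutput(value=x * 2.0)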
dataeval/utils/split_dataset.py CHANGED

@@ -1,9 +1,13 @@
 from __future__ import annotations

-
+from dataclasses import dataclass
+
+from dataeval.output import Output, set_metadata
+
+__all__ = ["split_dataset", "SplitDatasetOutput"]

 import warnings
-from typing import Any
+from typing import Any, Iterator, NamedTuple, Protocol

 import numpy as np
 from numpy.typing import NDArray
@@ -13,128 +17,156 @@ from sklearn.model_selection import GroupKFold, KFold, StratifiedGroupKFold, Str
 from sklearn.utils.multiclass import type_of_target


-
-    """
-
+class TrainValSplit(NamedTuple):
+    """Tuple containing train and validation indices"""
+
+    train: NDArray[np.int_]
+    val: NDArray[np.int_]
+
+
+@dataclass(frozen=True)
+class SplitDatasetOutput(Output):
+    """Output class containing test indices and a list of TrainValSplits"""
+
+    test: NDArray[np.int_]
+    folds: list[TrainValSplit]
+

+class KFoldSplitter(Protocol):
+    """Protocol covering sklearn KFold variant splitters"""
+
+    def __init__(self, n_splits: int): ...
+    def split(self, X: Any, y: Any, groups: Any) -> Iterator[tuple[NDArray[Any], NDArray[Any]]]: ...
+
+
+KFOLD_GROUP_STRATIFIED_MAP: dict[tuple[bool, bool], type[KFoldSplitter]] = {
+    (False, False): KFold,
+    (False, True): StratifiedKFold,
+    (True, False): GroupKFold,
+    (True, True): StratifiedGroupKFold,
+}
+
+
+def calculate_validation_fraction(num_folds: int, test_frac: float, val_frac: float) -> float:
+    """
+    Calculate possible validation fraction based on the number of folds and test fraction.

    Parameters
    ----------
    num_folds : int
-        number of
-    test_frac : float
-
-    val_frac
-
-
+        number of train and validation cross-validation folds to generate
+    test_frac : float
+        The fraction of the data to extract for testing before folds are created
+    val_frac : float
+        The validation split will contain (val_frac * 100)% of any data not already allocated to the test set.
+        Only required if requesting a single [train, val] split.

    Raises
    ------
-    UnboundLocalError
-        Raised if more than one fold AND the fraction of data to be used for validation are
-        both requested. In this case, val_frac is ambiguous, since the validation fraction must be
-        by definition 1/num_folds
    ValueError
-
-        requested, we need to know how much of the data should be allocated for validation.
+        When number of folds requested is less than 1
    ValueError
-
+        When the test fraction is not within 0.0 and 1.0 inclusively
+    ValueError
+        When more than one fold and the validation fraction are both requested
+    ValueError
+        When number of folds equals one but the validation fraction is 0.0
+    ValueError
+        When the validation fraction is not within 0.0 and 1.0 inclusively

    Returns
    -------
-
-
+    float
+        The updated validation fraction of the remaining data after the testing fraction is removed
+    """
+    if num_folds < 1:
+        raise ValueError(f"Number of folds must be greater than or equal to 1, got {num_folds}")
+    if test_frac < 0.0 or test_frac > 1.0:
+        raise ValueError(f"test_frac out of bounds. Must be between 0.0 and 1.0, got {test_frac}")
+
+    # val base is a variable placeholder so val_frac can be ignored if num_folds != 1
+    val_base: float = 1.0
+    if num_folds == 1:
+        if val_frac == 0.0:
+            raise ValueError("If num_folds equals 1, must assign a value to val_frac")
+        if val_frac < 0.0 or val_frac > 1.0:
+            raise ValueError(f"val_frac out of bounds. Must be between 0.0 and 1.0, got {val_frac}")
+        val_base = val_frac
+    # num folds must be >1 in this case
+    elif val_frac != 0.0:
+        raise ValueError("Can only specify val_frac when num_folds equals 1")
+
+    # This value is mathematically bound between 0-1 inclusive
+    return val_base * (1.0 / num_folds) * (1.0 - test_frac)
+
+
+def _validate_labels(labels: NDArray[np.int_], total_partitions: int) -> None:
    """
-
-        raise ValueError("If specifying val_frac, num_folds must be None or 1")
-    if (num_folds == 1) and (val_frac is None):
-        raise ValueError("If num_folds is None or 1, must assign a value to val_frac")
-    t_frac = 0.0 if test_frac is None else test_frac
-    v_frac = 1.0 / num_folds * (1.0 - t_frac) if val_frac is None else val_frac * (1.0 - t_frac)
-    if (t_frac + v_frac) >= 1.0:
-        raise ValueError(f"val_frac + test_frac must be less that 1.0, currently {v_frac+t_frac}")
-    return t_frac, v_frac
-
-
-def check_labels(
-    labels: list[int] | NDArray[np.int_], total_partitions: int
-) -> tuple[NDArray[np.int_], NDArray[np.int_]]:
-    """Check to make sure there are more input data than the total number of partitions requested
-    Also converts labels to a numpy array, if it isn't already
+    Check to make sure there is more input data than the total number of partitions requested

    Parameters
    ----------
-    labels :
-
+    labels : np.ndarray of ints
+        All class labels from the input dataset
    total_partitions : int
-
+        Number of [train, val, test] splits requested

    Raises
    ------
-    IndexError
-        Raised if more partitions are requested than number of labels. This is exceedingly rare and
-        usually means you've specified some argument incorrectly.
    ValueError
-
-
-
-
-
-    -------
-    index : np.ndarray
-        Integer index generated based on the total number of labels
-    labels : np.ndarray
-        labels, converted to an ndarray if passed as a list.
+        When more partitions are requested than number of labels.
+    ValueError
+        When the labels are considered continuous by Scikit-Learn. This does not necessarily
+        mean that floats are not accepted as a label format. Rather, this implies that
+        there are too many unique values in the set relative to its cardinality.
    """
+
    if len(labels) <= total_partitions:
-        raise
-            Total number of labels must greater than the number of
-            Got {len(labels)} labels and {total_partitions} total train
-
-
+        raise ValueError(
+            "Total number of labels must be greater than the total number of partitions. "
+            f"Got {len(labels)} labels and {total_partitions} total [train, val, test] partitions."
+        )
+
    if type_of_target(labels) == "continuous":
-        raise ValueError("Detected continuous labels
-    index = np.arange(len(labels))
-    return index, labels
+        raise ValueError("Detected continuous labels. Labels must be discrete for proper stratification")


-def
+def is_stratifiable(labels: NDArray[np.int_], num_partitions: int) -> bool:
    """
-
-    comprehensive test, as factors such as grouping also affect the ability to stratify by label
+    Check if the dataset can be stratified by class label over the given number of partitions

    Parameters
    ----------
-    labels :
-
-
-        number of train
+    labels : NDArray of ints
+        All class labels of the input dataset
+    num_partitions : int
+        Total number of [train, val, test] splits requested
+
+    Returns
+    -------
+    bool
+        True if dataset can be stratified else False

    Warns
    -----
    UserWarning
-        Warns user if the dataset cannot be stratified due to the number of
+        Warns user if the dataset cannot be stratified due to the total number of [train, val, test]
        partitions exceeding the number of instances of the rarest class label.
-
-    Returns
-    -------
-    stratifiable : bool
-        True if dataset can be stratified according to the criteria above.
    """

-
-
-
-
-
-
-
-
-
-        return
+    # Get the minimum count of all labels
+    lowest_label_count = np.unique(labels, return_counts=True)[1].min()
+    if lowest_label_count < num_partitions:
+        warnings.warn(
+            f"Unable to stratify due to label frequency. The lowest label count ({lowest_label_count}) is fewer "
+            f"than the total number of partitions ({num_partitions}) requested.",
+            UserWarning,
+        )
+        return False
+    return True


-def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
+def is_groupable(group_ids: NDArray[np.int_], num_partitions: int) -> bool:
    """
    Warns user if the number of unique group_ids is incompatible with a grouped partition containing
    num_folds folds. If this is the case, returns groups=None, which tells the partitioner not to
@@ -142,34 +174,35 @@ def check_groups(group_ids: NDArray[np.int_], num_partitions: int) -> bool:

    Parameters
    ----------
-    group_ids :
-
+    group_ids : NDArray of ints
+        The id of the group each sample at the corresponding index belongs to
    num_partitions : int
-
+        Total number of train, val, and test splits requested
+
+    Returns
+    -------
+    bool
+        True if the dataset can be grouped by the given group ids else False

    Warns
    -----
    UserWarning
-        Warns if there are fewer groups than the
-        into num_partitions. The minimum is defined as the number of partitions requested plus one.
-
-    Returns
-    -------
-    groupable : bool
-        True if dataset can be grouped by the given group ids, given the criteria above.
+        Warns if there are fewer groups than the requested number of partitions plus one
    """

-    groupable = True
    num_unique_groups = len(np.unique(group_ids))
-
-    if num_unique_groups
-
-
-
-
-
-
-
+    # Cannot separate if only one group exists
+    if num_unique_groups == 1:
+        return False
+
+    if num_unique_groups < num_partitions:
+        warnings.warn(
+            f"Groups must be greater than num partitions. Got {num_unique_groups} and {num_partitions}. "
+            "Reverting to ungrouped partitioning",
+            UserWarning,
+        )
+        return False
+    return True


def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
@@ -179,14 +212,15 @@ def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:

    Parameters
    ----------
-    array :
+    array : NDArray
        continuous data to bin

    Returns
    -------
-
+    NDArray[int]:
+        bin numbers assigned by the kmeans best clusterer.
    """
-
+
    if array.ndim == 1:
        array = array.reshape([-1, 1])
    best_score = 0.60
@@ -203,28 +237,9 @@ def bin_kmeans(array: NDArray[Any]) -> NDArray[np.int_]:
    return bin_index


-def angle2xy(angles: NDArray[Any]) -> NDArray[Any]:
-    """
-    Converts angle measurements to xy coordinates on the unit circle. Needed for binning angle data.
-
-    Parameters
-    ----------
-    angles : np.ndarray
-        angle data in either radians or degrees
-
-    Returns
-    -------
-    xy : np.ndarray
-        Nx2 array of xy coordinates for each angle (can be radians or degrees)
-    """
-    is_radians = ((angles >= -np.pi) & (angles <= 2 * np.pi)).all()
-    radians = angles if is_radians else np.pi / 180 * angles
-    xy = np.stack([np.cos(radians), np.sin(radians)], axis=1)
-    return xy
-
-
def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples: int) -> NDArray[np.int_]:
-    """
+    """
+    Returns individual group numbers based on a subset of metadata defined by groupnames

    Parameters
    ----------
@@ -242,7 +257,7 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:

    Returns
    -------
-
+    np.ndarray
        group identifiers from metadata
    """
    features2group = {k: np.array(v) for k, v in metadata.items() if k in group_names}
@@ -250,11 +265,12 @@ def get_group_ids(metadata: dict[str, Any], group_names: list[str], num_samples:
        return np.zeros(num_samples, dtype=np.int_)
    for name, feature in features2group.items():
        if len(feature) != num_samples:
-            raise
-
+            raise ValueError(
+                f"Feature length does not match number of labels. "
+                f"Got {len(feature)} features and {num_samples} samples"
+            )
+
        if type_of_target(feature) == "continuous":
-            if ("ANGLE" in name.upper()) or ("AZIMUTH" in name.upper()):
-                feature = angle2xy(feature)
            features2group[name] = bin_kmeans(feature)
    binned_features = np.stack(list(features2group.values()), axis=1)
    _, group_ids = np.unique(binned_features, axis=0, return_inverse=True)
@@ -265,228 +281,239 @@ def make_splits(
    index: NDArray[np.int_],
    labels: NDArray[np.int_],
    n_folds: int,
-    groups: NDArray[np.int_] | None
-    stratified: bool
-) -> list[
-    """
+    groups: NDArray[np.int_] | None,
+    stratified: bool,
+) -> list[TrainValSplit]:
+    """
+    Split data into n_folds partitions of training and validation data.

    Parameters
    ----------
-    index :
-        index corresponding to each label
-    labels :
+    index : NDArray of ints
+        index corresponding to each label
+    labels : NDArray of ints
        classification labels
    n_folds : int
-        number
-    groups :
+        number of [train, val] folds
+    groups : NDArray of ints or None
        group index for grouped partitions. Grouped partitions are split such that no group id is
        present in both a training and validation split.
-    stratified : bool
-        If True, maintain dataset class balance within each train
+    stratified : bool
+        If True, maintain dataset class balance within each [train, val] split

    Returns
    -------
-    split_defs : list[
-
+    split_defs : list[TrainValSplit]
+        List of TrainValSplits, which specify train index, validation index, and the ratio of
        validation to all data.
    """
-    split_defs = []
-
-
-
+    split_defs: list[TrainValSplit] = []
+    n_labels = len(np.unique(labels))
+    splitter = KFOLD_GROUP_STRATIFIED_MAP[(groups is not None, stratified)](n_folds)
+    good = False
+    attempts = 0
+    while not good and attempts < 3:
+        attempts += 1
        splits = splitter.split(index, labels, groups)
-
-
-
-
-
-
-
-
-
-                "eval_frac": test_ratio,
-            }
-        )
+        split_defs.clear()
+        for train_idx, eval_idx in splits:
+            # test_ratio = len(eval_idx) / len(index)
+            t = np.atleast_1d(train_idx).astype(np.int_)
+            v = np.atleast_1d(eval_idx).astype(np.int_)
+            good = good or (len(np.unique(labels[t])) == n_labels and len(np.unique(labels[v])) == n_labels)
+            split_defs.append(TrainValSplit(t, v))
+        if not good and attempts == 3:
+            warnings.warn("Unable to create a good split definition, not all classes are represented in each split.")
    return split_defs


def find_best_split(
-    labels: NDArray[np.int_], split_defs: list[
-) ->
-    """
+    labels: NDArray[np.int_], split_defs: list[TrainValSplit], stratified: bool, split_frac: float
+) -> TrainValSplit:
+    """
+    Finds the split that most closely satisfies a criterion determined by the arguments passed.
    If stratified is True, returns the split whose class balance most closely resembles the overall
-    class balance. If false, returns the split with the size closest to the desired
+    class balance. If false, returns the split with the size closest to the desired split_frac

    Parameters
    ----------
    labels : np.ndarray
        Labels upon which splits are (optionally) stratified
-    split_defs : list
-
-        validation to all data.
+    split_defs : list of TrainValSplits
+        Specifies the train index, validation index
    stratified : bool
-        If True, maintain dataset class balance within each train
-
+        If True, maintain dataset class balance within each [train, val] split
+    split_frac : float
        Desired fraction of the dataset sequestered for evaluation

    Returns
    -------
-
-
-    eval_index : np.ndarray
-        indices of data partitioned for evaluation
+    TrainValSplit
+        Indices of data partitioned for training and evaluation
    """

-
-
-
-
-
+    # Minimization functions and helpers
+    def freq(arr: NDArray[Any], minlength: int = 0) -> NDArray[np.floating[Any]]:
+        counts = np.bincount(arr, minlength=minlength)
+        return counts / np.sum(counts)
+
+    def weight(arr: NDArray, class_freq: NDArray) -> np.float64:
+        return np.sum(np.abs(freq(arr, len(class_freq)) - class_freq))
+
+    def class_freq_diff(split: TrainValSplit) -> np.float64:
+        class_freq = freq(labels)
+        return weight(labels[split.train], class_freq) + weight(labels[split.val], class_freq)
+
+    def split_ratio(split: TrainValSplit) -> np.float64:
+        return np.float64(len(split.val) / (len(split.val) + len(split.train)))

+    def split_diff(split: TrainValSplit) -> np.float64:
+        return abs(split_frac - split_ratio(split))
+
+    def split_inv_diff(split: TrainValSplit) -> np.float64:
+        return abs(1 - split_frac - split_ratio(split))
+
+    # Selects minimization function based on inputs
    if stratified:
-
-
-
-        return best_split["train"], best_split["eval"]
-    elif eval_frac <= 2 / 3:
-        best_split = min(split_defs, key=lambda x: abs(eval_frac - x["eval_frac"]))  # type: ignore
-        return best_split["train"], best_split["eval"]
+        key_func = class_freq_diff
+    elif split_frac <= 2 / 3:
+        key_func = split_diff
    else:
-
-
+        key_func = split_inv_diff
+
+    return min(split_defs, key=key_func)


def single_split(
    index: NDArray[np.int_],
    labels: NDArray[np.int_],
-
+    split_frac: float,
    groups: NDArray[np.int_] | None = None,
    stratified: bool = False,
-) ->
-    """
+) -> TrainValSplit:
+    """
+    Handles the special case where only 1 partition of the data is desired (such as when
    generating the test holdout split). In this case, the desired fraction of the data to be
-    partitioned into the test data must be specified, and a single [train,
+    partitioned into the test data must be specified, and a single [train, val] pair is returned.

    Parameters
    ----------
-    index :
+    index : NDArray of ints
        Input Dataset index corresponding to each label
-    labels :
+    labels : NDArray of ints
        Labels upon which splits are (optionally) stratified
-
+    split_frac : float
        Fraction of incoming data to be set aside for evaluation
-    groups :
+    groups : NDArray of ints, Optional
        Group_ids (same shape as labels) for optional group partitioning
-    stratified : bool, default
+    stratified : bool, default False
        Generates stratified splits if true (recommended)

    Returns
    -------
-
-
-    eval_index : np.ndarray
-        indices of data partitioned for evaluation
+    TrainValSplit
+        Indices of data partitioned for training and evaluation
    """
-
-
-
-
-
-
-
-    n_folds = max(2, int(round(1 / (1 - eval_frac - 1e-6))))
+
+    _, label_counts = np.unique(labels, return_counts=True)
+    max_folds = label_counts.min()
+    min_folds = np.unique(groups).shape[0] if groups is not None else 2
+    divisor = split_frac + 1e-06 if split_frac <= 2 / 3 else 1 - split_frac - 1e-06
+    n_folds = round(min(max(1 / divisor, min_folds), max_folds))  # Clips value between min_folds and max_folds
+
    split_candidates = make_splits(index, labels, n_folds, groups, stratified)
-
-    return best_train, best_eval
+    return find_best_split(labels, split_candidates, stratified, split_frac)


+@set_metadata
def split_dataset(
    labels: list[int] | NDArray[np.int_],
    num_folds: int = 1,
    stratify: bool = False,
    split_on: list[str] | None = None,
    metadata: dict[str, Any] | None = None,
-    test_frac: float
-    val_frac: float
-) ->
-    """
-
+    test_frac: float = 0.0,
+    val_frac: float = 0.0,
+) -> SplitDatasetOutput:
+    """
+    Top level splitting function. Returns a dataclass containing a list of train and validation indices.
+    Indices for a test holdout may also be optionally included

    Parameters
    ----------
-    labels :
+    labels : list or NDArray of ints
        Classification Labels used to generate splits. Determines the size of the dataset
-    num_folds : int,
-        Number of train
-
-    stratify : bool, default=False
+    num_folds : int, default 1
+        Number of [train, val] folds. If equal to 1, val_frac must be greater than 0.0
+    stratify : bool, default False
        If true, dataset is split such that the class distribution of the entire dataset is
-        preserved within each train
-    split_on : list,
-        Keys of the metadata dictionary
+        preserved within each [train, val] partition, which is generally recommended.
+    split_on : list or None, default None
+        Keys of the metadata dictionary upon which to group the dataset.
        A grouped partition is divided such that no group is present within both the training and
-        validation set. Split_on groups should be selected to mitigate validation bias
-
-
-
-
-
-        set is created.
-    val_frac : float, optional
+        validation set. Split_on groups should be selected to mitigate validation bias
+    metadata : dict or None, default None
+        Dict containing data for potential dataset grouping. See split_on above
+    test_frac : float, default 0.0
+        Fraction of data to be optionally held out for test set
+    val_frac : float, default 0.0
        Fraction of training data to be set aside for validation in the case where a single
-        train
+        [train, val] split is desired
+
+    Returns
+    -------
+    split_defs : SplitDatasetOutput
+        Output class containing a list of indices of training
+        and validation data for each fold and optional test indices

    Raises
    ------
-
-        Raised if split_on is passed, but metadata is
-        defines the keys in which metadata dict must be indexed to determine the group index of the
-        data
+    TypeError
+        Raised if split_on is passed, but metadata is None or empty

-
-
-
-
-    ex.
-    {
-        "Fold_00": {
-            "train": [1,2,3,5,6,7,9,10,11],
-            "val": [0, 4, 8, 12]
-        },
-        "test": [13, 14, 15, 16]
-    }
+    Note
+    ----
+    When specifying groups and/or stratification, ratios for test and validation splits can vary
+    as the stratification and grouping take higher priority than the percentages
    """

-
+    val_frac = calculate_validation_fraction(num_folds, test_frac, val_frac)
    total_partitions = num_folds + 1 if test_frac else num_folds
-
-
+
+    if isinstance(labels, list):
+        labels = np.array(labels, dtype=np.int_)
+
+    label_length: int = len(labels)
+
+    _validate_labels(labels, total_partitions)
+    stratify &= is_stratifiable(labels, total_partitions)
+    groups = None
    if split_on:
-        if metadata is None:
-            raise
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if metadata is None or metadata == {}:
+            raise TypeError("If split_on is specified, metadata must also be provided, got None")
+        possible_groups = get_group_ids(metadata, split_on, label_length)
+        # Accounts for a test set that is 100 % of the data
+        group_partitions = total_partitions + 1 if val_frac else total_partitions
+        if is_groupable(possible_groups, group_partitions):
+            groups = possible_groups
+
+    test_indices: NDArray[np.int_]
+    index = np.arange(label_length)
+
+    tv_indices, test_indices = (
+        single_split(index=index, labels=labels, split_frac=test_frac, groups=groups, stratified=stratify)
+        if test_frac
+        else (index, np.array([], dtype=np.int_))
+    )
+
+    tv_labels = labels[tv_indices]
+    tv_groups = groups[tv_indices] if groups is not None else None
+
    if num_folds == 1:
-
-        split_defs["fold_0"] = {"train": tv_idx[train_idx].squeeze(), "val": tv_idx[val_idx].squeeze()}
+        tv_splits = [single_split(tv_indices, tv_labels, val_frac, tv_groups, stratify)]
    else:
-        tv_splits = make_splits(
-
-
-
-
-    return split_defs
+        tv_splits = make_splits(tv_indices, tv_labels, num_folds, tv_groups, stratify)
+
+    folds: list[TrainValSplit] = [TrainValSplit(tv_indices[split.train], tv_indices[split.val]) for split in tv_splits]
+
+    return SplitDatasetOutput(test_indices, folds)
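The reworked split_dataset returns a SplitDatasetOutput (test indices plus a list of TrainValSplit folds) rather than a nested dictionary. A sketch of the new call shape with made-up labels:

import numpy as np

from dataeval.utils.split_dataset import split_dataset

# 20 samples across 2 classes; hold out 25% for test and build 3 stratified folds
labels = np.array([0, 1] * 10)
splits = split_dataset(labels, num_folds=3, stratify=True, test_frac=0.25)

print(splits.test)             # indices reserved for the optional test holdout
for fold in splits.folds:      # each fold is a TrainValSplit(train, val)
    print(len(fold.train), len(fold.val))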
{dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.74.1
+Version: 0.74.2
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
{dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/RECORD CHANGED

@@ -1,4 +1,4 @@
-dataeval/__init__.py,sha256=
+dataeval/__init__.py,sha256=w_On8sJ5o_f8PboMo6LLErdFSqDAQ1Jg_e0mcp-5FRU,959
 dataeval/detectors/__init__.py,sha256=Y-0bbyWyuMvZU80bCx6WPt3IV_r2hu9ymzpA8uzMqoI,206
 dataeval/detectors/drift/__init__.py,sha256=BSXm21y7cAawHep-ZldCJ5HOvzYjPzYGKGrmoEs3i0E,737
 dataeval/detectors/drift/base.py,sha256=QDGHMu1WADD-38MEIOwjQMEQM3DE7B0yFHO3hsMbV-E,14481
@@ -20,7 +20,8 @@ dataeval/detectors/ood/base_torch.py,sha256=yFbSfQsBMwZeVf8mrixmkZYBGChhV5oAHtkg
 dataeval/detectors/ood/metadata_ks_compare.py,sha256=LNDNWGEDKTW8_-djgmK53sn9EZzzXq1Sgwc47k0QI-Y,5380
 dataeval/detectors/ood/metadata_least_likely.py,sha256=nxMCXUOjOfWHDTGT2SLE7OYBCydRq8zHLd8t17k7hMM,5193
 dataeval/detectors/ood/metadata_ood_mi.py,sha256=KLay2BmgHrStBV92VpIs_B1yEfQKllsMTgzOQEng01I,4065
-dataeval/interop.py,sha256=
+dataeval/interop.py,sha256=5lACbR7bZYGCagiwbXzAWvWeHRj8kWBmsTC9oEjFh78,2249
+dataeval/logging.py,sha256=uGxXPqGpn5guQjuHtm25rzILaz7nCQUsy2o7tFo91OI,343
 dataeval/metrics/__init__.py,sha256=fPBNLd-T6mCErZBBJrxWmXIL0jCk7fNUYIcNEBkMa80,238
 dataeval/metrics/bias/__init__.py,sha256=dYiPHenS8J7pgRMMW2jNkTBmTbPoYTxT04fZu9PFats,747
 dataeval/metrics/bias/balance.py,sha256=_TZEe17AT-qOvPp-QFrQfTqNwh8uVVCYjC4Sv6JBx9o,9118
@@ -42,14 +43,14 @@ dataeval/metrics/stats/hashstats.py,sha256=vxw_K74EJM9CZy-EV617vdrysFO8nEspVWqIY
 dataeval/metrics/stats/labelstats.py,sha256=K0hJTphMe7htSjyss8GPtKDiHepTuU60_hX0xRA-uAg,4096
 dataeval/metrics/stats/pixelstats.py,sha256=2zr9i3GLNx1i_SCtbfdtZNxXBEc_9wCe4qDpmXLVbKY,4576
 dataeval/metrics/stats/visualstats.py,sha256=vLIC4sMo796axWl-4e4RzT33ll-_6ki54Dirn3V-EL8,4948
-dataeval/output.py,sha256=
+dataeval/output.py,sha256=hR5TJ67f7FgrZO9Du46aw-jvRpMjOimSgJSau4ZNK44,3565
 dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 dataeval/utils/__init__.py,sha256=z7HxSijjycey-rGdQkgVOdpvT0oO2pKAuT4uYyxYGMs,555
 dataeval/utils/gmm.py,sha256=YuLsJKsVWgH_wHr1u_hSRH5Yeexdj8exht8h99L7bLo,561
 dataeval/utils/image.py,sha256=KgC_1nW__nGN5q6bVZNvG4U_qIBdjcPATz9qe8f2XuA,1928
 dataeval/utils/metadata.py,sha256=0A--iru0zEmi044mKz5P35q69KrI30yoiRSlvs7TSdQ,9418
 dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
-dataeval/utils/split_dataset.py,sha256=
+dataeval/utils/split_dataset.py,sha256=KYIl2ueLN0BeBoEvbUP5FdwVcMYW_l-ES1nQf_zKpQA,18776
 dataeval/utils/torch/__init__.py,sha256=lpkqfgyARUxgrV94cZESQv8PIP2p-UnwItZ_wIr0XzQ,675
 dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
 dataeval/utils/torch/datasets.py,sha256=10elNgLuH_FDX_CHE3y2Z215JN4-PQovQm5brcIJOeM,15021
@@ -59,7 +60,7 @@ dataeval/utils/torch/trainer.py,sha256=8BEXr6xtk-CHJTcNxOBnWgkFWfJUAiBy28cEdBhLM
 dataeval/utils/torch/utils.py,sha256=nWRcT6z6DbFVrL1RyxCOX3DPoCrv9G0B-VI_9LdGCQQ,5784
 dataeval/workflows/__init__.py,sha256=ef1MiVL5IuhlDXXbwsiAfafhnr7tD3TXF9GRusy9_O8,290
 dataeval/workflows/sufficiency.py,sha256=v9AV3BZT0NW-zD2VNIL_5aWspvoscrxRIUKcUdpy7HI,18540
-dataeval-0.74.
-dataeval-0.74.
-dataeval-0.74.
-dataeval-0.74.
+dataeval-0.74.2.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+dataeval-0.74.2.dist-info/METADATA,sha256=Rcnn55cRPZ2JZ1jn8YamuVDxmQVDKEItK4oqZyAYkHM,4298
+dataeval-0.74.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.74.2.dist-info/RECORD,,
{dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/LICENSE.txt: File without changes

{dataeval-0.74.1.dist-info → dataeval-0.74.2.dist-info}/WHEEL: File without changes