dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl
This diff shows the published contents of two package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- dataeval/__init__.py +27 -23
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +3 -2
- dataeval/detectors/drift/mmd.py +9 -7
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +5 -4
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +5 -9
- dataeval/detectors/linters/duplicates.py +10 -14
- dataeval/detectors/linters/outliers.py +100 -5
- dataeval/detectors/ood/__init__.py +4 -11
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +47 -160
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +7 -6
- dataeval/{logging.py → log.py} +2 -0
- dataeval/metrics/__init__.py +3 -3
- dataeval/metrics/bias/__init__.py +10 -13
- dataeval/metrics/bias/balance.py +13 -11
- dataeval/metrics/bias/coverage.py +53 -5
- dataeval/metrics/bias/diversity.py +56 -24
- dataeval/metrics/bias/parity.py +20 -17
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +7 -4
- dataeval/metrics/estimators/divergence.py +4 -4
- dataeval/metrics/estimators/uap.py +4 -4
- dataeval/metrics/stats/__init__.py +19 -19
- dataeval/metrics/stats/base.py +28 -12
- dataeval/metrics/stats/boxratiostats.py +13 -14
- dataeval/metrics/stats/datasetstats.py +49 -20
- dataeval/metrics/stats/dimensionstats.py +8 -8
- dataeval/metrics/stats/hashstats.py +14 -10
- dataeval/metrics/stats/labelstats.py +94 -11
- dataeval/metrics/stats/pixelstats.py +11 -14
- dataeval/metrics/stats/visualstats.py +10 -13
- dataeval/output.py +23 -14
- dataeval/utils/__init__.py +5 -14
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +317 -14
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +18 -8
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
- dataeval-0.76.0.dist-info/METADATA +137 -0
- dataeval-0.76.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval-0.74.2.dist-info/METADATA +0 -120
- dataeval-0.74.2.dist-info/RECORD +0 -66
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
dataeval/utils/torch/{utils.py → internal.py}
RENAMED
@@ -1,70 +1,15 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
-from collections import defaultdict
 from functools import partial
 from typing import Any, Callable
 
 import numpy as np
 import torch
 from numpy.typing import NDArray
-from torch.utils.data import
-
-
-def read_dataset(dataset: Dataset[Any]) -> list[list[Any]]:
-    """
-    Extract information from a dataset at each index into individual lists of each information position
-
-    Parameters
-    ----------
-    dataset : torch.utils.data.Dataset
-        Input dataset
-
-    Returns
-    -------
-    List[List[Any]]
-        All objects in individual lists based on return position from dataset
-
-    Warning
-    -------
-    No type checking is done between lists or data inside lists
-
-    See Also
-    --------
-    torch.utils.data.Dataset
-
-    Examples
-    --------
-    >>> import numpy as np
-    >>> data = np.ones((10, 1, 3, 3))
-    >>> labels = np.ones((10,))
-    >>> class ICDataset:
-    ...     def __init__(self, data, labels):
-    ...         self.data = data
-    ...         self.labels = labels
-    ...
-    ...     def __getitem__(self, idx):
-    ...         return self.data[idx], self.labels[idx]
-
-    >>> ds = ICDataset(data, labels)
-
-    >>> result = read_dataset(ds)
-    >>> len(result)  # images and labels
-    2
-    >>> np.asarray(result[0]).shape  # images
-    (10, 1, 3, 3)
-    >>> np.asarray(result[1]).shape  # labels
-    (10,)
-    """
-
-    ddict: dict[int, list[Any]] = defaultdict(list[Any])
-
-    for data in dataset:
-        for i, d in enumerate(data if isinstance(data, tuple) else (data,)):
-            ddict[i].append(d)
-
-    return list(ddict.values())
+from torch.utils.data import DataLoader, TensorDataset
+from tqdm import tqdm
 
 
 def get_device(device: str | torch.device | None = None) -> torch.device:
@@ -167,3 +112,82 @@ def predict_batch(
         tuple(concat(p) for p in preds) if isinstance(preds, tuple) else concat(preds)  # type: ignore
     )
     return out
+
+
+def trainer(
+    model: torch.nn.Module,
+    x_train: NDArray[Any],
+    y_train: NDArray[Any] | None,
+    loss_fn: Callable[..., torch.Tensor | torch.nn.Module] | None,
+    optimizer: torch.optim.Optimizer | None,
+    preprocess_fn: Callable[[torch.Tensor], torch.Tensor] | None,
+    epochs: int,
+    batch_size: int,
+    device: torch.device,
+    verbose: bool,
+) -> None:
+    """
+    Train Pytorch model.
+
+    Parameters
+    ----------
+    model
+        Model to train.
+    loss_fn
+        Loss function used for training.
+    x_train
+        Training data.
+    y_train
+        Training labels.
+    optimizer
+        Optimizer used for training.
+    preprocess_fn
+        Preprocessing function applied to each training batch.
+    epochs
+        Number of training epochs.
+    reg_loss_fn
+        Allows an additional regularisation term to be defined as reg_loss_fn(model)
+    batch_size
+        Batch size used for training.
+    buffer_size
+        Maximum number of elements that will be buffered when prefetching.
+    verbose
+        Whether to print training progress.
+    """
+    if optimizer is None:
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+
+    if y_train is None:
+        dataset = TensorDataset(torch.from_numpy(x_train).to(torch.float32))
+
+    else:
+        dataset = TensorDataset(
+            torch.from_numpy(x_train).to(torch.float32), torch.from_numpy(y_train).to(torch.float32)
+        )
+
+    loader = DataLoader(dataset=dataset)
+
+    model = model.to(device)
+
+    # iterate over epochs
+    loss = torch.nan
+    disable_tqdm = not verbose
+    for epoch in (pbar := tqdm(range(epochs), disable=disable_tqdm)):
+        epoch_loss = loss
+        for step, data in enumerate(loader):
+            if step % 250 == 0:
+                pbar.set_description(f"Epoch: {epoch} ({epoch_loss:.3f}), loss: {loss:.3f}")
+
+            x, y = [d.to(device) for d in data] if len(data) > 1 else (data[0].to(device), None)
+
+            if isinstance(preprocess_fn, Callable):
+                x = preprocess_fn(x)
+
+            y_hat = model(x)
+            y = x if y is None else y
+
+            loss = loss_fn(y, y_hat)  # type: ignore
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
dataeval/utils/torch/models.py
CHANGED
@@ -1,6 +1,8 @@
+"""Simple PyTorch model architectures used by DataEval."""
+
 from __future__ import annotations
 
-__all__ = ["
+__all__ = ["Autoencoder", "Encoder", "Decoder"]
 
 import math
 from typing import Any
@@ -9,7 +11,7 @@ import torch
 import torch.nn as nn
 
 
-class
+class Autoencoder(nn.Module):
     """
     An autoencoder model with a separate encoder and decoder.
 
@@ -63,7 +65,7 @@ class Encoder(nn.Module):
     """
     A simple encoder to be used in an autoencoder model.
 
-    This is the encoder used by the
+    This is the encoder used by the Autoencoder model.
 
     Parameters
     ----------
@@ -104,7 +106,7 @@ class Decoder(nn.Module):
     """
     A simple decoder to be used in an autoencoder model.
 
-    This is the decoder used by the
+    This is the decoder used by the Autoencoder model.
 
     Parameters
     ----------
@@ -142,14 +144,14 @@ class Decoder(nn.Module):
 
 class AE(nn.Module):
     """
-    An autoencoder model with a separate encoder and decoder
-
+    An autoencoder model with a separate encoder and decoder used as the core of an autoencoder-based
+    OOD detector, i.e. as an argument to OOD_AE().
 
     Parameters
     ----------
     input_shape : tuple[int, int, int]
         Number of input channels, number of rows, number of columns. (Number of examples per batch will be inferred
-
+        at runtime.)
     """
 
     def __init__(self, input_shape: tuple[int, int, int]) -> None:
@@ -279,7 +281,7 @@ class Decoder_AE(nn.Module):
     """
     A simple decoder to be used in an autoencoder model.
 
-    This is the decoder used by the
+    This is the decoder used by the Autoencoder model.
 
     Parameters
    ----------
dataeval/utils/torch/trainer.py
CHANGED
@@ -1,15 +1,15 @@
+"""Utility classes for training PyTorch models."""
+
 from __future__ import annotations
 
-
+__all__ = ["AETrainer"]
+
+from typing import Any
 
 import torch
 import torch.nn as nn
-from numpy.typing import NDArray
 from torch.optim import Adam
-from torch.utils.data import DataLoader, Dataset
-from tqdm import tqdm
-
-__all__ = ["AETrainer", "trainer"]
+from torch.utils.data import DataLoader, Dataset
 
 
 def get_images_from_batch(batch: Any) -> Any:
@@ -176,82 +176,3 @@ class AETrainer:
            encodings = torch.vstack((encodings, embeddings)) if len(encodings) else embeddings
 
        return encodings
-
-
-def trainer(
-    model: torch.nn.Module,
-    x_train: NDArray[Any],
-    y_train: NDArray[Any] | None,
-    loss_fn: Callable[..., torch.Tensor | torch.nn.Module] | None,
-    optimizer: torch.optim.Optimizer | None,
-    preprocess_fn: Callable[[torch.Tensor], torch.Tensor] | None,
-    epochs: int,
-    batch_size: int,
-    device: torch.device,
-    verbose: bool,
-) -> None:
-    """
-    Train Pytorch model.
-
-    Parameters
-    ----------
-    model
-        Model to train.
-    loss_fn
-        Loss function used for training.
-    x_train
-        Training data.
-    y_train
-        Training labels.
-    optimizer
-        Optimizer used for training.
-    preprocess_fn
-        Preprocessing function applied to each training batch.
-    epochs
-        Number of training epochs.
-    reg_loss_fn
-        Allows an additional regularisation term to be defined as reg_loss_fn(model)
-    batch_size
-        Batch size used for training.
-    buffer_size
-        Maximum number of elements that will be buffered when prefetching.
-    verbose
-        Whether to print training progress.
-    """
-    if optimizer is None:
-        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
-
-    if y_train is None:
-        dataset = TensorDataset(torch.from_numpy(x_train).to(torch.float32))
-
-    else:
-        dataset = TensorDataset(
-            torch.from_numpy(x_train).to(torch.float32), torch.from_numpy(y_train).to(torch.float32)
-        )
-
-    loader = DataLoader(dataset=dataset)
-
-    model = model.to(device)
-
-    # iterate over epochs
-    loss = torch.nan
-    disable_tqdm = not verbose
-    for epoch in (pbar := tqdm(range(epochs), disable=disable_tqdm)):
-        epoch_loss = loss
-        for step, data in enumerate(loader):
-            if step % 250 == 0:
-                pbar.set_description(f"Epoch: {epoch} ({epoch_loss:.3f}), loss: {loss:.3f}")
-
-            x, y = [d.to(device) for d in data] if len(data) > 1 else (data[0].to(device), None)
-
-            if isinstance(preprocess_fn, Callable):
-                x = preprocess_fn(x)
-
-            y_hat = model(x)
-            y = x if y is None else y
-
-            loss = loss_fn(y, y_hat)  # type: ignore
-
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
dataeval/workflows/__init__.py
CHANGED
@@ -2,9 +2,6 @@
 Workflows perform a sequence of actions to analyze the dataset and make predictions.
 """
 
-
+__all__ = ["Sufficiency", "SufficiencyOutput"]
 
-
-from dataeval.workflows.sufficiency import Sufficiency, SufficiencyOutput
-
-__all__ = ["Sufficiency", "SufficiencyOutput"]
+from dataeval.workflows.sufficiency import Sufficiency, SufficiencyOutput
dataeval/workflows/sufficiency.py
CHANGED
@@ -1,16 +1,15 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
+import contextlib
 import warnings
 from dataclasses import dataclass
 from typing import Any, Callable, Generic, Iterable, Mapping, Sequence, TypeVar, cast
 
-import matplotlib.pyplot as plt
 import numpy as np
 import torch
 import torch.nn as nn
-from matplotlib.figure import Figure
 from numpy.typing import ArrayLike, NDArray
 from scipy.optimize import basinhopping
 from torch.utils.data import Dataset
@@ -18,11 +17,14 @@ from torch.utils.data import Dataset
 from dataeval.interop import as_numpy
 from dataeval.output import Output, set_metadata
 
+with contextlib.suppress(ImportError):
+    from matplotlib.figure import Figure
+
 
 @dataclass(frozen=True)
 class SufficiencyOutput(Output):
     """
-    Output class for :class:`Sufficiency` workflow
+    Output class for :class:`Sufficiency` workflow.
 
     Attributes
     ----------
@@ -97,7 +99,7 @@ class SufficiencyOutput(Output):
 
         Returns
         -------
-        list[
+        list[Figure]
            List of Figures for each measure
 
        Raises
@@ -344,7 +346,9 @@ def plot_measure(
     params: NDArray[Any],
     projection: NDArray[Any],
 ) -> Figure:
-
+    import matplotlib.pyplot
+
+    fig = matplotlib.pyplot.figure()
     fig = cast(Figure, fig)
     fig.tight_layout()
 
@@ -374,7 +378,7 @@ T = TypeVar("T")
 
 class Sufficiency(Generic[T]):
     """
-    Project dataset :term:`sufficiency<Sufficiency>` using given a model and evaluation criteria
+    Project dataset :term:`sufficiency<Sufficiency>` using given a model and evaluation criteria.
 
     Parameters
     ----------
@@ -510,7 +514,13 @@ class Sufficiency(Generic[T]):
     Examples
     --------
     >>> suff = Sufficiency(
-    ...     model=model,
+    ...     model=model,
+    ...     train_ds=train_ds,
+    ...     test_ds=test_ds,
+    ...     train_fn=train_fn,
+    ...     eval_fn=eval_fn,
+    ...     runs=3,
+    ...     substeps=5,
     ... )
     >>> suff.evaluate()
     SufficiencyOutput(steps=array([ 1, 3, 10, 31, 100], dtype=uint32), params={'test': array([ 0., 42., 0.])}, measures={'test': array([1., 1., 1., 1., 1.])})
{dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt
RENAMED
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c)
+Copyright (c) 2025 ARiA
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
dataeval-0.76.0.dist-info/METADATA
ADDED
@@ -0,0 +1,137 @@
+Metadata-Version: 2.1
+Name: dataeval
+Version: 0.76.0
+Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
+Home-page: https://dataeval.ai/
+License: MIT
+Author: Andrew Weng
+Author-email: andrew.weng@ariacoustics.com
+Maintainer: ARiA
+Maintainer-email: dataeval@ariacoustics.com
+Requires-Python: >=3.9,<3.13
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering
+Provides-Extra: all
+Requires-Dist: matplotlib ; extra == "all"
+Requires-Dist: numpy (>=1.24.2)
+Requires-Dist: pillow (>=10.3.0)
+Requires-Dist: requests
+Requires-Dist: scikit-learn (>=1.5.0)
+Requires-Dist: scipy (>=1.10)
+Requires-Dist: torch (>=2.2.0)
+Requires-Dist: torchvision (>=0.17.0)
+Requires-Dist: tqdm
+Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "4.0"
+Requires-Dist: xxhash (>=3.3)
+Project-URL: Documentation, https://dataeval.readthedocs.io/
+Project-URL: Repository, https://github.com/aria-ml/dataeval/
+Description-Content-Type: text/markdown
+
+# DataEval
+
+To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+
+## About DataEval
+
+<!-- start tagline -->
+
+DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+
+<!-- end tagline -->
+
+### Our mission
+
+<!-- start needs -->
+
+DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+
+<!-- end needs -->
+
+<!-- start JATIC interop -->
+DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
+DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+<!-- end JATIC interop -->
+
+## Getting Started
+
+**Python versions:** 3.9 - 3.12
+
+**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+
+Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+
+* [Installing with pip](#installing-with-pip)
+* [Installing with conda/mamba](#installing-with-conda)
+* [Installing from GitHub](#installing-from-github)
+
+### **Installing with pip**
+
+You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
+
+```bash
+pip install dataeval[all]
+```
+
+### **Installing with conda**
+
+DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
+are installed from the `pytorch` channel, the channel is specified in the below example.
+
+```bash
+micromamba create -f environment\environment.yaml -c pytorch
+```
+
+### **Installing from GitHub**
+
+To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+
+```bash
+sudo apt-get install git-lfs
+pip install poetry
+```
+
+Pull the source down and change to the DataEval project directory.
+
+```bash
+git clone https://github.com/aria-ml/dataeval.git
+cd dataeval
+```
+
+Install DataEval with optional dependencies for development.
+
+```bash
+poetry install --all-extras --with dev
+```
+
+Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+
+```bash
+poetry shell
+```
+
+## Contact Us
+
+If you have any questions, feel free to reach out to the people below:
+
+* **POC**: Scott Swan @scott.swan
+* **DPOC**: Andrew Weng @aweng
+
+## Acknowledgement
+
+<!-- start acknowledgement -->
+
+### CDAO Funding Acknowledgement
+
+This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+
+<!-- end acknowledgement -->
+
dataeval-0.76.0.dist-info/RECORD
ADDED
@@ -0,0 +1,67 @@
+dataeval/__init__.py,sha256=TSINwIPlGIGiYd66kY8gnBnEpBhcgWm7_029htFBgv8,1474
+dataeval/detectors/__init__.py,sha256=iifG-Z08mH5B4QhkKtAieDGJBKldKvmCXpDQJD9qVY8,206
+dataeval/detectors/drift/__init__.py,sha256=wO294Oz--l0GuZTAkBpyGwZphbQsot57HoiEX6kjNOc,652
+dataeval/detectors/drift/base.py,sha256=8zHUnUpmgpWMzDv5C-tUX61lbpDjhJ-eAIiNxaNvWP8,14469
+dataeval/detectors/drift/cvm.py,sha256=TATS6IOE0INO1pkyRkesgrhDawD_kITsRsOOGVRs420,4132
+dataeval/detectors/drift/ks.py,sha256=SAd2T9CdytXD7DegCzAX1pWYJdPuttyL97KAQYF4j7Y,4265
+dataeval/detectors/drift/mmd.py,sha256=z7JPFbW4fmHJhR-Qe1OQ4mM8kW6dNxnd3uHD9oXMETE,7599
+dataeval/detectors/drift/torch.py,sha256=ykD-Nggys5T9FTGXXbYYOi2WRKwEzEjXhL8ZueVmTxU,7659
+dataeval/detectors/drift/uncertainty.py,sha256=zkrqz5euJJtYFKsDiRqFfTnDjVOTbqpZWgZiGMrYxvI,5351
+dataeval/detectors/drift/updates.py,sha256=nKsF4xrMFZd2X84GJ5XnGylUuketX_RcH7UpcdlonIo,1781
+dataeval/detectors/linters/__init__.py,sha256=CZV5naeYQYL3sHXO_CXB26AXkyTeKHI-TMaewtEs8Ag,483
+dataeval/detectors/linters/clusterer.py,sha256=V-bNs4ut2E6SIqU4MR1Y96WBZcs4cavQhvXBB0vFZPw,20937
+dataeval/detectors/linters/duplicates.py,sha256=Ba-Nmbjqg_HDMlEBqlWW1aFO_BA-HSc-uWHc3cmI394,5620
+dataeval/detectors/linters/merged_stats.py,sha256=X-bDTwjyR8RuVmzxLaHZmQ5nI3oOWvsqVlitdSncapk,1355
+dataeval/detectors/linters/outliers.py,sha256=aGGGOJKs0FTObQtj1m-ench0MHADOhrhC8idf1wRB0s,13786
+dataeval/detectors/ood/__init__.py,sha256=Ws6_un4pFWNknki7Bp7qjrslZVB9pYNE-K72u2lF65k,291
+dataeval/detectors/ood/ae.py,sha256=SL8oKTERhMwaZTQWwDhQQ6H07UKj8ozXqEWO3TaOAos,2151
+dataeval/detectors/ood/base.py,sha256=-ApcC9lyZJAgk-joMpLXF20sJqtvlAugg-W18TcAsEw,3010
+dataeval/detectors/ood/metadata_ks_compare.py,sha256=-hEhDNXFC7X8wmFeoigO7A7Qn90vRLroN_nKDwNgjnE,5204
+dataeval/detectors/ood/metadata_least_likely.py,sha256=rb8GOgsrlrEzc6fxccdmyZQ5PC7HtTsTY8U97D-h5OU,5088
+dataeval/detectors/ood/metadata_ood_mi.py,sha256=7_Sdzf7-x1TlrIQvSyOIB98C8_UQhUwmwFQmZ9_q1Uc,4042
+dataeval/detectors/ood/mixin.py,sha256=Ia-rJF6rtGhE8uavijdbzOha3ueFk2CFfA0Ah_mnF40,4976
+dataeval/detectors/ood/output.py,sha256=yygnsjaIQB6v6sXh7glqX2aoqWdf3_YLINqx7BGKMtk,1710
+dataeval/interop.py,sha256=P9Kwe-vOVgbn1ng60y4giCnJYmHjIOpyGpccuIA7P1g,2322
+dataeval/log.py,sha256=Mn5bRWO0cgtAYd5VGYSFiPgu57ta3zoktrtHAZ1m3dU,357
+dataeval/metrics/__init__.py,sha256=OMntcHmmrsOfIlRsJTZQQaF5qXEuP61Li-ElKy7Ysbk,240
+dataeval/metrics/bias/__init__.py,sha256=SIg4Qxza9BqXyKNQLIY0bpqoFvZfK5-GaejpTH6efVc,601
+dataeval/metrics/bias/balance.py,sha256=B1sPackyodiBct9Hs88BR4nJde_R61JyjwSBIG_CFug,9171
+dataeval/metrics/bias/coverage.py,sha256=igVDWJSrO2MvaTEiDUhVzVWPGNB1QOZvngCi8UF0RwA,5746
+dataeval/metrics/bias/diversity.py,sha256=nF1y2FaQIU0yHQtckoddjqoty2hsVVMqwaXWHRdGfqA,8521
+dataeval/metrics/bias/parity.py,sha256=rzi7Z0Z6injCaj2vkbSsZvbKMfk1EN648oKinv5y5Dk,12760
+dataeval/metrics/estimators/__init__.py,sha256=oY_9jX7V-Kg7-4KpvMNB4rUhsk8QTA0DIoM8d2VtVIg,380
+dataeval/metrics/estimators/ber.py,sha256=vcndXr0PNLRlYz7u7K74f-B5g3DnUkaTO_WigGdj0cg,5012
+dataeval/metrics/estimators/divergence.py,sha256=joqqlH0AQFibJkHCCb7i7dMJIGF28fmZIR-tGupQQJQ,4247
+dataeval/metrics/estimators/uap.py,sha256=ZAQUjJCbdulftWk6yjILCbnXGOE8RuDqEINZRtMW3tc,2143
+dataeval/metrics/stats/__init__.py,sha256=pUT84sOxDiCHW6xz6Ml1Mf1bFszQrtd3qPG0Ja3boxA,1088
+dataeval/metrics/stats/base.py,sha256=1ejjwlA0FmllcAw7J9Yv1r7GMmBYKvuGPzmDk9ktASM,12613
+dataeval/metrics/stats/boxratiostats.py,sha256=PS1wvWwhTCMJX56erfPW-BZymXrevvXnKl2PkE0qmLk,6315
+dataeval/metrics/stats/datasetstats.py,sha256=mt5t5WhlVI7mo56dmhqgnk1eH8oBV7dahgmqkFDcKo0,7387
+dataeval/metrics/stats/dimensionstats.py,sha256=AlPor23dUH718jFNiVNedHQVaQzN-6OKQEVDQbnGE50,4027
+dataeval/metrics/stats/hashstats.py,sha256=5nNSJ3Tl8gPqpYlWpxl7EHfW6pJd1BtbXYUiuGgH4Eo,5070
+dataeval/metrics/stats/labelstats.py,sha256=v9EAg-9h0OtuoU0r3K5TJbHj87fjmnWnNdtg0EPp8co,7030
+dataeval/metrics/stats/pixelstats.py,sha256=tfvu0tYPgDS0jCCSY2sZ2Ice5r1nNuKx-LYXxZQCw7s,4220
+dataeval/metrics/stats/visualstats.py,sha256=pEQnAPFg-zQ1U5orwF0-U7kfHuZGjMJDsdEMAoDZd4I,4634
+dataeval/output.py,sha256=Dyfv1xlrwSbCe7HdDyq8t-kiIRJbBeaMEmMROr1FrVQ,4034
+dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dataeval/utils/__init__.py,sha256=WW9e_1RbtkvLDRqu1NpDw3-V4su4mA8yJ_P3bgd_7Ho,283
+dataeval/utils/dataset/__init__.py,sha256=IvRauQaa0CzJ5nZrfTSjGoaaKelyJcQDe3OPRw0-NXs,332
+dataeval/utils/dataset/datasets.py,sha256=7tSqN3d8UncqmXh4eiEwarXgVxc4sMuIKPTqBCE0pN8,15080
+dataeval/utils/dataset/read.py,sha256=Q_RaNTFXhkMsx3PrgJEIySdHAA-QxGuih6eq6mnJv-4,1524
+dataeval/utils/dataset/split.py,sha256=1vNy5I1zZx-LIf8B0y57dUaO_UdVd1hyJggUANkwNtM,18958
+dataeval/utils/image.py,sha256=AQljELyMFkYsf2AoNOH5dZG8DYE4hPw0MCk85eIXqAw,1926
+dataeval/utils/metadata.py,sha256=SjYPXvM7x_3OyQbdfn4WsViqMplEjRxTdz8tjSJEP3E,22497
+dataeval/utils/plot.py,sha256=YyFL1KoJgnl2Bip7m73WVBJa6zbsBnn5c1b3skFfUrA,7068
+dataeval/utils/shared.py,sha256=xvF3VLfyheVwJtdtDrneOobkKf7t-JTmf_w91FWXmqo,3616
+dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
+dataeval/utils/torch/blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
+dataeval/utils/torch/gmm.py,sha256=fQ8CBO4Bf6i9N1CZdeJ8VJP25fsPjgMextQkondwgvo,3693
+dataeval/utils/torch/internal.py,sha256=qAzQTwTI9Qy6f01Olw3d1TIJ4HoWGf0gQzgWVcdD2x4,6653
+dataeval/utils/torch/models.py,sha256=Df3B_9x5uu-Y5ZOyhRZYpKJnDvxt0hgMeJLy1E4oxpU,8519
+dataeval/utils/torch/trainer.py,sha256=Qay0LK63RuyoGYiJ5zI2C5BVym309ORvp6shhpcrIU4,5589
+dataeval/workflows/__init__.py,sha256=L9yfBipNFGnYuN2JbMknIHDvziwfa2XAGFnOwifZbls,216
+dataeval/workflows/sufficiency.py,sha256=jf53J1PAlfRHSjGpMCWRJzImitLtCQvTMCaMm28ZuPM,18675
+dataeval-0.76.0.dist-info/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+dataeval-0.76.0.dist-info/METADATA,sha256=zk12Bkp0R6Glx-VSrG7ip45aTU4y6i_P_mPw2c_SQ6w,5140
+dataeval-0.76.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+dataeval-0.76.0.dist-info/RECORD,,