dataeval 0.86.7__py3-none-any.whl → 0.86.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +10 -3
- dataeval/_version.py +21 -0
- dataeval/config.py +7 -1
- dataeval/detectors/drift/_mvdc.py +2 -9
- dataeval/detectors/drift/_nml/_chunk.py +2 -2
- dataeval/detectors/ood/ae.py +1 -1
- dataeval/detectors/ood/base.py +3 -3
- dataeval/metrics/bias/_completeness.py +3 -3
- dataeval/metrics/bias/_coverage.py +2 -2
- dataeval/metrics/bias/_parity.py +1 -1
- dataeval/metrics/estimators/_ber.py +2 -2
- dataeval/metrics/estimators/_divergence.py +2 -2
- dataeval/outputs/_estimators.py +6 -6
- dataeval/utils/_array.py +20 -9
- dataeval/utils/_clusterer.py +7 -7
- dataeval/utils/torch/_internal.py +3 -3
- dataeval/utils/torch/trainer.py +1 -1
- {dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info}/METADATA +67 -47
- {dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info}/RECORD +25 -24
- {dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info}/WHEEL +1 -1
- {dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info/licenses}/LICENSE.txt +0 -0
dataeval/__init__.py
CHANGED
@@ -7,12 +7,19 @@ shifts that impact performance of deployed models.
 
 from __future__ import annotations
 
-
-
+try:
+    from ._version import __version__
+except ImportError:
+    __version__ = "unknown"
+
+# Strongly type for pyright
+__version__ = str(__version__)
+
+__all__ = ["__version__", "config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
 
 import logging
 
-from
+from . import config, detectors, metrics, typing, utils, workflows
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
 
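For context, `dataeval.__version__` is now sourced from the generated `dataeval._version` module, with a string fallback when that module is missing; a minimal sketch of the resulting behavior (illustrative only, not part of the package):

```python
import dataeval

# Resolved from dataeval._version (written by setuptools-scm at build time);
# falls back to "unknown" when that module is absent, e.g. in a raw source tree.
print(dataeval.__version__)  # "0.86.8" for this wheel

# The str() cast in __init__.py guarantees a plain string for type checkers.
assert isinstance(dataeval.__version__, str)
```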
dataeval/_version.py
ADDED
@@ -0,0 +1,21 @@
+# file generated by setuptools-scm
+# don't change, don't track in version control
+
+__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
+TYPE_CHECKING = False
+if TYPE_CHECKING:
+    from typing import Tuple
+    from typing import Union
+
+    VERSION_TUPLE = Tuple[Union[int, str], ...]
+else:
+    VERSION_TUPLE = object
+
+version: str
+__version__: str
+__version_tuple__: VERSION_TUPLE
+version_tuple: VERSION_TUPLE
+
+__version__ = version = '0.86.8'
+__version_tuple__ = version_tuple = (0, 86, 8)
dataeval/config.py
CHANGED
@@ -77,7 +77,13 @@ def get_device(override: DeviceLike | None = None) -> torch.device:
     """
     if override is None:
         global _device
-        return
+        return (
+            torch.get_default_device()
+            if hasattr(torch, "get_default_device")
+            else torch.device("cpu")
+            if _device is None
+            else _device
+        )
     return _todevice(override)
 
 
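The hunk guards `torch.get_default_device()` behind `hasattr` because that accessor only exists in newer torch releases; a standalone sketch of the feature check (illustrative, outside the package):

```python
import torch

# torch.get_default_device() is only available in newer torch releases,
# so feature-detect it and fall back to the CPU device otherwise.
default = (
    torch.get_default_device()
    if hasattr(torch, "get_default_device")
    else torch.device("cpu")
)
print(default)
```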
dataeval/detectors/drift/_mvdc.py
CHANGED
@@ -1,16 +1,9 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
-
 import numpy as np
 import pandas as pd
 from numpy.typing import ArrayLike
 
-if TYPE_CHECKING:
-    from typing import Self
-else:
-    from typing_extensions import Self
-
 from dataeval.detectors.drift._nml._chunk import CountBasedChunker, SizeBasedChunker
 from dataeval.detectors.drift._nml._domainclassifier import DomainClassifierCalculator
 from dataeval.detectors.drift._nml._thresholds import ConstantThreshold
@@ -52,7 +45,7 @@ class DriftMVDC:
             threshold=ConstantThreshold(lower=self.threshold[0], upper=self.threshold[1]),
         )
 
-    def fit(self, x_ref: ArrayLike) ->
+    def fit(self, x_ref: ArrayLike) -> DriftMVDC:
         """
         Fit the domain classifier on the training dataframe
 
@@ -63,7 +56,7 @@ class DriftMVDC:
 
         Returns
         -------
-
+        DriftMVDC
 
         """
         # for 1D input, assume that is 1 sample: dim[1,n_features]
dataeval/detectors/drift/_nml/_chunk.py
CHANGED
@@ -46,10 +46,10 @@ class Chunk(ABC):
         return self.data.shape[0]
 
     @abstractmethod
-    def __add__(self, other:
+    def __add__(self, other: Any) -> Any: ...
 
     @abstractmethod
-    def __lt__(self, other:
+    def __lt__(self, other: Any) -> bool: ...
 
     @abstractmethod
     def dict(self) -> dict[str, Any]: ...
dataeval/detectors/ood/ae.py
CHANGED
@@ -65,7 +65,7 @@ class OOD_AE(OODBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.
+        loss_fn: Callable[..., torch.Tensor] | None = None,
         optimizer: torch.optim.Optimizer | None = None,
         epochs: int = 20,
         batch_size: int = 64,
dataeval/detectors/ood/base.py
CHANGED
@@ -22,7 +22,7 @@ from dataeval.utils.torch._gmm import GaussianMixtureModelParams, gmm_params
 from dataeval.utils.torch._internal import trainer
 
 
-class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.
+class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.Tensor], torch.optim.Optimizer]):
     def __init__(self, model: torch.nn.Module, device: DeviceLike | None = None) -> None:
         self.device: torch.device = get_device(device)
         super().__init__(model)
@@ -31,7 +31,7 @@ class OODBase(OODBaseMixin[torch.nn.Module], OODFitMixin[Callable[..., torch.nn.
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.
+        loss_fn: Callable[..., torch.Tensor] | None,
         optimizer: torch.optim.Optimizer | None,
         epochs: int,
         batch_size: int,
@@ -82,7 +82,7 @@ class OODBaseGMM(OODBase, OODGMMMixin[GaussianMixtureModelParams]):
         self,
         x_ref: ArrayLike,
         threshold_perc: float,
-        loss_fn: Callable[..., torch.
+        loss_fn: Callable[..., torch.Tensor] | None,
         optimizer: torch.optim.Optimizer | None,
         epochs: int,
         batch_size: int,
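For illustration, the updated `loss_fn` annotation accepts any callable returning a `torch.Tensor`, or `None` to keep the detectors' default loss; a hedged sketch of such a callable (the name `mse_loss` is an example, not part of the diff):

```python
import torch

def mse_loss(y: torch.Tensor, y_hat: torch.Tensor) -> torch.Tensor:
    # Any callable returning a torch.Tensor satisfies Callable[..., torch.Tensor];
    # passing loss_fn=None keeps the detector's built-in default.
    return torch.nn.functional.mse_loss(y_hat, y)
```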
dataeval/metrics/bias/_completeness.py
CHANGED
@@ -9,11 +9,11 @@ import numpy as np
 
 from dataeval.config import EPSILON
 from dataeval.outputs import CompletenessOutput
-from dataeval.typing import
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings
 
 
-def completeness(embeddings:
+def completeness(embeddings: Array, quantiles: int) -> CompletenessOutput:
     """
     Calculate the fraction of boxes in a grid defined by quantiles that
     contain at least one data point.
@@ -21,7 +21,7 @@ def completeness(embeddings: ArrayLike, quantiles: int) -> CompletenessOutput:
 
     Parameters
     ----------
-    embeddings :
+    embeddings : Array
         Embedded dataset (or other low-dimensional data) (nxp)
     quantiles : int
         number of quantile values to use for partitioning each dimension
dataeval/metrics/bias/_coverage.py
CHANGED
@@ -10,13 +10,13 @@ from scipy.spatial.distance import pdist, squareform
 
 from dataeval.outputs import CoverageOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings, flatten
 
 
 @set_metadata
 def coverage(
-    embeddings:
+    embeddings: Array,
     radius_type: Literal["adaptive", "naive"] = "adaptive",
     num_observations: int = 20,
     percent: float = 0.01,
dataeval/metrics/bias/_parity.py
CHANGED
@@ -271,7 +271,7 @@ def parity(metadata: Metadata) -> ParityOutput:
         # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
         contingency_matrix = contingency_matrix[np.any(contingency_matrix, axis=1)]
 
-        chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]
+        chi_scores[i], p_values[i] = chi2_contingency(contingency_matrix)[:2]  # type: ignore
 
     if insufficient_data:
         warnings.warn(
dataeval/metrics/estimators/_ber.py
CHANGED
@@ -22,7 +22,7 @@ from scipy.stats import mode
 from dataeval.config import EPSILON
 from dataeval.outputs import BEROutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import
+from dataeval.typing import Array
 from dataeval.utils._array import as_numpy, ensure_embeddings
 from dataeval.utils._method import get_method
 from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
@@ -105,7 +105,7 @@ _BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
 
 
 @set_metadata
-def ber(embeddings:
+def ber(embeddings: Array, labels: Array, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
     """
     An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
     using FR or KNN test statistic basis.
dataeval/metrics/estimators/_divergence.py
CHANGED
@@ -14,7 +14,7 @@ from numpy.typing import NDArray
 
 from dataeval.outputs import DivergenceOutput
 from dataeval.outputs._base import set_metadata
-from dataeval.typing import
+from dataeval.typing import Array
 from dataeval.utils._array import ensure_embeddings
 from dataeval.utils._method import get_method
 from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
@@ -65,7 +65,7 @@ _DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
 
 
 @set_metadata
-def divergence(emb_a:
+def divergence(emb_a: Array, emb_b: Array, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
     Calculates the :term:`divergence` and any errors between the datasets.
 
dataeval/outputs/_estimators.py
CHANGED
@@ -47,11 +47,11 @@ class ClustererOutput(Output):
         The strength of the data point belonging to the assigned cluster
     """
 
-    clusters: NDArray[np.
-    mst: NDArray[np.
-    linkage_tree: NDArray[np.
-    condensed_tree: NDArray[np.
-    membership_strengths: NDArray[np.
+    clusters: NDArray[np.intp]
+    mst: NDArray[np.float32]
+    linkage_tree: NDArray[np.float32]
+    condensed_tree: NDArray[np.float32]
+    membership_strengths: NDArray[np.float32]
 
     def find_outliers(self) -> NDArray[np.int_]:
         """
@@ -77,7 +77,7 @@ class ClustererOutput(Output):
         # Delay load numba compiled functions
         from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find
 
-        exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)
+        exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)  # type: ignore
 
         exact_dupes = sorted_union_find(exact_indices)
         near_dupes = sorted_union_find(near_indices)
 
dataeval/utils/_array.py
CHANGED
@@ -19,7 +19,7 @@ _logger = logging.getLogger(__name__)
 
 _MODULE_CACHE = {}
 
-T = TypeVar("T",
+T = TypeVar("T", Array, np.ndarray, torch.Tensor)
 _np_dtype = TypeVar("_np_dtype", bound=np.generic)
 
 
@@ -73,6 +73,19 @@ def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
         yield to_numpy(array)
 
 
+@overload
+def rescale_array(array: NDArray[_np_dtype]) -> NDArray[_np_dtype]: ...
+@overload
+def rescale_array(array: torch.Tensor) -> torch.Tensor: ...
+def rescale_array(array: Array | NDArray[_np_dtype] | torch.Tensor) -> Array | NDArray[_np_dtype] | torch.Tensor:
+    """Rescale an array to the range [0, 1]"""
+    if isinstance(array, (np.ndarray, torch.Tensor)):
+        arr_min = array.min()
+        arr_max = array.max()
+        return (array - arr_min) / (arr_max - arr_min)
+    raise TypeError(f"Unsupported type: {type(array)}")
+
+
 @overload
 def ensure_embeddings(
     embeddings: T,
@@ -137,14 +150,12 @@ def ensure_embeddings(
     if arr.ndim != 2:
         raise ValueError(f"Expected a 2D array, but got a {arr.ndim}D array.")
 
-    if unit_interval:
-
-
-
-
-
-    else:
-        raise ValueError("Embeddings must be unit interval [0, 1].")
+    if unit_interval and (arr.min() < 0 or arr.max() > 1):
+        if unit_interval == "force":
+            warnings.warn("Embeddings are not unit interval [0, 1]. Forcing to unit interval.")
+            arr = rescale_array(arr)
+        else:
+            raise ValueError("Embeddings must be unit interval [0, 1].")
 
     if dtype is None:
         return embeddings
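As a reader aid, the new `rescale_array` helper is a plain min-max rescale, and `ensure_embeddings(..., unit_interval="force")` now warns and applies it instead of raising; a standalone sketch of the arithmetic (illustrative, not the packaged code):

```python
import numpy as np

# Min-max rescale into [0, 1], mirroring what rescale_array does for
# np.ndarray and torch.Tensor inputs (anything else raises TypeError).
x = np.array([2.0, 4.0, 6.0], dtype=np.float32)
scaled = (x - x.min()) / (x.max() - x.min())
print(scaled)  # [0.  0.5 1. ]
```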
dataeval/utils/_clusterer.py
CHANGED
@@ -69,12 +69,12 @@ def compare_links_to_cluster_std(
 @dataclass
 class ClusterData:
     clusters: NDArray[np.intp]
-    mst: NDArray[np.
-    linkage_tree: NDArray[np.
+    mst: NDArray[np.float32]
+    linkage_tree: NDArray[np.float32]
     condensed_tree: CondensedTree
-    membership_strengths: NDArray[np.
+    membership_strengths: NDArray[np.float32]
     k_neighbors: NDArray[np.int32]
-    k_distances: NDArray[np.
+    k_distances: NDArray[np.float32]
 
 
 def cluster(data: ArrayLike) -> ClusterData:
@@ -95,9 +95,9 @@ def cluster(data: ArrayLike) -> ClusterData:
 
     max_neighbors = min(25, num_samples - 1)
     kneighbors, kdistances = calculate_neighbor_distances(x, max_neighbors)
-    unsorted_mst: NDArray[np.
-    mst: NDArray[np.
-    linkage_tree: NDArray[np.
+    unsorted_mst: NDArray[np.float32] = minimum_spanning_tree(x, kneighbors, kdistances)
+    mst: NDArray[np.float32] = unsorted_mst[np.argsort(unsorted_mst.T[2])]
+    linkage_tree: NDArray[np.float32] = mst_to_linkage_tree(mst).astype(np.float32)
     condensed_tree: CondensedTree = condense_tree(linkage_tree, min_cluster_size, None)
 
     cluster_tree = cluster_tree_from_condensed_tree(condensed_tree)
dataeval/utils/torch/_internal.py
CHANGED
@@ -65,7 +65,7 @@ def trainer(
     model: torch.nn.Module,
     x_train: NDArray[Any],
     y_train: NDArray[Any] | None,
-    loss_fn: Callable[..., torch.Tensor
+    loss_fn: Callable[..., torch.Tensor] | None,
     optimizer: torch.optim.Optimizer | None,
     preprocess_fn: Callable[[torch.Tensor], torch.Tensor] | None,
     epochs: int,
@@ -117,7 +117,7 @@ def trainer(
     model = model.to(device)
 
     # iterate over epochs
-    loss = torch.nan
+    loss = torch.scalar_tensor(torch.nan)
     disable_tqdm = not verbose
     for epoch in (pbar := tqdm(range(epochs), disable=disable_tqdm)):
         epoch_loss = loss
@@ -133,7 +133,7 @@ def trainer(
             y_hat = model(x)
             y = x if y is None else y
 
-            loss = loss_fn(y, *y_hat) if isinstance(y_hat, tuple) else loss_fn(y, y_hat)
+            loss = loss_fn(y, *y_hat) if isinstance(y_hat, tuple) else loss_fn(y, y_hat)
 
             optimizer.zero_grad()
             loss.backward()
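The sentinel change above keeps `loss` (and therefore `epoch_loss`) a tensor before the first batch runs; a quick standalone check (illustrative):

```python
import torch

# A 0-dim NaN tensor instead of the Python float torch.nan keeps the
# variable's type stable, so tensor methods can be called on it unconditionally.
loss = torch.scalar_tensor(torch.nan)
print(loss.shape, loss.isnan().item())  # torch.Size([]) True
```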
dataeval/utils/torch/trainer.py
CHANGED
@@ -172,7 +172,7 @@ class AETrainer:
        for batch in dl:
            imgs = get_images_from_batch(batch)
            imgs = imgs.to(self.device)
-            embeddings = encode_func(imgs).to("cpu")
+            embeddings = encode_func(imgs).to("cpu")  # type: ignore
            encodings = torch.vstack((encodings, embeddings)) if len(encodings) else embeddings
 
        return encodings
{dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info}/METADATA
CHANGED
@@ -1,45 +1,52 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: dataeval
-Version: 0.86.
+Version: 0.86.8
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
-
-
-
-Author-email: andrew.weng@ariacoustics.com
-Maintainer: ARiA
-
-
+Project-URL: Homepage, https://dataeval.ai/
+Project-URL: Repository, https://github.com/aria-ml/dataeval/
+Project-URL: Documentation, https://dataeval.readthedocs.io/
+Author-email: Andrew Weng <andrew.weng@ariacoustics.com>, Bill Peria <bill.peria@ariacoustics.com>, Jon Botts <jonathan.botts@ariacoustics.com>, Jonathan Christian <jonathan.christian@ariacoustics.com>, Justin McMillan <justin.mcmillan@ariacoustics.com>, Ryan Wood <ryan.wood@ariacoustics.com>, Scott Swan <scott.swan@ariacoustics.com>, Shaun Jullens <shaun.jullens@ariacoustics.com>
+Maintainer-email: ARiA <dataeval@ariacoustics.com>
+License-Expression: MIT
+License-File: LICENSE.txt
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering
+Requires-Python: <3.13,>=3.9
+Requires-Dist: defusedxml>=0.7.1
+Requires-Dist: fast-hdbscan==0.2.0
+Requires-Dist: lightgbm>=4
+Requires-Dist: numba>=0.59.1
+Requires-Dist: numpy>=1.24.2
+Requires-Dist: pandas>=2.0
+Requires-Dist: pillow>=10.3.0
+Requires-Dist: polars>=1.0.0
+Requires-Dist: requests>=2.32.3
+Requires-Dist: scikit-learn>=1.5.0
+Requires-Dist: scipy>=1.10
+Requires-Dist: torch>=2.2.0
+Requires-Dist: torchvision>=0.17.0
+Requires-Dist: tqdm>=4.66
+Requires-Dist: typing-extensions>=4.12; python_version ~= '3.9'
+Requires-Dist: xxhash>=3.3
 Provides-Extra: all
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist: scikit-learn (>=1.5.0)
-Requires-Dist: scipy (>=1.10)
-Requires-Dist: torch (>=2.2.0)
-Requires-Dist: torchvision (>=0.17.0)
-Requires-Dist: tqdm
-Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "4.0"
-Requires-Dist: xxhash (>=3.3)
-Project-URL: Documentation, https://dataeval.readthedocs.io/
-Project-URL: Repository, https://github.com/aria-ml/dataeval/
+Requires-Dist: matplotlib>=3.7.1; extra == 'all'
+Provides-Extra: cpu
+Requires-Dist: torch>=2.2.0; extra == 'cpu'
+Requires-Dist: torchvision>=0.17.0; extra == 'cpu'
+Provides-Extra: cu118
+Requires-Dist: torch>=2.2.0; extra == 'cu118'
+Requires-Dist: torchvision>=0.17.0; extra == 'cu118'
+Provides-Extra: cu124
+Requires-Dist: torch>=2.2.0; extra == 'cu124'
+Requires-Dist: torchvision>=0.17.0; extra == 'cu124'
 Description-Content-Type: text/markdown
 
 # DataEval
@@ -72,26 +79,28 @@ estimation, bias detection, and dataset linting.
 <!-- end needs -->
 
 <!-- start JATIC interop -->
+
 DataEval is easy to install, supports a wide range of Python versions, and is
 compatible with many of the most popular packages in the scientific and T&E
 communities.
 
 DataEval also has native interoperability between JATIC's suite of tools when
 using MAITE-compliant datasets and models.
+
 <!-- end JATIC interop -->
 
 ## Getting Started
 
 **Python versions:** 3.9 - 3.12
 
-**Supported packages**:
+**Supported packages**: _NumPy_, _Pandas_, _Sci-kit learn_, _MAITE_, _NRTK_
 
 Choose your preferred method of installation below or follow our
 [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
 
-
-
-
+- [Installing with pip](#installing-with-pip)
+- [Installing with conda/mamba](#installing-with-conda)
+- [Installing from GitHub](#installing-from-github)
 
 ### **Installing with pip**
 
@@ -105,7 +114,7 @@ pip install dataeval[all]
 ### **Installing with conda**
 
 DataEval can be installed in a Conda/Mamba environment using the provided
-`environment.yaml` file.
+`environment.yaml` file. As some dependencies are installed from the `pytorch`
 channel, the channel is specified in the below example.
 
 ```bash
@@ -115,12 +124,10 @@ micromamba create -f environment\environment.yaml -c pytorch
 ### **Installing from GitHub**
 
 To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
-download larger, binary source files
-management.
+download larger, binary source files.
 
 ```bash
 sudo apt-get install git-lfs
-pip install poetry
 ```
 
 Pull the source down and change to the DataEval project directory.
@@ -130,26 +137,40 @@ git clone https://github.com/aria-ml/dataeval.git
 cd dataeval
 ```
 
-
+#### **Using Poetry**
+
+Install DataEval with all extras.
 
 ```bash
-poetry install --all
+poetry install --extras=all
 ```
 
-
-environment by prefixing shell commands with `poetry run`, or activate the
-virtual environment directly in the shell.
+Enable Poetry's virtual environment.
 
 ```bash
-poetry
+poetry env activate
+```
+
+#### **Using uv**
+
+Install DataEval with all extras and dependencies for development.
+
+```bash
+uv sync --extra=all
+```
+
+Enable uv's virtual environment.
+
+```bash
+source .venv/bin/activate
 ```
 
 ## Contact Us
 
 If you have any questions, feel free to reach out to the people below:
 
-
-
+- **POC**: Scott Swan @scott.swan
+- **DPOC**: Andrew Weng @aweng
 
 ## Acknowledgement
 
@@ -164,4 +185,3 @@ interpreted as necessarily representing the official policies or endorsements,
 either expressed or implied, of the U.S. Government.
 
 <!-- end acknowledgement -->
-
{dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info}/RECORD
CHANGED
@@ -1,6 +1,9 @@
-dataeval/__init__.py,sha256=
+dataeval/__init__.py,sha256=dEDltdHOnbk4-XAbQwJLOZtCbRLZsDMnptWRwbF2r54,1773
 dataeval/_log.py,sha256=C7AGkIRzymvYJ0LQXtnShiy3i5Xrp8T58JzIHHguk_Q,365
-dataeval/
+dataeval/_version.py,sha256=IPUOExUy8nF4kYGtCPV5bg6_IYDRLVOKnFJcNllcO1M,513
+dataeval/config.py,sha256=g3Np0Q3J5Rzij6Gsz7tJh7eOxgwNPf6NsFYmAR8Atfs,4219
+dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+dataeval/typing.py,sha256=W8rqFFkAqE5a5ar3MmB-O5gcMJqvoDKXC8Y0ggBqAKo,7216
 dataeval/data/__init__.py,sha256=wzQ6uUFLNB3VJR0a2QnRBYwEmwXT93q0WpHu7FmFW1E,486
 dataeval/data/_embeddings.py,sha256=PFjpdV9bfusCB4taTIYSzx1hP8nJb_KCkZTN8kMw-Hs,12885
 dataeval/data/_images.py,sha256=Rc_59CuU4zfN7Xm7an1XUx8ZghQg6a56VJWMZD9edRw,2654
@@ -21,21 +24,21 @@ dataeval/detectors/drift/_base.py,sha256=6aNF1LzG3w1sNUrmSBbsvuN5IkQnoRikRacqobY
 dataeval/detectors/drift/_cvm.py,sha256=cS33zWJmFY1fft1XcANcP2jSD5ou7TxvIU2AldhTynM,3004
 dataeval/detectors/drift/_ks.py,sha256=uMc5-NA-lSV1IODrY8uJe87ll3uRJT_oXLJFXy95M1w,3186
 dataeval/detectors/drift/_mmd.py,sha256=uw8axM6dWxTBrCaXwkbldIDcdhe4hmim9yrsbuOwA-0,11523
-dataeval/detectors/drift/_mvdc.py,sha256=
+dataeval/detectors/drift/_mvdc.py,sha256=WMN6aDOWCh1q1MtdRXFIZlFcfnVi4XgBHsS0A6L5UuY,2942
+dataeval/detectors/drift/_uncertainty.py,sha256=BHlykJ-r7TGLJxdPfoazXnoAJ1qVDzbk5HjAMdsnHz8,5847
+dataeval/detectors/drift/updates.py,sha256=L1PnrPlIE1x6ujCc5mCwjcAZwadVTn-Zjb6MnTDvzJQ,2251
 dataeval/detectors/drift/_nml/__init__.py,sha256=MNyKyZlfTjr5uQql2uBBfRkUdsuduie_WJdn09GYmqg,137
 dataeval/detectors/drift/_nml/_base.py,sha256=o34LcCsD9p1A6u8UdQn-dxIVwC2CMr6uCpC0vq16JX0,2663
-dataeval/detectors/drift/_nml/_chunk.py,sha256=
+dataeval/detectors/drift/_nml/_chunk.py,sha256=xF3U-CAobzoKX-20yjWjGVD14IOcAV6rPaIrqCMwGdQ,13564
 dataeval/detectors/drift/_nml/_domainclassifier.py,sha256=n7Ttq5Ej7sAY9Jn2iagaGj4IIWiG8gmA3wwFizlBqes,7292
 dataeval/detectors/drift/_nml/_result.py,sha256=TMK17bnlgSdL0MCRHtQZJO8YoWWe4C2kh_akESrlP1g,3269
 dataeval/detectors/drift/_nml/_thresholds.py,sha256=WGdkLei9w_EvvsRHQzWdDyFVoZHIwM78k_aB3eoh31Q,12060
-dataeval/detectors/drift/_uncertainty.py,sha256=BHlykJ-r7TGLJxdPfoazXnoAJ1qVDzbk5HjAMdsnHz8,5847
-dataeval/detectors/drift/updates.py,sha256=L1PnrPlIE1x6ujCc5mCwjcAZwadVTn-Zjb6MnTDvzJQ,2251
 dataeval/detectors/linters/__init__.py,sha256=xn2zPwUcmsuf-Jd9uw6AVI11C9z1b1Y9fYtuFnXenZ0,404
 dataeval/detectors/linters/duplicates.py,sha256=X5WSEvI_BHkLoXjkaHK6wTnSkx4IjpO_exMRjSlhc70,4963
 dataeval/detectors/linters/outliers.py,sha256=GaM9n8yPgBPzVOL_bxJCj0eCwobEEP4JHKHD9liRdlw,10130
 dataeval/detectors/ood/__init__.py,sha256=juCYBDs7CQEAtMhnEpPqF6uTrOIH9kTBSuQ_GRw6a8o,283
-dataeval/detectors/ood/ae.py,sha256=
-dataeval/detectors/ood/base.py,sha256=
+dataeval/detectors/ood/ae.py,sha256=cJ7nq4iwTvW8uihHCUhGfTlKsAlthJ2tOhgSsB27cOY,2941
+dataeval/detectors/ood/base.py,sha256=hx-TPJnUTZ7KcBkm8SbN1RGhtJyQN0XLajDyNqiZrJo,3042
 dataeval/detectors/ood/mixin.py,sha256=0_o-1HPvgf3-Lf1MSOIfjj5UB8LTLEBGYtJJfyCCzwc,5431
 dataeval/metadata/__init__.py,sha256=XDDmJbOZBNM6pL0r6Nbu6oMRoyAh22IDkPYGndNlkZU,316
 dataeval/metadata/_distance.py,sha256=MbXM9idsooNWnGLaTKg8j4ZqavUeJUjuW7EPW3-UQyg,4234
@@ -44,14 +47,14 @@ dataeval/metadata/_utils.py,sha256=BcGoYVfA4AkAWpInY5txOc3QBpsGf6cnnUAsHOQTJAE,1
 dataeval/metrics/__init__.py,sha256=8VC8q3HuJN3o_WN51Ae2_wXznl3RMXIvA5GYVcy7vr8,225
 dataeval/metrics/bias/__init__.py,sha256=329S1_3WnWqeU4-qVcbe0fMy4lDrj9uKslWHIQf93yg,839
 dataeval/metrics/bias/_balance.py,sha256=fREtoMLUZPOf_ivqNKwij6oPiKMTk02ECO5rWURf3KY,5541
-dataeval/metrics/bias/_completeness.py,sha256=
-dataeval/metrics/bias/_coverage.py,sha256=
+dataeval/metrics/bias/_completeness.py,sha256=2cvOXe7fhtxZGH_4QBuiCafIeamxFBarMiUBuEP7QGI,4596
+dataeval/metrics/bias/_coverage.py,sha256=v2x2hbOf2za9jFcSVSJUAoJ2BJfzzlCzt0mFIGtBL0A,3639
 dataeval/metrics/bias/_diversity.py,sha256=25udDKmel9IjeVT5nM4dOa1apda66QdRxBc922yuUvI,5830
-dataeval/metrics/bias/_parity.py,sha256=
+dataeval/metrics/bias/_parity.py,sha256=MKpqL4aoqEHkRl0vtGvVq9V3KBOtDFTtAo5I2GfIG4A,11443
 dataeval/metrics/estimators/__init__.py,sha256=Pnds8uIyAovt2fKqZjiHCIP_kVoBWlVllekYuK5UmmU,568
-dataeval/metrics/estimators/_ber.py,sha256=
+dataeval/metrics/estimators/_ber.py,sha256=7noeRyOJJYqrJ_jt90nRHtR2t2u5MIvTCmWt0_rd4EU,5370
 dataeval/metrics/estimators/_clusterer.py,sha256=1HrpihGTJ63IkNSOy4Ibw633Gllkm1RxKmoKT5MOgt0,1434
-dataeval/metrics/estimators/_divergence.py,sha256
+dataeval/metrics/estimators/_divergence.py,sha256=t-Z_7Bq4V4FunxKlq7G4ThtgLany8n4iEU0n0afr7F8,3991
 dataeval/metrics/estimators/_uap.py,sha256=BULEBbJ9BQ1IcTeZf0x7iI60QHAWCccBOM97FIu9VXA,1928
 dataeval/metrics/stats/__init__.py,sha256=6tA_9nbbM5ObJ6cds8Y1VBtTQiTOxrpGQSFLu_lWGGA,1098
 dataeval/metrics/stats/_base.py,sha256=R-hxoEPLreZcxYxBfyjbKfdoGMMTPiqJ5g2zSO-1UYM,12541
@@ -66,19 +69,17 @@ dataeval/outputs/__init__.py,sha256=geHB5M3QOiFFaQGV4ZwDTTKpqZPvPePbqG7lzaPhaXQ,
 dataeval/outputs/_base.py,sha256=-Wa0gFcBVLbfWPMZyCql7x4vGsnkLP4pecsQIeUZ2_Y,5904
 dataeval/outputs/_bias.py,sha256=1OZpKncYTryjPLRHb4d6NlhE27uPT57gCob_5jtjKDI,10456
 dataeval/outputs/_drift.py,sha256=hXILED_soY8ppIQZgftQvmumtwDrTnABbYl-flIGEU4,4588
-dataeval/outputs/_estimators.py,sha256=
+dataeval/outputs/_estimators.py,sha256=IQgSbOPHYzzxn1X64XF2XxQhDlWy6jwy6RNyoyvsipE,3111
 dataeval/outputs/_linters.py,sha256=k8lkd8EZ23q0m-HOD-FgqMcLQFy1UH7vws2ucLPyn08,6697
 dataeval/outputs/_metadata.py,sha256=ffZgpX8KWURPHXpOWjbvJ2KRqWQkS2nWuIjKUzoHhMI,1710
 dataeval/outputs/_ood.py,sha256=suLKVXULGtXH0rq9eXHI1d3d2jhGmItJtz4QiQd47A4,1718
 dataeval/outputs/_stats.py,sha256=_ItGjs9YaMHqjivkR1YBcSErD5ICfa_-iV9nq0l8bTM,17451
 dataeval/outputs/_utils.py,sha256=NfhYaGT2PZlhIs8ICKUsPWHZXjhWYDkEJqBDdqMeaOM,929
 dataeval/outputs/_workflows.py,sha256=K786mOgegxVi81diUA-qpbwGEkwa8YA7Fk4ttgjJeaY,10831
-dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-dataeval/typing.py,sha256=W8rqFFkAqE5a5ar3MmB-O5gcMJqvoDKXC8Y0ggBqAKo,7216
 dataeval/utils/__init__.py,sha256=hRvyUK7b3d6JBEV5u47rFcOHEcmDYqAvZQw_T5pDAWw,264
-dataeval/utils/_array.py,sha256=
+dataeval/utils/_array.py,sha256=bIDbnv15_hNzFn2Uc4WV1qRyFzubQj2nNYsFUDIdwT0,6335
 dataeval/utils/_bin.py,sha256=w3eJ2Szw5eapqQ0cGv731rhNgLFGW0cCz2pXo9I6CuY,7296
-dataeval/utils/_clusterer.py,sha256=
+dataeval/utils/_clusterer.py,sha256=rUvEdyMwp95lffmt6xKMEwsjRXNoBS0n5mAS_HNOnck,5656
 dataeval/utils/_fast_mst.py,sha256=pv42flr1Uf5RBa9qDG0YLDXWH7Mr7a9zpauO1HqZXaY,8061
 dataeval/utils/_image.py,sha256=4uxTIOYZZlRJOfNmdA3ek3no3FrLWCK5un48kStMDt8,3578
 dataeval/utils/_method.py,sha256=9B9JQbgqWJBRhQJb7glajUtWaQzUTIUuvrZ9_bisxsM,394
@@ -102,12 +103,12 @@ dataeval/utils/datasets/_voc.py,sha256=pafY112O80isYkrdy7Quie9SBm_TmYhREuyl8Sxts
 dataeval/utils/torch/__init__.py,sha256=dn5mjCrFp0b1aL_UEURhONU0Ag0cmXoTOBSGagpkTiA,325
 dataeval/utils/torch/_blocks.py,sha256=HVhBTMMD5NA4qheMUgyol1KWiKZDIuc8k5j4RcMKmhk,1466
 dataeval/utils/torch/_gmm.py,sha256=XM68GNEP97EjaB1U49-ZXRb81d0CEFnPS910alrcB3g,3740
-dataeval/utils/torch/_internal.py,sha256=
+dataeval/utils/torch/_internal.py,sha256=9rzlMeM8i3p-ctulh9WDQATMXtlp-Jk2pBX7NGC8l2I,4146
 dataeval/utils/torch/models.py,sha256=1idpXyjrYcCBSsbxxRUOto8xr4MJNjDEqQHiIXVU5Zc,9700
-dataeval/utils/torch/trainer.py,sha256=
+dataeval/utils/torch/trainer.py,sha256=DRyPScGdE4o5Xo3BmD9p2PGOApzi1E-QfsBRNZ5IXW8,5544
 dataeval/workflows/__init__.py,sha256=ou8y0KO-d6W5lgmcyLjKlf-J_ckP3vilW7wHkgiDlZ4,255
 dataeval/workflows/sufficiency.py,sha256=j-R8dg4XE6a66p_oTXG2GNzgg3vGk85CTblxhFXaxog,8513
-dataeval-0.86.
-dataeval-0.86.
-dataeval-0.86.
-dataeval-0.86.
+dataeval-0.86.8.dist-info/METADATA,sha256=rCf58-uzgjsTNZkY3LOBMSi5fhQ2cdAtnrrDI_eYR_I,5925
+dataeval-0.86.8.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+dataeval-0.86.8.dist-info/licenses/LICENSE.txt,sha256=uAooygKWvX6NbU9Ran9oG2msttoG8aeTeHSTe5JeCnY,1061
+dataeval-0.86.8.dist-info/RECORD,,

{dataeval-0.86.7.dist-info → dataeval-0.86.8.dist-info/licenses}/LICENSE.txt
File without changes