dataeval 0.76.0__tar.gz → 0.81.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.76.0 → dataeval-0.81.0}/PKG-INFO +44 -15
- {dataeval-0.76.0 → dataeval-0.81.0}/README.md +38 -13
- {dataeval-0.76.0 → dataeval-0.81.0}/pyproject.toml +22 -9
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/__init__.py +3 -3
- dataeval-0.76.0/src/dataeval/output.py → dataeval-0.81.0/src/dataeval/_output.py +14 -0
- dataeval-0.81.0/src/dataeval/config.py +77 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/__init__.py +1 -1
- dataeval-0.81.0/src/dataeval/detectors/drift/__init__.py +22 -0
- dataeval-0.76.0/src/dataeval/detectors/drift/base.py → dataeval-0.81.0/src/dataeval/detectors/drift/_base.py +41 -30
- dataeval-0.76.0/src/dataeval/detectors/drift/cvm.py → dataeval-0.81.0/src/dataeval/detectors/drift/_cvm.py +21 -28
- dataeval-0.76.0/src/dataeval/detectors/drift/ks.py → dataeval-0.81.0/src/dataeval/detectors/drift/_ks.py +20 -26
- dataeval-0.76.0/src/dataeval/detectors/drift/mmd.py → dataeval-0.81.0/src/dataeval/detectors/drift/_mmd.py +33 -19
- dataeval-0.76.0/src/dataeval/detectors/drift/torch.py → dataeval-0.81.0/src/dataeval/detectors/drift/_torch.py +2 -1
- dataeval-0.76.0/src/dataeval/detectors/drift/uncertainty.py → dataeval-0.81.0/src/dataeval/detectors/drift/_uncertainty.py +23 -7
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/drift/updates.py +1 -1
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/__init__.py +0 -3
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/duplicates.py +17 -8
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/outliers.py +52 -43
- dataeval-0.81.0/src/dataeval/detectors/ood/ae.py +93 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/base.py +5 -4
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/metadata_ks_compare.py +1 -1
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/mixin.py +20 -5
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/output.py +1 -1
- dataeval-0.76.0/src/dataeval/detectors/ood/ae.py → dataeval-0.81.0/src/dataeval/detectors/ood/vae.py +13 -12
- dataeval-0.81.0/src/dataeval/metadata/__init__.py +5 -0
- dataeval-0.81.0/src/dataeval/metadata/_ood.py +238 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/metrics/__init__.py +1 -1
- dataeval-0.81.0/src/dataeval/metrics/bias/__init__.py +22 -0
- dataeval-0.76.0/src/dataeval/metrics/bias/balance.py → dataeval-0.81.0/src/dataeval/metrics/bias/_balance.py +67 -17
- dataeval-0.76.0/src/dataeval/metrics/bias/coverage.py → dataeval-0.81.0/src/dataeval/metrics/bias/_coverage.py +41 -35
- dataeval-0.76.0/src/dataeval/metrics/bias/diversity.py → dataeval-0.81.0/src/dataeval/metrics/bias/_diversity.py +17 -12
- dataeval-0.76.0/src/dataeval/metrics/bias/parity.py → dataeval-0.81.0/src/dataeval/metrics/bias/_parity.py +89 -63
- dataeval-0.81.0/src/dataeval/metrics/estimators/__init__.py +19 -0
- dataeval-0.76.0/src/dataeval/metrics/estimators/ber.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_ber.py +42 -11
- dataeval-0.81.0/src/dataeval/metrics/estimators/_clusterer.py +104 -0
- dataeval-0.76.0/src/dataeval/metrics/estimators/divergence.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_divergence.py +18 -13
- dataeval-0.76.0/src/dataeval/metrics/estimators/uap.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_uap.py +4 -4
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/metrics/stats/__init__.py +7 -7
- dataeval-0.76.0/src/dataeval/metrics/stats/base.py → dataeval-0.81.0/src/dataeval/metrics/stats/_base.py +52 -16
- dataeval-0.76.0/src/dataeval/metrics/stats/boxratiostats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_boxratiostats.py +6 -9
- dataeval-0.76.0/src/dataeval/metrics/stats/datasetstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_datasetstats.py +10 -14
- dataeval-0.76.0/src/dataeval/metrics/stats/dimensionstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_dimensionstats.py +6 -5
- dataeval-0.76.0/src/dataeval/metrics/stats/hashstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_hashstats.py +6 -6
- dataeval-0.76.0/src/dataeval/metrics/stats/labelstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_labelstats.py +25 -25
- dataeval-0.76.0/src/dataeval/metrics/stats/pixelstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_pixelstats.py +5 -4
- dataeval-0.76.0/src/dataeval/metrics/stats/visualstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_visualstats.py +9 -8
- dataeval-0.81.0/src/dataeval/typing.py +54 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/__init__.py +2 -2
- dataeval-0.81.0/src/dataeval/utils/_array.py +169 -0
- dataeval-0.81.0/src/dataeval/utils/_bin.py +199 -0
- dataeval-0.81.0/src/dataeval/utils/_clusterer.py +144 -0
- dataeval-0.81.0/src/dataeval/utils/_fast_mst.py +189 -0
- dataeval-0.76.0/src/dataeval/utils/image.py → dataeval-0.81.0/src/dataeval/utils/_image.py +6 -4
- dataeval-0.81.0/src/dataeval/utils/_method.py +18 -0
- dataeval-0.76.0/src/dataeval/utils/shared.py → dataeval-0.81.0/src/dataeval/utils/_mst.py +3 -65
- dataeval-0.76.0/src/dataeval/utils/plot.py → dataeval-0.81.0/src/dataeval/utils/_plot.py +4 -4
- dataeval-0.81.0/src/dataeval/utils/data/__init__.py +22 -0
- dataeval-0.81.0/src/dataeval/utils/data/_embeddings.py +105 -0
- dataeval-0.81.0/src/dataeval/utils/data/_images.py +65 -0
- dataeval-0.81.0/src/dataeval/utils/data/_metadata.py +352 -0
- dataeval-0.81.0/src/dataeval/utils/data/_selection.py +119 -0
- dataeval-0.76.0/src/dataeval/utils/dataset/split.py → dataeval-0.81.0/src/dataeval/utils/data/_split.py +13 -14
- dataeval-0.81.0/src/dataeval/utils/data/_targets.py +73 -0
- dataeval-0.81.0/src/dataeval/utils/data/_types.py +58 -0
- dataeval-0.81.0/src/dataeval/utils/data/collate.py +103 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_base.py +254 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval-0.81.0/src/dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/__init__.py +15 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/_classfilter.py +60 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/_indices.py +26 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/_limit.py +26 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval-0.81.0/src/dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval-0.81.0/src/dataeval/utils/metadata.py +403 -0
- dataeval-0.76.0/src/dataeval/utils/torch/gmm.py → dataeval-0.81.0/src/dataeval/utils/torch/_gmm.py +4 -2
- dataeval-0.76.0/src/dataeval/utils/torch/internal.py → dataeval-0.81.0/src/dataeval/utils/torch/_internal.py +21 -51
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/models.py +43 -2
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/workflows/sufficiency.py +10 -9
- dataeval-0.76.0/src/dataeval/detectors/drift/__init__.py +0 -22
- dataeval-0.76.0/src/dataeval/detectors/linters/clusterer.py +0 -512
- dataeval-0.76.0/src/dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval-0.76.0/src/dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval-0.76.0/src/dataeval/interop.py +0 -69
- dataeval-0.76.0/src/dataeval/metrics/bias/__init__.py +0 -21
- dataeval-0.76.0/src/dataeval/metrics/estimators/__init__.py +0 -9
- dataeval-0.76.0/src/dataeval/utils/dataset/__init__.py +0 -7
- dataeval-0.76.0/src/dataeval/utils/dataset/datasets.py +0 -412
- dataeval-0.76.0/src/dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.0/src/dataeval/utils/metadata.py +0 -581
- {dataeval-0.76.0 → dataeval-0.81.0}/LICENSE.txt +0 -0
- /dataeval-0.76.0/src/dataeval/log.py → /dataeval-0.81.0/src/dataeval/_log.py +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/__init__.py +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/py.typed +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/__init__.py +0 -0
- /dataeval-0.76.0/src/dataeval/utils/torch/blocks.py → /dataeval-0.81.0/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/workflows/__init__.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.76.0
|
3
|
+
Version: 0.81.0
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -21,8 +21,12 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Programming Language :: Python :: 3 :: Only
|
22
22
|
Classifier: Topic :: Scientific/Engineering
|
23
23
|
Provides-Extra: all
|
24
|
-
Requires-Dist:
|
24
|
+
Requires-Dist: defusedxml (>=0.7.1)
|
25
|
+
Requires-Dist: fast_hdbscan (==0.2.0)
|
26
|
+
Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
|
27
|
+
Requires-Dist: numba (>=0.59.1)
|
25
28
|
Requires-Dist: numpy (>=1.24.2)
|
29
|
+
Requires-Dist: pandas (>=2.0) ; extra == "all"
|
26
30
|
Requires-Dist: pillow (>=10.3.0)
|
27
31
|
Requires-Dist: requests
|
28
32
|
Requires-Dist: scikit-learn (>=1.5.0)
|
@@ -38,13 +42,17 @@ Description-Content-Type: text/markdown
|
|
38
42
|
|
39
43
|
# DataEval
|
40
44
|
|
41
|
-
To view our extensive collection of tutorials, how-to's, explanation guides,
|
45
|
+
To view our extensive collection of tutorials, how-to's, explanation guides,
|
46
|
+
and reference material, please visit our documentation on
|
47
|
+
**[Read the Docs](https://dataeval.readthedocs.io/)**
|
42
48
|
|
43
49
|
## About DataEval
|
44
50
|
|
45
51
|
<!-- start tagline -->
|
46
52
|
|
47
|
-
DataEval curates datasets to train and test performant, robust, unbiased and
|
53
|
+
DataEval curates datasets to train and test performant, robust, unbiased and
|
54
|
+
reliable AI models and monitors for data shifts that impact performance of
|
55
|
+
deployed models.
|
48
56
|
|
49
57
|
<!-- end tagline -->
|
50
58
|
|
@@ -52,22 +60,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
|
|
52
60
|
|
53
61
|
<!-- start needs -->
|
54
62
|
|
55
|
-
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
63
|
+
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
64
|
+
engineer. Throughout all stages of the machine learning lifecycle, DataEval
|
65
|
+
supports model development, data analysis, and monitoring with state-of-the-art
|
66
|
+
algorithms to help you solve difficult problems. With a focus on computer
|
67
|
+
vision tasks, DataEval provides simple, but effective metrics for performance
|
68
|
+
estimation, bias detection, and dataset linting.
|
56
69
|
|
57
70
|
<!-- end needs -->
|
58
71
|
|
59
72
|
<!-- start JATIC interop -->
|
60
|
-
DataEval is easy to install, supports a wide range of Python versions, and is
|
61
|
-
|
73
|
+
DataEval is easy to install, supports a wide range of Python versions, and is
|
74
|
+
compatible with many of the most popular packages in the scientific and T&E
|
75
|
+
communities.
|
76
|
+
|
77
|
+
DataEval also has native interoperability with JATIC's suite of tools when
|
78
|
+
using MAITE-compliant datasets and models.
|
62
79
|
<!-- end JATIC interop -->
|
63
80
|
|
64
81
|
## Getting Started
|
65
82
|
|
66
83
|
**Python versions:** 3.9 - 3.12
|
67
84
|
|
68
|
-
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
85
|
+
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
86
|
+
*Gradient*
|
69
87
|
|
70
|
-
Choose your preferred method of installation below or follow our
|
88
|
+
Choose your preferred method of installation below or follow our
|
89
|
+
[installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
|
71
90
|
|
72
91
|
* [Installing with pip](#installing-with-pip)
|
73
92
|
* [Installing with conda/mamba](#installing-with-conda)
|
@@ -75,7 +94,8 @@ Choose your preferred method of installation below or follow our [installation g
|
|
75
94
|
|
76
95
|
### **Installing with pip**
|
77
96
|
|
78
|
-
You can install DataEval directly from pypi.org using the following command.
|
97
|
+
You can install DataEval directly from pypi.org using the following command.
|
98
|
+
The optional dependencies of DataEval are `all`.
|
79
99
|
|
80
100
|
```bash
|
81
101
|
pip install dataeval[all]
|
@@ -83,8 +103,9 @@ pip install dataeval[all]
|
|
83
103
|
|
84
104
|
### **Installing with conda**
|
85
105
|
|
86
|
-
DataEval can be installed in a Conda/Mamba environment using the provided
|
87
|
-
are installed from the `pytorch`
|
106
|
+
DataEval can be installed in a Conda/Mamba environment using the provided
|
107
|
+
`environment.yaml` file. As some dependencies are installed from the `pytorch`
|
108
|
+
channel, the channel is specified in the example below.
|
88
109
|
|
89
110
|
```bash
|
90
111
|
micromamba create -f environment\environment.yaml -c pytorch
|
@@ -92,7 +113,9 @@ micromamba create -f environment\environment.yaml -c pytorch
|
|
92
113
|
|
93
114
|
### **Installing from GitHub**
|
94
115
|
|
95
|
-
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
116
|
+
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
117
|
+
download larger, binary source files and `poetry` for project dependency
|
118
|
+
management.
|
96
119
|
|
97
120
|
```bash
|
98
121
|
sudo apt-get install git-lfs
|
@@ -112,7 +135,9 @@ Install DataEval with optional dependencies for development.
|
|
112
135
|
poetry install --all-extras --with dev
|
113
136
|
```
|
114
137
|
|
115
|
-
Now that DataEval is installed, you can run commands in the poetry virtual
|
138
|
+
Now that DataEval is installed, you can run commands in the poetry virtual
|
139
|
+
environment by prefixing shell commands with `poetry run`, or activate the
|
140
|
+
virtual environment directly in the shell.
|
116
141
|
|
117
142
|
```bash
|
118
143
|
poetry shell
|
@@ -131,7 +156,11 @@ If you have any questions, feel free to reach out to the people below:
|
|
131
156
|
|
132
157
|
### CDAO Funding Acknowledgement
|
133
158
|
|
134
|
-
This material is based upon work supported by the Chief Digital and Artificial
|
159
|
+
This material is based upon work supported by the Chief Digital and Artificial
|
160
|
+
Intelligence Office under Contract No. W519TC-23-9-2033. The views and
|
161
|
+
conclusions contained herein are those of the author(s) and should not be
|
162
|
+
interpreted as necessarily representing the official policies or endorsements,
|
163
|
+
either expressed or implied, of the U.S. Government.
|
135
164
|
|
136
165
|
<!-- end acknowledgement -->
|
137
166
|
|
@@ -1,12 +1,16 @@
|
|
1
1
|
# DataEval
|
2
2
|
|
3
|
-
To view our extensive collection of tutorials, how-to's, explanation guides,
|
3
|
+
To view our extensive collection of tutorials, how-to's, explanation guides,
|
4
|
+
and reference material, please visit our documentation on
|
5
|
+
**[Read the Docs](https://dataeval.readthedocs.io/)**
|
4
6
|
|
5
7
|
## About DataEval
|
6
8
|
|
7
9
|
<!-- start tagline -->
|
8
10
|
|
9
|
-
DataEval curates datasets to train and test performant, robust, unbiased and
|
11
|
+
DataEval curates datasets to train and test performant, robust, unbiased and
|
12
|
+
reliable AI models and monitors for data shifts that impact performance of
|
13
|
+
deployed models.
|
10
14
|
|
11
15
|
<!-- end tagline -->
|
12
16
|
|
@@ -14,22 +18,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
|
|
14
18
|
|
15
19
|
<!-- start needs -->
|
16
20
|
|
17
|
-
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
21
|
+
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
22
|
+
engineer. Throughout all stages of the machine learning lifecycle, DataEval
|
23
|
+
supports model development, data analysis, and monitoring with state-of-the-art
|
24
|
+
algorithms to help you solve difficult problems. With a focus on computer
|
25
|
+
vision tasks, DataEval provides simple, but effective metrics for performance
|
26
|
+
estimation, bias detection, and dataset linting.
|
18
27
|
|
19
28
|
<!-- end needs -->
|
20
29
|
|
21
30
|
<!-- start JATIC interop -->
|
22
|
-
DataEval is easy to install, supports a wide range of Python versions, and is
|
23
|
-
|
31
|
+
DataEval is easy to install, supports a wide range of Python versions, and is
|
32
|
+
compatible with many of the most popular packages in the scientific and T&E
|
33
|
+
communities.
|
34
|
+
|
35
|
+
DataEval also has native interoperability with JATIC's suite of tools when
|
36
|
+
using MAITE-compliant datasets and models.
|
24
37
|
<!-- end JATIC interop -->
|
25
38
|
|
26
39
|
## Getting Started
|
27
40
|
|
28
41
|
**Python versions:** 3.9 - 3.12
|
29
42
|
|
30
|
-
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
43
|
+
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
44
|
+
*Gradient*
|
31
45
|
|
32
|
-
Choose your preferred method of installation below or follow our
|
46
|
+
Choose your preferred method of installation below or follow our
|
47
|
+
[installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
|
33
48
|
|
34
49
|
* [Installing with pip](#installing-with-pip)
|
35
50
|
* [Installing with conda/mamba](#installing-with-conda)
|
@@ -37,7 +52,8 @@ Choose your preferred method of installation below or follow our [installation g
|
|
37
52
|
|
38
53
|
### **Installing with pip**
|
39
54
|
|
40
|
-
You can install DataEval directly from pypi.org using the following command.
|
55
|
+
You can install DataEval directly from pypi.org using the following command.
|
56
|
+
The optional dependencies of DataEval are `all`.
|
41
57
|
|
42
58
|
```bash
|
43
59
|
pip install dataeval[all]
|
@@ -45,8 +61,9 @@ pip install dataeval[all]
|
|
45
61
|
|
46
62
|
### **Installing with conda**
|
47
63
|
|
48
|
-
DataEval can be installed in a Conda/Mamba environment using the provided
|
49
|
-
are installed from the `pytorch`
|
64
|
+
DataEval can be installed in a Conda/Mamba environment using the provided
|
65
|
+
`environment.yaml` file. As some dependencies are installed from the `pytorch`
|
66
|
+
channel, the channel is specified in the example below.
|
50
67
|
|
51
68
|
```bash
|
52
69
|
micromamba create -f environment\environment.yaml -c pytorch
|
@@ -54,7 +71,9 @@ micromamba create -f environment\environment.yaml -c pytorch
|
|
54
71
|
|
55
72
|
### **Installing from GitHub**
|
56
73
|
|
57
|
-
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
74
|
+
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
75
|
+
download larger, binary source files and `poetry` for project dependency
|
76
|
+
management.
|
58
77
|
|
59
78
|
```bash
|
60
79
|
sudo apt-get install git-lfs
|
@@ -74,7 +93,9 @@ Install DataEval with optional dependencies for development.
|
|
74
93
|
poetry install --all-extras --with dev
|
75
94
|
```
|
76
95
|
|
77
|
-
Now that DataEval is installed, you can run commands in the poetry virtual
|
96
|
+
Now that DataEval is installed, you can run commands in the poetry virtual
|
97
|
+
environment by prefixing shell commands with `poetry run`, or activate the
|
98
|
+
virtual environment directly in the shell.
|
78
99
|
|
79
100
|
```bash
|
80
101
|
poetry shell
|
@@ -93,6 +114,10 @@ If you have any questions, feel free to reach out to the people below:
|
|
93
114
|
|
94
115
|
### CDAO Funding Acknowledgement
|
95
116
|
|
96
|
-
This material is based upon work supported by the Chief Digital and Artificial
|
117
|
+
This material is based upon work supported by the Chief Digital and Artificial
|
118
|
+
Intelligence Office under Contract No. W519TC-23-9-2033. The views and
|
119
|
+
conclusions contained herein are those of the author(s) and should not be
|
120
|
+
interpreted as necessarily representing the official policies or endorsements,
|
121
|
+
either expressed or implied, of the U.S. Government.
|
97
122
|
|
98
123
|
<!-- end acknowledgement -->
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "dataeval"
|
3
|
-
version = "0.
|
3
|
+
version = "0.81.0" # dynamic
|
4
4
|
description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
|
5
5
|
license = "MIT"
|
6
6
|
readme = "README.md"
|
@@ -42,6 +42,9 @@ packages = [
|
|
42
42
|
[tool.poetry.dependencies]
|
43
43
|
# required
|
44
44
|
python = ">=3.9,<3.13"
|
45
|
+
defusedxml = {version = ">=0.7.1"}
|
46
|
+
fast_hdbscan = {version = "0.2.0"} # 0.2.1 hits a bug in condense_tree comparing float to none
|
47
|
+
numba = {version = ">=0.59.1"}
|
45
48
|
numpy = {version = ">=1.24.2"}
|
46
49
|
pillow = {version = ">=10.3.0"}
|
47
50
|
requests = {version = "*"}
|
@@ -54,10 +57,11 @@ typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
|
|
54
57
|
xxhash = {version = ">=3.3"}
|
55
58
|
|
56
59
|
# optional
|
57
|
-
matplotlib = {version = "
|
60
|
+
matplotlib = {version = ">=3.7.1", optional = true}
|
61
|
+
pandas = {version = ">=2.0", optional = true}
|
58
62
|
|
59
63
|
[tool.poetry.extras]
|
60
|
-
all = ["matplotlib"]
|
64
|
+
all = ["matplotlib", "pandas"]
|
61
65
|
|
62
66
|
[tool.poetry.group.dev]
|
63
67
|
optional = true
|
@@ -81,20 +85,20 @@ coverage = {version = "*", extras = ["toml"]}
|
|
81
85
|
pyright = {version = "*", extras = ["nodejs"]}
|
82
86
|
# prototype
|
83
87
|
maite = {version = "*"}
|
84
|
-
pandas = {version = "*"}
|
85
88
|
seaborn = {version = "*"}
|
86
89
|
# docs
|
87
90
|
certifi = {version = ">=2024.07.04"}
|
88
91
|
enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
|
89
92
|
ipykernel = {version = ">=6.26.0"}
|
90
93
|
ipywidgets = {version = ">=8.1.1"}
|
91
|
-
jinja2 = {version = ">=3.1.
|
94
|
+
jinja2 = {version = ">=3.1.6"}
|
92
95
|
jupyter-client = {version = ">=8.6.0"}
|
93
96
|
jupyter-cache = {version = "*"}
|
94
97
|
myst-nb = {version = ">=1.0.0"}
|
95
|
-
sphinx-immaterial = {version = "*"}
|
96
98
|
sphinx-autoapi = {version = "*"}
|
97
99
|
sphinx-design = {version = "*"}
|
100
|
+
sphinx-immaterial = {version = "*"}
|
101
|
+
sphinx-new-tab-link = {version = "*"}
|
98
102
|
sphinx-tabs = {version = "*"}
|
99
103
|
Sphinx = {version = ">=7.2.6"}
|
100
104
|
torchmetrics = {version = ">=1.0.0", source = "pytorch"}
|
@@ -128,6 +132,11 @@ reportMissingImports = false
|
|
128
132
|
norecursedirs = ["prototype"]
|
129
133
|
testpaths = ["tests"]
|
130
134
|
addopts = ["--pythonwarnings=ignore::DeprecationWarning", "--verbose", "--durations=20", "--durations-min=1.0"]
|
135
|
+
markers = [
|
136
|
+
"required: marks tests for required features",
|
137
|
+
"optional: marks tests for optional features",
|
138
|
+
"requires_all: marks tests that require the all extras",
|
139
|
+
]
|
131
140
|
|
132
141
|
[tool.coverage.run]
|
133
142
|
source = ["src/dataeval"]
|
@@ -142,8 +151,9 @@ exclude_also = [
|
|
142
151
|
]
|
143
152
|
include = ["*/src/dataeval/*"]
|
144
153
|
omit = [
|
145
|
-
"*/torch/
|
146
|
-
"*/
|
154
|
+
"*/torch/_blocks.py",
|
155
|
+
"*/_clusterer.py",
|
156
|
+
"*/_fast_mst.py",
|
147
157
|
]
|
148
158
|
fail_under = 90
|
149
159
|
|
@@ -177,6 +187,9 @@ per-file-ignores = { "*.ipynb" = ["E402"] }
|
|
177
187
|
[tool.ruff.lint.isort]
|
178
188
|
known-first-party = ["dataeval"]
|
179
189
|
|
190
|
+
[tool.ruff.lint.flake8-builtins]
|
191
|
+
builtins-strict-checking = false
|
192
|
+
|
180
193
|
[tool.ruff.format]
|
181
194
|
quote-style = "double"
|
182
195
|
indent-style = "space"
|
@@ -186,7 +199,7 @@ docstring-code-format = true
|
|
186
199
|
docstring-code-line-length = "dynamic"
|
187
200
|
|
188
201
|
[tool.codespell]
|
189
|
-
skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
|
202
|
+
skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html,./docs/source/*/data'
|
190
203
|
ignore-words-list = ["Hart"]
|
191
204
|
|
192
205
|
[build-system]
|
@@ -7,12 +7,12 @@ shifts that impact performance of deployed models.
|
|
7
7
|
|
8
8
|
from __future__ import annotations
|
9
9
|
|
10
|
-
__all__ = ["detectors", "log", "metrics", "utils", "workflows"]
|
11
|
-
__version__ = "0.76.0"
|
10
|
+
__all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
|
11
|
+
__version__ = "0.81.0"
|
12
12
|
|
13
13
|
import logging
|
14
14
|
|
15
|
-
from dataeval import detectors, metrics, utils, workflows
|
15
|
+
from dataeval import config, detectors, metrics, typing, utils, workflows
|
16
16
|
|
17
17
|
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
18
18
|
|
@@ -32,9 +32,23 @@ class Output:
|
|
32
32
|
return f"{self.__class__.__name__}: {str(self.dict())}"
|
33
33
|
|
34
34
|
def dict(self) -> dict[str, Any]:
|
35
|
+
"""
|
36
|
+
Output attributes as a dictionary.
|
37
|
+
|
38
|
+
Returns
|
39
|
+
-------
|
40
|
+
dict[str, Any]
|
41
|
+
"""
|
35
42
|
return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
|
36
43
|
|
37
44
|
def meta(self) -> dict[str, Any]:
|
45
|
+
"""
|
46
|
+
Execution metadata as a dictionary.
|
47
|
+
|
48
|
+
Returns
|
49
|
+
-------
|
50
|
+
dict[str, Any]
|
51
|
+
"""
|
38
52
|
return {k.removeprefix("_"): v for k, v in self.__dict__.items() if k.startswith("_")}
|
39
53
|
|
40
54
|
|
@@ -0,0 +1,77 @@
|
|
1
|
+
"""
|
2
|
+
Global configuration settings for DataEval.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from __future__ import annotations
|
6
|
+
|
7
|
+
__all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes"]
|
8
|
+
|
9
|
+
import torch
|
10
|
+
from torch import device
|
11
|
+
|
12
|
+
_device: device | None = None
|
13
|
+
_processes: int | None = None
|
14
|
+
|
15
|
+
|
16
|
+
def set_device(device: str | device | int) -> None:
|
17
|
+
"""
|
18
|
+
Sets the default device to use when executing against a PyTorch backend.
|
19
|
+
|
20
|
+
Parameters
|
21
|
+
----------
|
22
|
+
device : str or int or `torch.device`
|
23
|
+
The default device to use. See `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
|
24
|
+
documentation for more information.
|
25
|
+
"""
|
26
|
+
global _device
|
27
|
+
_device = torch.device(device)
|
28
|
+
|
29
|
+
|
30
|
+
def get_device(override: str | device | int | None = None) -> torch.device:
|
31
|
+
"""
|
32
|
+
Returns the PyTorch device to use.
|
33
|
+
|
34
|
+
Parameters
|
35
|
+
----------
|
36
|
+
override : str or int or `torch.device` or None, default None
|
37
|
+
The user specified override if provided, otherwise returns the default device.
|
38
|
+
|
39
|
+
Returns
|
40
|
+
-------
|
41
|
+
`torch.device`
|
42
|
+
"""
|
43
|
+
if override is None:
|
44
|
+
global _device
|
45
|
+
return torch.get_default_device() if _device is None else _device
|
46
|
+
else:
|
47
|
+
return torch.device(override)
|
48
|
+
|
49
|
+
|
50
|
+
def set_max_processes(processes: int | None) -> None:
|
51
|
+
"""
|
52
|
+
Sets the maximum number of worker processes to use when running tasks that support parallel processing.
|
53
|
+
|
54
|
+
Parameters
|
55
|
+
----------
|
56
|
+
processes : int or None
|
57
|
+
The maximum number of worker processes to use, or None to use
|
58
|
+
`os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
|
59
|
+
to determine the number of worker processes.
|
60
|
+
"""
|
61
|
+
global _processes
|
62
|
+
_processes = processes
|
63
|
+
|
64
|
+
|
65
|
+
def get_max_processes() -> int | None:
|
66
|
+
"""
|
67
|
+
Returns the maximum number of worker processes to use when running tasks that support parallel processing.
|
68
|
+
|
69
|
+
Returns
|
70
|
+
-------
|
71
|
+
int or None
|
72
|
+
The maximum number of worker processes to use, or None to use
|
73
|
+
`os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
|
74
|
+
to determine the number of worker processes.
|
75
|
+
"""
|
76
|
+
global _processes
|
77
|
+
return _processes
|
@@ -0,0 +1,22 @@
|
|
1
|
+
"""
|
2
|
+
:term:`Drift` detectors identify if the statistical properties of the data have changed.
|
3
|
+
"""
|
4
|
+
|
5
|
+
__all__ = [
|
6
|
+
"DriftCVM",
|
7
|
+
"DriftKS",
|
8
|
+
"DriftMMD",
|
9
|
+
"DriftMMDOutput",
|
10
|
+
"DriftOutput",
|
11
|
+
"DriftUncertainty",
|
12
|
+
"preprocess_drift",
|
13
|
+
"updates",
|
14
|
+
]
|
15
|
+
|
16
|
+
from dataeval.detectors.drift import updates
|
17
|
+
from dataeval.detectors.drift._base import DriftOutput
|
18
|
+
from dataeval.detectors.drift._cvm import DriftCVM
|
19
|
+
from dataeval.detectors.drift._ks import DriftKS
|
20
|
+
from dataeval.detectors.drift._mmd import DriftMMD, DriftMMDOutput
|
21
|
+
from dataeval.detectors.drift._torch import preprocess_drift
|
22
|
+
from dataeval.detectors.drift._uncertainty import DriftUncertainty
|
@@ -10,16 +10,18 @@ from __future__ import annotations
|
|
10
10
|
|
11
11
|
__all__ = []
|
12
12
|
|
13
|
+
import math
|
13
14
|
from abc import ABC, abstractmethod
|
14
15
|
from dataclasses import dataclass
|
15
16
|
from functools import wraps
|
16
17
|
from typing import Any, Callable, Literal, TypeVar
|
17
18
|
|
18
19
|
import numpy as np
|
19
|
-
from numpy.typing import
|
20
|
+
from numpy.typing import NDArray
|
20
21
|
|
21
|
-
from dataeval.
|
22
|
-
from dataeval.
|
22
|
+
from dataeval._output import Output, set_metadata
|
23
|
+
from dataeval.typing import Array, ArrayLike
|
24
|
+
from dataeval.utils._array import as_numpy, to_numpy
|
23
25
|
|
24
26
|
R = TypeVar("R")
|
25
27
|
|
@@ -46,16 +48,9 @@ class UpdateStrategy(ABC):
|
|
46
48
|
class DriftBaseOutput(Output):
|
47
49
|
"""
|
48
50
|
Base output class for Drift Detector classes
|
49
|
-
|
50
|
-
Attributes
|
51
|
-
----------
|
52
|
-
is_drift : bool
|
53
|
-
Drift prediction for the images
|
54
|
-
threshold : float
|
55
|
-
Threshold after multivariate correction if needed
|
56
51
|
"""
|
57
52
|
|
58
|
-
|
53
|
+
drifted: bool
|
59
54
|
threshold: float
|
60
55
|
p_val: float
|
61
56
|
distance: float
|
@@ -64,14 +59,18 @@ class DriftBaseOutput(Output):
|
|
64
59
|
@dataclass(frozen=True)
|
65
60
|
class DriftOutput(DriftBaseOutput):
|
66
61
|
"""
|
67
|
-
Output class for :class
|
62
|
+
Output class for :class:`.DriftCVM`, :class:`.DriftKS`, and :class:`.DriftUncertainty` drift detectors.
|
68
63
|
|
69
64
|
Attributes
|
70
65
|
----------
|
71
|
-
|
66
|
+
drifted : bool
|
72
67
|
:term:`Drift` prediction for the images
|
73
68
|
threshold : float
|
74
69
|
Threshold after multivariate correction if needed
|
70
|
+
p_val : float
|
71
|
+
Instance-level p-value
|
72
|
+
distance : float
|
73
|
+
Instance-level distance
|
75
74
|
feature_drift : NDArray
|
76
75
|
Feature-level array of images detected to have drifted
|
77
76
|
feature_threshold : float
|
@@ -82,7 +81,7 @@ class DriftOutput(DriftBaseOutput):
|
|
82
81
|
Feature-level distances
|
83
82
|
"""
|
84
83
|
|
85
|
-
#
|
84
|
+
# drifted: bool
|
86
85
|
# threshold: float
|
87
86
|
# p_val: float
|
88
87
|
# distance: float
|
@@ -196,7 +195,7 @@ class BaseDrift:
|
|
196
195
|
if correction not in ["bonferroni", "fdr"]:
|
197
196
|
raise ValueError("`correction` must be `bonferroni` or `fdr`.")
|
198
197
|
|
199
|
-
self._x_ref =
|
198
|
+
self._x_ref = x_ref
|
200
199
|
self.x_ref_preprocessed: bool = x_ref_preprocessed
|
201
200
|
|
202
201
|
# Other attributes
|
@@ -204,25 +203,25 @@ class BaseDrift:
|
|
204
203
|
self.update_x_ref = update_x_ref
|
205
204
|
self.preprocess_fn = preprocess_fn
|
206
205
|
self.correction = correction
|
207
|
-
self.n: int = len(
|
206
|
+
self.n: int = len(x_ref)
|
208
207
|
|
209
208
|
# Ref counter for preprocessed x
|
210
209
|
self._x_refcount = 0
|
211
210
|
|
212
211
|
@property
|
213
|
-
def x_ref(self) ->
|
212
|
+
def x_ref(self) -> ArrayLike:
|
214
213
|
"""
|
215
214
|
Retrieve the reference data, applying preprocessing if not already done.
|
216
215
|
|
217
216
|
Returns
|
218
217
|
-------
|
219
|
-
|
218
|
+
ArrayLike
|
220
219
|
The reference dataset (`x_ref`), preprocessed if needed.
|
221
220
|
"""
|
222
221
|
if not self.x_ref_preprocessed:
|
223
222
|
self.x_ref_preprocessed = True
|
224
223
|
if self.preprocess_fn is not None:
|
225
|
-
self._x_ref =
|
224
|
+
self._x_ref = self.preprocess_fn(self._x_ref)
|
226
225
|
|
227
226
|
return self._x_ref
|
228
227
|
|
@@ -323,32 +322,44 @@ class BaseDriftUnivariate(BaseDrift):
|
|
323
322
|
# lazy process n_features as needed
|
324
323
|
if not isinstance(self._n_features, int):
|
325
324
|
# compute number of features for the univariate tests
|
326
|
-
|
327
|
-
|
328
|
-
self.
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
325
|
+
x_ref = (
|
326
|
+
self.x_ref
|
327
|
+
if self.preprocess_fn is None or self.x_ref_preprocessed
|
328
|
+
else self.preprocess_fn(self._x_ref[0:1])
|
329
|
+
)
|
330
|
+
# infer features from preprocessed reference data
|
331
|
+
shape = x_ref.shape if isinstance(x_ref, Array) else as_numpy(x_ref).shape
|
332
|
+
self._n_features = int(math.prod(shape[1:])) # Multiplies all channel sizes after first
|
333
333
|
|
334
334
|
return self._n_features
|
335
335
|
|
336
336
|
@preprocess_x
|
337
|
-
@abstractmethod
|
338
337
|
def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
|
339
338
|
"""
|
340
|
-
|
339
|
+
Calculates p-values and test statistics per feature.
|
341
340
|
|
342
341
|
Parameters
|
343
342
|
----------
|
344
343
|
x : ArrayLike
|
345
|
-
|
344
|
+
Batch of instances
|
346
345
|
|
347
346
|
Returns
|
348
347
|
-------
|
349
348
|
tuple[NDArray, NDArray]
|
350
|
-
|
349
|
+
Feature level p-values and test statistics
|
351
350
|
"""
|
351
|
+
x_np = to_numpy(x)
|
352
|
+
x_np = x_np.reshape(x_np.shape[0], -1)
|
353
|
+
x_ref_np = as_numpy(self.x_ref)
|
354
|
+
x_ref_np = x_ref_np.reshape(x_ref_np.shape[0], -1)
|
355
|
+
p_val = np.zeros(self.n_features, dtype=np.float32)
|
356
|
+
dist = np.zeros_like(p_val)
|
357
|
+
for f in range(self.n_features):
|
358
|
+
dist[f], p_val[f] = self._score_fn(x_ref_np[:, f], x_np[:, f])
|
359
|
+
return p_val, dist
|
360
|
+
|
361
|
+
@abstractmethod
|
362
|
+
def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]: ...
|
352
363
|
|
353
364
|
def _apply_correction(self, p_vals: NDArray) -> tuple[bool, float]:
|
354
365
|
"""
|