dataeval 0.74.2__tar.gz → 0.76.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.74.2 → dataeval-0.76.0}/LICENSE.txt +2 -2
- dataeval-0.76.0/PKG-INFO +137 -0
- dataeval-0.76.0/README.md +98 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/pyproject.toml +16 -15
- dataeval-0.76.0/src/dataeval/__init__.py +40 -0
- dataeval-0.76.0/src/dataeval/detectors/drift/__init__.py +22 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/base.py +3 -3
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/cvm.py +1 -1
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/ks.py +3 -2
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/mmd.py +9 -7
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/torch.py +12 -12
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/uncertainty.py +5 -4
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/updates.py +1 -1
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/linters/clusterer.py +5 -9
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/linters/duplicates.py +10 -14
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/linters/outliers.py +100 -5
- dataeval-0.76.0/src/dataeval/detectors/ood/__init__.py +8 -0
- dataeval-0.74.2/src/dataeval/detectors/ood/ae_torch.py → dataeval-0.76.0/src/dataeval/detectors/ood/ae.py +6 -4
- dataeval-0.74.2/src/dataeval/detectors/ood/base_torch.py → dataeval-0.76.0/src/dataeval/detectors/ood/base.py +7 -22
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_least_likely.py +3 -3
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval-0.74.2/src/dataeval/detectors/ood/base.py → dataeval-0.76.0/src/dataeval/detectors/ood/mixin.py +11 -72
- dataeval-0.76.0/src/dataeval/detectors/ood/output.py +63 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/interop.py +7 -6
- dataeval-0.74.2/src/dataeval/logging.py → dataeval-0.76.0/src/dataeval/log.py +2 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/__init__.py +3 -3
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/bias/__init__.py +10 -13
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/bias/balance.py +13 -11
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/bias/coverage.py +53 -5
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/bias/diversity.py +56 -24
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/bias/parity.py +20 -17
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/estimators/ber.py +7 -4
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/estimators/divergence.py +4 -4
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/estimators/uap.py +4 -4
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/__init__.py +19 -19
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/base.py +28 -12
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/boxratiostats.py +13 -14
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/datasetstats.py +49 -20
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/dimensionstats.py +8 -8
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/hashstats.py +14 -10
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/labelstats.py +94 -11
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/pixelstats.py +11 -14
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/stats/visualstats.py +10 -13
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/output.py +23 -14
- dataeval-0.76.0/src/dataeval/utils/__init__.py +9 -0
- dataeval-0.76.0/src/dataeval/utils/dataset/__init__.py +7 -0
- {dataeval-0.74.2/src/dataeval/utils/torch → dataeval-0.76.0/src/dataeval/utils/dataset}/datasets.py +2 -0
- dataeval-0.76.0/src/dataeval/utils/dataset/read.py +63 -0
- dataeval-0.74.2/src/dataeval/utils/split_dataset.py → dataeval-0.76.0/src/dataeval/utils/dataset/split.py +38 -30
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/image.py +2 -2
- dataeval-0.76.0/src/dataeval/utils/metadata.py +581 -0
- dataeval-0.74.2/src/dataeval/metrics/bias/metadata_utils.py → dataeval-0.76.0/src/dataeval/utils/plot.py +91 -71
- dataeval-0.76.0/src/dataeval/utils/torch/__init__.py +10 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/torch/gmm.py +29 -6
- dataeval-0.74.2/src/dataeval/utils/torch/utils.py → dataeval-0.76.0/src/dataeval/utils/torch/internal.py +82 -58
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/torch/models.py +10 -8
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/torch/trainer.py +6 -85
- dataeval-0.76.0/src/dataeval/workflows/__init__.py +7 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/workflows/sufficiency.py +18 -8
- dataeval-0.74.2/PKG-INFO +0 -120
- dataeval-0.74.2/README.md +0 -81
- dataeval-0.74.2/src/dataeval/__init__.py +0 -36
- dataeval-0.74.2/src/dataeval/detectors/drift/__init__.py +0 -20
- dataeval-0.74.2/src/dataeval/detectors/ood/__init__.py +0 -15
- dataeval-0.74.2/src/dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval-0.74.2/src/dataeval/utils/__init__.py +0 -18
- dataeval-0.74.2/src/dataeval/utils/gmm.py +0 -26
- dataeval-0.74.2/src/dataeval/utils/metadata.py +0 -278
- dataeval-0.74.2/src/dataeval/utils/torch/__init__.py +0 -25
- dataeval-0.74.2/src/dataeval/workflows/__init__.py +0 -10
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/__init__.py +2 -2
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/linters/__init__.py +4 -4
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/linters/merged_stats.py +0 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/metrics/estimators/__init__.py +2 -2
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/py.typed +0 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/shared.py +0 -0
- {dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/utils/torch/blocks.py +0 -0
{dataeval-0.74.2 → dataeval-0.76.0}/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c)
+Copyright (c) 2025 ARiA

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.
dataeval-0.76.0/PKG-INFO ADDED
@@ -0,0 +1,137 @@
+Metadata-Version: 2.1
+Name: dataeval
+Version: 0.76.0
+Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
+Home-page: https://dataeval.ai/
+License: MIT
+Author: Andrew Weng
+Author-email: andrew.weng@ariacoustics.com
+Maintainer: ARiA
+Maintainer-email: dataeval@ariacoustics.com
+Requires-Python: >=3.9,<3.13
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Topic :: Scientific/Engineering
+Provides-Extra: all
+Requires-Dist: matplotlib ; extra == "all"
+Requires-Dist: numpy (>=1.24.2)
+Requires-Dist: pillow (>=10.3.0)
+Requires-Dist: requests
+Requires-Dist: scikit-learn (>=1.5.0)
+Requires-Dist: scipy (>=1.10)
+Requires-Dist: torch (>=2.2.0)
+Requires-Dist: torchvision (>=0.17.0)
+Requires-Dist: tqdm
+Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "4.0"
+Requires-Dist: xxhash (>=3.3)
+Project-URL: Documentation, https://dataeval.readthedocs.io/
+Project-URL: Repository, https://github.com/aria-ml/dataeval/
+Description-Content-Type: text/markdown
+
+# DataEval
+
+To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+
+## About DataEval
+
+<!-- start tagline -->
+
+DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+
+<!-- end tagline -->
+
+### Our mission
+
+<!-- start needs -->
+
+DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+
+<!-- end needs -->
+
+<!-- start JATIC interop -->
+DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
+DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+<!-- end JATIC interop -->
+
+## Getting Started
+
+**Python versions:** 3.9 - 3.12
+
+**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+
+Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+
+* [Installing with pip](#installing-with-pip)
+* [Installing with conda/mamba](#installing-with-conda)
+* [Installing from GitHub](#installing-from-github)
+
+### **Installing with pip**
+
+You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
+
+```bash
+pip install dataeval[all]
+```
+
+### **Installing with conda**
+
+DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
+are installed from the `pytorch` channel, the channel is specified in the below example.
+
+```bash
+micromamba create -f environment\environment.yaml -c pytorch
+```
+
+### **Installing from GitHub**
+
+To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+
+```bash
+sudo apt-get install git-lfs
+pip install poetry
+```
+
+Pull the source down and change to the DataEval project directory.
+
+```bash
+git clone https://github.com/aria-ml/dataeval.git
+cd dataeval
+```
+
+Install DataEval with optional dependencies for development.
+
+```bash
+poetry install --all-extras --with dev
+```
+
+Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+
+```bash
+poetry shell
+```
+
+## Contact Us
+
+If you have any questions, feel free to reach out to the people below:
+
+* **POC**: Scott Swan @scott.swan
+* **DPOC**: Andrew Weng @aweng
+
+## Acknowledgement
+
+<!-- start acknowledgement -->
+
+### CDAO Funding Acknowledgement
+
+This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+
+<!-- end acknowledgement -->
+
dataeval-0.76.0/README.md ADDED
@@ -0,0 +1,98 @@
+# DataEval
+
+To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+
+## About DataEval
+
+<!-- start tagline -->
+
+DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+
+<!-- end tagline -->
+
+### Our mission
+
+<!-- start needs -->
+
+DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+
+<!-- end needs -->
+
+<!-- start JATIC interop -->
+DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
+DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+<!-- end JATIC interop -->
+
+## Getting Started
+
+**Python versions:** 3.9 - 3.12
+
+**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+
+Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+
+* [Installing with pip](#installing-with-pip)
+* [Installing with conda/mamba](#installing-with-conda)
+* [Installing from GitHub](#installing-from-github)
+
+### **Installing with pip**
+
+You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
+
+```bash
+pip install dataeval[all]
+```
+
+### **Installing with conda**
+
+DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
+are installed from the `pytorch` channel, the channel is specified in the below example.
+
+```bash
+micromamba create -f environment\environment.yaml -c pytorch
+```
+
+### **Installing from GitHub**
+
+To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+
+```bash
+sudo apt-get install git-lfs
+pip install poetry
+```
+
+Pull the source down and change to the DataEval project directory.
+
+```bash
+git clone https://github.com/aria-ml/dataeval.git
+cd dataeval
+```
+
+Install DataEval with optional dependencies for development.
+
+```bash
+poetry install --all-extras --with dev
+```
+
+Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+
+```bash
+poetry shell
+```
+
+## Contact Us
+
+If you have any questions, feel free to reach out to the people below:
+
+* **POC**: Scott Swan @scott.swan
+* **DPOC**: Andrew Weng @aweng
+
+## Acknowledgement
+
+<!-- start acknowledgement -->
+
+### CDAO Funding Acknowledgement
+
+This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+
+<!-- end acknowledgement -->
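As a quick check of the install steps shown in the README above, a minimal post-install sketch follows; it assumes only the `__version__` attribute and the subpackages defined in `src/dataeval/__init__.py` later in this diff.

```python
# Hypothetical post-install sanity check: confirm the installed release and
# that the public subpackages import cleanly.
import dataeval
from dataeval import detectors, metrics, utils, workflows  # noqa: F401

print(dataeval.__version__)  # "0.76.0" for the release this diff describes
```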
{dataeval-0.74.2 → dataeval-0.76.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.
+version = "0.76.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -42,22 +42,22 @@ packages = [
 [tool.poetry.dependencies]
 # required
 python = ">=3.9,<3.13"
-numpy = {version = ">=1.24.
+numpy = {version = ">=1.24.2"}
 pillow = {version = ">=10.3.0"}
+requests = {version = "*"}
 scipy = {version = ">=1.10"}
 scikit-learn = {version = ">=1.5.0"}
+torch = {version = ">=2.2.0", source = "pytorch"}
+torchvision = {version = ">=0.17.0", source = "pytorch"}
 tqdm = {version = "*"}
-typing-extensions = {version = ">=4.12", python = "
+typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
 xxhash = {version = ">=3.3"}

 # optional
 matplotlib = {version = "*", optional = true}
-torch = {version = ">=2.2.0", source = "pytorch", optional = true}
-torchvision = {version = ">=0.17.0", source = "pytorch", optional = true}

 [tool.poetry.extras]
-
-all = ["matplotlib", "torch", "torchvision"]
+all = ["matplotlib"]

 [tool.poetry.group.dev]
 optional = true
@@ -65,9 +65,10 @@ optional = true
 [tool.poetry.group.dev.dependencies]
 nox = {version = "*", extras = ["uv"]}
 uv = {version = "*"}
-poetry = {version = "
+poetry = {version = "<2"}
 poetry-lock-groups-plugin = {version = "*"}
 poetry2conda = {version = "*"}
+numpy = {version = ">=2.0.2"}
 # lint
 ruff = {version = "*"}
 codespell = {version = "*", extras = ["toml"]}
@@ -76,26 +77,27 @@ pytest = {version = "*"}
 pytest-cov = {version = "*"}
 pytest-xdist = {version = "*"}
 coverage = {version = "*", extras = ["toml"]}
-torchmetrics = {version = ">=1.0.0", source = "pytorch"}
 # type
 pyright = {version = "*", extras = ["nodejs"]}
 # prototype
 maite = {version = "*"}
 pandas = {version = "*"}
 seaborn = {version = "*"}
-numpy = {version = ">=2.0.2"}
 # docs
 certifi = {version = ">=2024.07.04"}
 enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
 ipykernel = {version = ">=6.26.0"}
 ipywidgets = {version = ">=8.1.1"}
+jinja2 = {version = ">=3.1.5"}
 jupyter-client = {version = ">=8.6.0"}
 jupyter-cache = {version = "*"}
 myst-nb = {version = ">=1.0.0"}
-
+sphinx-immaterial = {version = "*"}
+sphinx-autoapi = {version = "*"}
 sphinx-design = {version = "*"}
 sphinx-tabs = {version = "*"}
 Sphinx = {version = ">=7.2.6"}
+torchmetrics = {version = ">=1.0.0", source = "pytorch"}
 markupsafe = {version = "<3.0.2", optional = true}

 [[tool.poetry.source]]
@@ -136,8 +138,7 @@ parallel = true
 [tool.coverage.report]
 exclude_also = [
 "raise NotImplementedError",
-"
-"if _IS_TORCHVISION_AVAILABLE",
+": \\.\\.\\."
 ]
 include = ["*/src/dataeval/*"]
 omit = [
@@ -155,7 +156,7 @@ exclude = [
 ".jupyter_cache",
 "*env*",
 "output",
-"
+"build",
 ".nox",
 ".tox",
 "prototype",
@@ -185,7 +186,7 @@ docstring-code-format = true
 docstring-code-line-length = "dynamic"

 [tool.codespell]
-skip = './*env*,./prototype,./output,./docs/
+skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
 ignore-words-list = ["Hart"]

 [build-system]
dataeval-0.76.0/src/dataeval/__init__.py ADDED
@@ -0,0 +1,40 @@
+"""
+DataEval provides a simple interface to characterize image data and its impact on model performance
+across classification and object-detection tasks. It also provides capabilities to select and curate
+datasets to test and train performant, robust, unbiased and reliable AI models and monitor for data
+shifts that impact performance of deployed models.
+"""
+
+from __future__ import annotations
+
+__all__ = ["detectors", "log", "metrics", "utils", "workflows"]
+__version__ = "0.76.0"
+
+import logging
+
+from dataeval import detectors, metrics, utils, workflows
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
+
+def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
+    """
+    Helper for quickly adding a StreamHandler to the logger. Useful for debugging.
+
+    Parameters
+    ----------
+    level : int, default logging.DEBUG(10)
+        Set the logging level for the logger.
+    handler : logging.Handler, optional
+        Sets the logging handler for the logger if provided, otherwise logger will be
+        provided with a StreamHandler.
+    """
+    import logging
+
+    logger = logging.getLogger(__name__)
+    if handler is None:
+        handler = logging.StreamHandler() if handler is None else handler
+    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+    logger.addHandler(handler)
+    logger.setLevel(level)
+    logger.debug(f"Added logging handler {handler} to logger: {__name__}")
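For reference, a minimal sketch of how the new `log()` helper might be called; it mirrors the signature shown in the added file above and assumes nothing beyond the standard-library `logging` module.

```python
import logging

import dataeval

# Attach a StreamHandler at INFO level; calling dataeval.log() with no
# arguments uses the DEBUG default shown in the function signature above.
dataeval.log(level=logging.INFO)
```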
dataeval-0.76.0/src/dataeval/detectors/drift/__init__.py ADDED
@@ -0,0 +1,22 @@
+"""
+:term:`Drift` detectors identify if the statistical properties of the data has changed.
+"""
+
+__all__ = [
+    "DriftCVM",
+    "DriftKS",
+    "DriftMMD",
+    "DriftMMDOutput",
+    "DriftOutput",
+    "DriftUncertainty",
+    "preprocess_drift",
+    "updates",
+]
+
+from dataeval.detectors.drift import updates
+from dataeval.detectors.drift.base import DriftOutput
+from dataeval.detectors.drift.cvm import DriftCVM
+from dataeval.detectors.drift.ks import DriftKS
+from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
+from dataeval.detectors.drift.torch import preprocess_drift
+from dataeval.detectors.drift.uncertainty import DriftUncertainty
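As a rough illustration of the surface re-exported here, a hypothetical usage sketch follows. The constructor and `predict` signatures are assumptions based on the univariate detector pattern visible elsewhere in this diff (reference data in, an output dataclass out); check the Read the Docs reference for the exact parameters.

```python
import numpy as np

from dataeval.detectors.drift import DriftKS

# Hypothetical data standing in for flattened embeddings of a reference
# split and a new operational split.
rng = np.random.default_rng(0)
x_ref = rng.normal(size=(256, 32))
x_new = rng.normal(loc=0.25, size=(128, 32))

detector = DriftKS(x_ref)         # assumed: reference data as the first argument
result = detector.predict(x_new)  # assumed: returns a DriftOutput dataclass
print(result)
```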
{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/base.py
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)

 from __future__ import annotations

-__all__ = [
+__all__ = []

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -45,7 +45,7 @@ class UpdateStrategy(ABC):
 @dataclass(frozen=True)
 class DriftBaseOutput(Output):
     """
-    Base output class for Drift
+    Base output class for Drift Detector classes

     Attributes
     ----------
@@ -64,7 +64,7 @@ class DriftBaseOutput(Output):
 @dataclass(frozen=True)
 class DriftOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors
+    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.

     Attributes
     ----------
{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/cvm.py
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)

 from __future__ import annotations

-__all__ = [
+__all__ = []

 from typing import Callable, Literal

{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/ks.py
@@ -22,7 +22,8 @@ from dataeval.interop import to_numpy

 class DriftKS(BaseDriftUnivariate):
     """
-    :term:`Drift` detector employing the Kolmogorov-Smirnov (KS)
+    :term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
+    distribution<Kolmogorov-Smirnov (K-S) test>` test.

     The KS test detects changes in the maximum distance between two data
     distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
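As context for the `DriftKS` docstring above: the per-feature statistic behind it is the standard two-sample Kolmogorov-Smirnov distance between the empirical CDFs of the reference and test data,

$$D_j = \sup_x \left| F^{\mathrm{ref}}_j(x) - F^{\mathrm{test}}_j(x) \right|,$$

with the resulting p-values then corrected across features by Bonferroni or FDR, as the surrounding docstring lines describe.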
{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/mmd.py
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)

 from __future__ import annotations

-__all__ = [
+__all__ = []

 from dataclasses import dataclass
 from typing import Callable
@@ -17,15 +17,16 @@ import torch
 from numpy.typing import ArrayLike

 from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
-from dataeval.detectors.drift.torch import
+from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix
 from dataeval.interop import as_numpy
 from dataeval.output import set_metadata
+from dataeval.utils.torch.internal import get_device


 @dataclass(frozen=True)
 class DriftMMDOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
+    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.

     Attributes
     ----------
@@ -50,7 +51,8 @@ class DriftMMDOutput(DriftBaseOutput):

 class DriftMMD(BaseDrift):
     """
-    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm
+    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
+    using a permutation test.

     Parameters
     ----------
@@ -109,7 +111,7 @@ class DriftMMD(BaseDrift):

         # initialize kernel
         sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
-        self._kernel =
+        self._kernel = GaussianRBF(sigma_tensor).to(self.device)

         # compute kernel matrix for the reference data
         if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
@@ -150,9 +152,9 @@ class DriftMMD(BaseDrift):
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
         kernel_mat = kernel_mat - torch.diag(kernel_mat.diag())  # zero diagonal
-        mmd2 =
+        mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
         mmd2_permuted = torch.Tensor(
-            [
+            [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
         )
         mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
         p_val = (mmd2 <= mmd2_permuted).float().mean()
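For readers following the score hunk above: the restored calls make the permutation test explicit. The observed squared MMD is compared against `n_permutations` recomputations on permuted kernel matrices, and the p-value is the fraction of permuted statistics at least as large as the observed one, exactly as in the context line `p_val = (mmd2 <= mmd2_permuted).float().mean()`:

$$\hat{p} = \frac{1}{K}\sum_{k=1}^{K} \mathbf{1}\!\left[\widehat{\mathrm{MMD}}^2_{\mathrm{obs}} \le \widehat{\mathrm{MMD}}^2_{(k)}\right], \qquad K = \texttt{n\_permutations}.$$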
{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/torch.py
@@ -17,10 +17,10 @@ import torch
 import torch.nn as nn
 from numpy.typing import NDArray

-from dataeval.utils.torch.
+from dataeval.utils.torch.internal import get_device, predict_batch


-def
+def mmd2_from_kernel_matrix(
     kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
 ) -> torch.Tensor:
     """
@@ -127,7 +127,7 @@ def _squared_pairwise_distance(

 def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
     """
-    Bandwidth estimation using the median heuristic
+    Bandwidth estimation using the median heuristic `Gretton2012`

     Parameters
     ----------
@@ -151,7 +151,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
     return sigma


-class
+class GaussianRBF(nn.Module):
     """
     Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).

@@ -179,18 +179,18 @@ class _GaussianRBF(nn.Module):
     ) -> None:
         super().__init__()
         init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
-        self.config = {
+        self.config: dict[str, Any] = {
             "sigma": sigma,
             "trainable": trainable,
             "init_sigma_fn": init_sigma_fn,
         }
         if sigma is None:
-            self.log_sigma = nn.Parameter(torch.empty(1), requires_grad=trainable)
-            self.init_required = True
+            self.log_sigma: nn.Parameter = nn.Parameter(torch.empty(1), requires_grad=trainable)
+            self.init_required: bool = True
         else:
             sigma = sigma.reshape(-1)  # [Ns,]
-            self.log_sigma = nn.Parameter(sigma.log(), requires_grad=trainable)
-            self.init_required = False
+            self.log_sigma: nn.Parameter = nn.Parameter(sigma.log(), requires_grad=trainable)
+            self.init_required: bool = False
         self.init_sigma_fn = init_sigma_fn
         self.trainable = trainable

@@ -200,8 +200,8 @@ class _GaussianRBF(nn.Module):

     def forward(
         self,
-        x: np.ndarray | torch.Tensor,
-        y: np.ndarray | torch.Tensor,
+        x: np.ndarray[Any, Any] | torch.Tensor,
+        y: np.ndarray[Any, Any] | torch.Tensor,
         infer_sigma: bool = False,
     ) -> torch.Tensor:
         x, y = torch.as_tensor(x), torch.as_tensor(y)
@@ -213,7 +213,7 @@ class _GaussianRBF(nn.Module):
             sigma = self.init_sigma_fn(x, y, dist)
             with torch.no_grad():
                 self.log_sigma.copy_(sigma.log().clone())
-            self.init_required = False
+            self.init_required: bool = False

         gamma = 1.0 / (2.0 * self.sigma**2)  # [Ns,]
         # TODO: do matrix multiplication after all?
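The `GaussianRBF` docstring shown above carries an unbalanced parenthesis; written out, the kernel the class implements is

$$k(x, y) = \exp\!\left(-\frac{\lVert x - y \rVert^{2}}{2\sigma^{2}}\right),$$

with the bandwidth σ either supplied, learned when `trainable=True`, or initialised from the data by `sigma_median`, the `Gretton2012` median-heuristic estimator referenced in the docstring change.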
{dataeval-0.74.2 → dataeval-0.76.0}/src/dataeval/detectors/drift/uncertainty.py
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)

 from __future__ import annotations

-__all__ = [
+__all__ = []

 from functools import partial
 from typing import Callable, Literal
@@ -20,7 +20,8 @@ from scipy.stats import entropy

 from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
 from dataeval.detectors.drift.ks import DriftKS
-from dataeval.detectors.drift.torch import
+from dataeval.detectors.drift.torch import preprocess_drift
+from dataeval.utils.torch.internal import get_device


 def classifier_uncertainty(
@@ -65,8 +66,8 @@ def classifier_uncertainty(

 class DriftUncertainty:
     """
-    Test for a change in the number of instances falling into regions on which
-
+    Test for a change in the number of instances falling into regions on which \
+    the model is uncertain.

     Performs a K-S test on prediction entropies.

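For the `DriftUncertainty` docstring above: the quantity handed to the K-S test is the Shannon entropy of each predicted class-probability vector (via the `scipy.stats.entropy` import at the top of this file),

$$H(\hat{p}) = -\sum_{c} \hat{p}_{c} \log \hat{p}_{c},$$

so drift is flagged when the distribution of prediction entropies on new data departs from the reference distribution.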