dataeval 0.75.0__tar.gz → 0.76.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.75.0 → dataeval-0.76.1}/LICENSE.txt +2 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/PKG-INFO +57 -30
- dataeval-0.76.1/README.md +123 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/pyproject.toml +10 -7
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/__init__.py +3 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/base.py +2 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/ks.py +2 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/mmd.py +3 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/uncertainty.py +2 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/updates.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/clusterer.py +3 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/duplicates.py +4 -4
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/outliers.py +96 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/__init__.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/base.py +1 -17
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/output.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/interop.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/__init__.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/__init__.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/balance.py +3 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/coverage.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/diversity.py +14 -10
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/parity.py +7 -9
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/ber.py +4 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/divergence.py +3 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/uap.py +3 -3
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/__init__.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/base.py +24 -8
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/boxratiostats.py +5 -5
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/datasetstats.py +39 -6
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/dimensionstats.py +4 -4
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/hashstats.py +2 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/labelstats.py +89 -6
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/pixelstats.py +7 -5
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/visualstats.py +6 -4
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/output.py +23 -14
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/__init__.py +2 -2
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/read.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/split.py +1 -1
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/metadata.py +255 -110
- dataeval-0.76.1/src/dataeval/utils/plot.py +249 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/workflows/sufficiency.py +2 -2
- dataeval-0.75.0/README.md +0 -97
- dataeval-0.75.0/src/dataeval/utils/plot.py +0 -126
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/cvm.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/torch.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/merged_stats.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/mixin.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/log.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/py.typed +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/datasets.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/image.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/shared.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/blocks.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/gmm.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/internal.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/workflows/__init__.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
MIT License
|
2
2
|
|
3
|
-
Copyright (c)
|
3
|
+
Copyright (c) 2025 ARiA
|
4
4
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
6
|
of this software and associated documentation files (the "Software"), to deal
|
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
18
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
19
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
20
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
-
SOFTWARE.
|
21
|
+
SOFTWARE.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: dataeval
|
3
|
-
Version: 0.75.0
|
3
|
+
Version: 0.76.1
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
5
5
|
Home-page: https://dataeval.ai/
|
6
6
|
License: MIT
|
@@ -21,8 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
|
|
21
21
|
Classifier: Programming Language :: Python :: 3 :: Only
|
22
22
|
Classifier: Topic :: Scientific/Engineering
|
23
23
|
Provides-Extra: all
|
24
|
-
Requires-Dist: matplotlib ; extra == "all"
|
25
|
-
Requires-Dist: numpy (>=1.24.
|
24
|
+
Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
|
25
|
+
Requires-Dist: numpy (>=1.24.2)
|
26
|
+
Requires-Dist: pandas (>=2.0) ; extra == "all"
|
26
27
|
Requires-Dist: pillow (>=10.3.0)
|
27
28
|
Requires-Dist: requests
|
28
29
|
Requires-Dist: scikit-learn (>=1.5.0)
|
@@ -38,13 +39,17 @@ Description-Content-Type: text/markdown
|
|
38
39
|
|
39
40
|
# DataEval
|
40
41
|
|
41
|
-
To view our extensive collection of tutorials, how-to's, explanation guides,
|
42
|
+
To view our extensive collection of tutorials, how-to's, explanation guides,
|
43
|
+
and reference material, please visit our documentation on
|
44
|
+
**[Read the Docs](https://dataeval.readthedocs.io/)**
|
42
45
|
|
43
46
|
## About DataEval
|
44
47
|
|
45
48
|
<!-- start tagline -->
|
46
49
|
|
47
|
-
DataEval curates datasets to train and test performant, robust, unbiased and
|
50
|
+
DataEval curates datasets to train and test performant, robust, unbiased and
|
51
|
+
reliable AI models and monitors for data shifts that impact performance of
|
52
|
+
deployed models.
|
48
53
|
|
49
54
|
<!-- end tagline -->
|
50
55
|
|
@@ -52,65 +57,86 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
|
|
52
57
|
|
53
58
|
<!-- start needs -->
|
54
59
|
|
55
|
-
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
60
|
+
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
61
|
+
engineer. Throughout all stages of the machine learning lifecycle, DataEval
|
62
|
+
supports model development, data analysis, and monitoring with state-of-the-art
|
63
|
+
algorithms to help you solve difficult problems. With a focus on computer
|
64
|
+
vision tasks, DataEval provides simple, but effective metrics for performance
|
65
|
+
estimation, bias detection, and dataset linting.
|
56
66
|
|
57
67
|
<!-- end needs -->
|
58
68
|
|
59
69
|
<!-- start JATIC interop -->
|
60
|
-
DataEval is easy to install, supports a wide range of Python versions, and is
|
61
|
-
|
70
|
+
DataEval is easy to install, supports a wide range of Python versions, and is
|
71
|
+
compatible with many of the most popular packages in the scientific and T&E
|
72
|
+
communities.
|
73
|
+
|
74
|
+
DataEval also has native interoperability between JATIC's suite of tools when
|
75
|
+
using MAITE-compliant datasets and models.
|
62
76
|
<!-- end JATIC interop -->
|
63
77
|
|
64
78
|
## Getting Started
|
65
79
|
|
66
80
|
**Python versions:** 3.9 - 3.12
|
67
81
|
|
68
|
-
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
82
|
+
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
83
|
+
*Gradient*
|
69
84
|
|
70
|
-
Choose your preferred method of installation below or follow our
|
85
|
+
Choose your preferred method of installation below or follow our
|
86
|
+
[installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
|
71
87
|
|
72
88
|
* [Installing with pip](#installing-with-pip)
|
73
89
|
* [Installing with conda/mamba](#installing-with-conda)
|
74
90
|
* [Installing from GitHub](#installing-from-github)
|
75
91
|
|
76
92
|
### **Installing with pip**
|
77
|
-
You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
|
78
93
|
|
79
|
-
|
94
|
+
You can install DataEval directly from pypi.org using the following command.
|
95
|
+
The optional dependencies of DataEval are `all`.
|
96
|
+
|
97
|
+
```bash
|
80
98
|
pip install dataeval[all]
|
81
99
|
```
|
82
100
|
|
83
101
|
### **Installing with conda**
|
84
102
|
|
85
|
-
DataEval can be installed in a Conda/Mamba environment using the provided
|
86
|
-
are installed from the `pytorch`
|
103
|
+
DataEval can be installed in a Conda/Mamba environment using the provided
|
104
|
+
`environment.yaml` file. As some dependencies are installed from the `pytorch`
|
105
|
+
channel, the channel is specified in the below example.
|
87
106
|
|
88
|
-
```
|
107
|
+
```bash
|
89
108
|
micromamba create -f environment/environment.yaml -c pytorch
|
90
109
|
```
|
91
110
|
|
92
111
|
### **Installing from GitHub**
|
93
112
|
|
94
|
-
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
113
|
+
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
114
|
+
download larger, binary source files and `poetry` for project dependency
|
115
|
+
management.
|
95
116
|
|
96
|
-
```
|
117
|
+
```bash
|
97
118
|
sudo apt-get install git-lfs
|
98
119
|
pip install poetry
|
99
120
|
```
|
100
121
|
|
101
122
|
Pull the source down and change to the DataEval project directory.
|
102
|
-
|
123
|
+
|
124
|
+
```bash
|
103
125
|
git clone https://github.com/aria-ml/dataeval.git
|
104
126
|
cd dataeval
|
105
127
|
```
|
106
128
|
|
107
129
|
Install DataEval with optional dependencies for development.
|
108
|
-
|
130
|
+
|
131
|
+
```bash
|
109
132
|
poetry install --all-extras --with dev
|
110
133
|
```
|
111
134
|
|
112
|
-
Now that DataEval is installed, you can run commands in the poetry virtual
|
113
|
-
|
135
|
+
Now that DataEval is installed, you can run commands in the poetry virtual
|
136
|
+
environment by prefixing shell commands with `poetry run`, or activate the
|
137
|
+
virtual environment directly in the shell.
|
138
|
+
|
139
|
+
```bash
|
114
140
|
poetry shell
|
115
141
|
```
|
116
142
|
|
@@ -118,19 +144,20 @@ poetry shell
|
|
118
144
|
|
119
145
|
If you have any questions, feel free to reach out to the people below:
|
120
146
|
|
121
|
-
|
122
|
-
|
147
|
+
* **POC**: Scott Swan @scott.swan
|
148
|
+
* **DPOC**: Andrew Weng @aweng
|
123
149
|
|
124
150
|
## Acknowledgement
|
125
151
|
|
126
|
-
<!-- start
|
127
|
-
|
128
|
-
### Alibi-Detect
|
129
|
-
This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
|
130
|
-
Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
|
152
|
+
<!-- start acknowledgement -->
|
131
153
|
|
132
154
|
### CDAO Funding Acknowledgement
|
133
|
-
This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
|
134
155
|
|
135
|
-
|
156
|
+
This material is based upon work supported by the Chief Digital and Artificial
|
157
|
+
Intelligence Office under Contract No. W519TC-23-9-2033. The views and
|
158
|
+
conclusions contained herein are those of the author(s) and should not be
|
159
|
+
interpreted as necessarily representing the official policies or endorsements,
|
160
|
+
either expressed or implied, of the U.S. Government.
|
161
|
+
|
162
|
+
<!-- end acknowledgement -->
|
136
163
|
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# DataEval
|
2
|
+
|
3
|
+
To view our extensive collection of tutorials, how-to's, explanation guides,
|
4
|
+
and reference material, please visit our documentation on
|
5
|
+
**[Read the Docs](https://dataeval.readthedocs.io/)**
|
6
|
+
|
7
|
+
## About DataEval
|
8
|
+
|
9
|
+
<!-- start tagline -->
|
10
|
+
|
11
|
+
DataEval curates datasets to train and test performant, robust, unbiased and
|
12
|
+
reliable AI models and monitors for data shifts that impact performance of
|
13
|
+
deployed models.
|
14
|
+
|
15
|
+
<!-- end tagline -->
|
16
|
+
|
17
|
+
### Our mission
|
18
|
+
|
19
|
+
<!-- start needs -->
|
20
|
+
|
21
|
+
DataEval is an effective, powerful, and reliable set of tools for any T&E
|
22
|
+
engineer. Throughout all stages of the machine learning lifecycle, DataEval
|
23
|
+
supports model development, data analysis, and monitoring with state-of-the-art
|
24
|
+
algorithms to help you solve difficult problems. With a focus on computer
|
25
|
+
vision tasks, DataEval provides simple, but effective metrics for performance
|
26
|
+
estimation, bias detection, and dataset linting.
|
27
|
+
|
28
|
+
<!-- end needs -->
|
29
|
+
|
30
|
+
<!-- start JATIC interop -->
|
31
|
+
DataEval is easy to install, supports a wide range of Python versions, and is
|
32
|
+
compatible with many of the most popular packages in the scientific and T&E
|
33
|
+
communities.
|
34
|
+
|
35
|
+
DataEval also has native interoperability between JATIC's suite of tools when
|
36
|
+
using MAITE-compliant datasets and models.
|
37
|
+
<!-- end JATIC interop -->
|
38
|
+
|
39
|
+
## Getting Started
|
40
|
+
|
41
|
+
**Python versions:** 3.9 - 3.12
|
42
|
+
|
43
|
+
**Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
|
44
|
+
*Gradient*
|
45
|
+
|
46
|
+
Choose your preferred method of installation below or follow our
|
47
|
+
[installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
|
48
|
+
|
49
|
+
* [Installing with pip](#installing-with-pip)
|
50
|
+
* [Installing with conda/mamba](#installing-with-conda)
|
51
|
+
* [Installing from GitHub](#installing-from-github)
|
52
|
+
|
53
|
+
### **Installing with pip**
|
54
|
+
|
55
|
+
You can install DataEval directly from pypi.org using the following command.
|
56
|
+
The optional dependencies of DataEval are `all`.
|
57
|
+
|
58
|
+
```bash
|
59
|
+
pip install dataeval[all]
|
60
|
+
```
|
61
|
+
|
62
|
+
### **Installing with conda**
|
63
|
+
|
64
|
+
DataEval can be installed in a Conda/Mamba environment using the provided
|
65
|
+
`environment.yaml` file. As some dependencies are installed from the `pytorch`
|
66
|
+
channel, the channel is specified in the below example.
|
67
|
+
|
68
|
+
```bash
|
69
|
+
micromamba create -f environment/environment.yaml -c pytorch
|
70
|
+
```
|
71
|
+
|
72
|
+
### **Installing from GitHub**
|
73
|
+
|
74
|
+
To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
|
75
|
+
download larger, binary source files and `poetry` for project dependency
|
76
|
+
management.
|
77
|
+
|
78
|
+
```bash
|
79
|
+
sudo apt-get install git-lfs
|
80
|
+
pip install poetry
|
81
|
+
```
|
82
|
+
|
83
|
+
Pull the source down and change to the DataEval project directory.
|
84
|
+
|
85
|
+
```bash
|
86
|
+
git clone https://github.com/aria-ml/dataeval.git
|
87
|
+
cd dataeval
|
88
|
+
```
|
89
|
+
|
90
|
+
Install DataEval with optional dependencies for development.
|
91
|
+
|
92
|
+
```bash
|
93
|
+
poetry install --all-extras --with dev
|
94
|
+
```
|
95
|
+
|
96
|
+
Now that DataEval is installed, you can run commands in the poetry virtual
|
97
|
+
environment by prefixing shell commands with `poetry run`, or activate the
|
98
|
+
virtual environment directly in the shell.
|
99
|
+
|
100
|
+
```bash
|
101
|
+
poetry shell
|
102
|
+
```
|
103
|
+
|
104
|
+
## Contact Us
|
105
|
+
|
106
|
+
If you have any questions, feel free to reach out to the people below:
|
107
|
+
|
108
|
+
* **POC**: Scott Swan @scott.swan
|
109
|
+
* **DPOC**: Andrew Weng @aweng
|
110
|
+
|
111
|
+
## Acknowledgement
|
112
|
+
|
113
|
+
<!-- start acknowledgement -->
|
114
|
+
|
115
|
+
### CDAO Funding Acknowledgement
|
116
|
+
|
117
|
+
This material is based upon work supported by the Chief Digital and Artificial
|
118
|
+
Intelligence Office under Contract No. W519TC-23-9-2033. The views and
|
119
|
+
conclusions contained herein are those of the author(s) and should not be
|
120
|
+
interpreted as necessarily representing the official policies or endorsements,
|
121
|
+
either expressed or implied, of the U.S. Government.
|
122
|
+
|
123
|
+
<!-- end acknowledgement -->
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "dataeval"
|
3
|
-
version = "0.75.0" # dynamic
|
3
|
+
version = "0.76.1" # dynamic
|
4
4
|
description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
|
5
5
|
license = "MIT"
|
6
6
|
readme = "README.md"
|
@@ -42,7 +42,7 @@ packages = [
|
|
42
42
|
[tool.poetry.dependencies]
|
43
43
|
# required
|
44
44
|
python = ">=3.9,<3.13"
|
45
|
-
numpy = {version = ">=1.24.
|
45
|
+
numpy = {version = ">=1.24.2"}
|
46
46
|
pillow = {version = ">=10.3.0"}
|
47
47
|
requests = {version = "*"}
|
48
48
|
scipy = {version = ">=1.10"}
|
@@ -54,10 +54,11 @@ typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
|
|
54
54
|
xxhash = {version = ">=3.3"}
|
55
55
|
|
56
56
|
# optional
|
57
|
-
matplotlib = {version = "
|
57
|
+
matplotlib = {version = ">=3.7.1", optional = true}
|
58
|
+
pandas = {version = ">=2.0", optional = true}
|
58
59
|
|
59
60
|
[tool.poetry.extras]
|
60
|
-
all = ["matplotlib"]
|
61
|
+
all = ["matplotlib", "pandas"]
|
61
62
|
|
62
63
|
[tool.poetry.group.dev]
|
63
64
|
optional = true
|
@@ -81,19 +82,20 @@ coverage = {version = "*", extras = ["toml"]}
|
|
81
82
|
pyright = {version = "*", extras = ["nodejs"]}
|
82
83
|
# prototype
|
83
84
|
maite = {version = "*"}
|
84
|
-
pandas = {version = "*"}
|
85
85
|
seaborn = {version = "*"}
|
86
86
|
# docs
|
87
87
|
certifi = {version = ">=2024.07.04"}
|
88
88
|
enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
|
89
89
|
ipykernel = {version = ">=6.26.0"}
|
90
90
|
ipywidgets = {version = ">=8.1.1"}
|
91
|
+
jinja2 = {version = ">=3.1.5"}
|
91
92
|
jupyter-client = {version = ">=8.6.0"}
|
92
93
|
jupyter-cache = {version = "*"}
|
93
94
|
myst-nb = {version = ">=1.0.0"}
|
94
|
-
pydata-sphinx-theme = {version = ">=0.15.4"}
|
95
95
|
sphinx-autoapi = {version = "*"}
|
96
96
|
sphinx-design = {version = "*"}
|
97
|
+
sphinx-immaterial = {version = "*"}
|
98
|
+
sphinx-new-tab-link = {version = "*"}
|
97
99
|
sphinx-tabs = {version = "*"}
|
98
100
|
Sphinx = {version = ">=7.2.6"}
|
99
101
|
torchmetrics = {version = ">=1.0.0", source = "pytorch"}
|
@@ -137,6 +139,7 @@ parallel = true
|
|
137
139
|
[tool.coverage.report]
|
138
140
|
exclude_also = [
|
139
141
|
"raise NotImplementedError",
|
142
|
+
": \\.\\.\\."
|
140
143
|
]
|
141
144
|
include = ["*/src/dataeval/*"]
|
142
145
|
omit = [
|
@@ -184,7 +187,7 @@ docstring-code-format = true
|
|
184
187
|
docstring-code-line-length = "dynamic"
|
185
188
|
|
186
189
|
[tool.codespell]
|
187
|
-
skip = './*env*,./prototype,./output,./docs/build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
|
190
|
+
skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
|
188
191
|
ignore-words-list = ["Hart"]
|
189
192
|
|
190
193
|
[build-system]
|
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
|
|
8
8
|
from __future__ import annotations
|
9
9
|
|
10
10
|
__all__ = ["detectors", "log", "metrics", "utils", "workflows"]
|
11
|
-
__version__ = "0.75.0"
|
11
|
+
__version__ = "0.76.1"
|
12
12
|
|
13
13
|
import logging
|
14
14
|
|
@@ -24,10 +24,10 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
|
|
24
24
|
Parameters
|
25
25
|
----------
|
26
26
|
level : int, default logging.DEBUG(10)
|
27
|
-
Set the logging level for the logger
|
27
|
+
Set the logging level for the logger.
|
28
28
|
handler : logging.Handler, optional
|
29
29
|
Sets the logging handler for the logger if provided, otherwise logger will be
|
30
|
-
provided with a StreamHandler
|
30
|
+
provided with a StreamHandler.
|
31
31
|
"""
|
32
32
|
import logging
|
33
33
|
|
@@ -45,7 +45,7 @@ class UpdateStrategy(ABC):
|
|
45
45
|
@dataclass(frozen=True)
|
46
46
|
class DriftBaseOutput(Output):
|
47
47
|
"""
|
48
|
-
Base output class for Drift
|
48
|
+
Base output class for Drift Detector classes
|
49
49
|
|
50
50
|
Attributes
|
51
51
|
----------
|
@@ -64,7 +64,7 @@ class DriftBaseOutput(Output):
|
|
64
64
|
@dataclass(frozen=True)
|
65
65
|
class DriftOutput(DriftBaseOutput):
|
66
66
|
"""
|
67
|
-
Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors
|
67
|
+
Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
|
68
68
|
|
69
69
|
Attributes
|
70
70
|
----------
|
@@ -22,7 +22,8 @@ from dataeval.interop import to_numpy
|
|
22
22
|
|
23
23
|
class DriftKS(BaseDriftUnivariate):
|
24
24
|
"""
|
25
|
-
:term:`Drift` detector employing the Kolmogorov-Smirnov (KS)
|
25
|
+
:term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
|
26
|
+
distribution<Kolmogorov-Smirnov (K-S) test>` test.
|
26
27
|
|
27
28
|
The KS test detects changes in the maximum distance between two data
|
28
29
|
distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
|
@@ -26,7 +26,7 @@ from dataeval.utils.torch.internal import get_device
|
|
26
26
|
@dataclass(frozen=True)
|
27
27
|
class DriftMMDOutput(DriftBaseOutput):
|
28
28
|
"""
|
29
|
-
Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
|
29
|
+
Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.
|
30
30
|
|
31
31
|
Attributes
|
32
32
|
----------
|
@@ -51,7 +51,8 @@ class DriftMMDOutput(DriftBaseOutput):
|
|
51
51
|
|
52
52
|
class DriftMMD(BaseDrift):
|
53
53
|
"""
|
54
|
-
:term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm
|
54
|
+
:term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
|
55
|
+
using a permutation test.
|
55
56
|
|
56
57
|
Parameters
|
57
58
|
----------
|
@@ -66,8 +66,8 @@ def classifier_uncertainty(
|
|
66
66
|
|
67
67
|
class DriftUncertainty:
|
68
68
|
"""
|
69
|
-
Test for a change in the number of instances falling into regions on which
|
70
|
-
|
69
|
+
Test for a change in the number of instances falling into regions on which \
|
70
|
+
the model is uncertain.
|
71
71
|
|
72
72
|
Performs a K-S test on prediction entropies.
|
73
73
|
|
@@ -18,7 +18,7 @@ from dataeval.utils.shared import flatten
|
|
18
18
|
@dataclass(frozen=True)
|
19
19
|
class ClustererOutput(Output):
|
20
20
|
"""
|
21
|
-
Output class for :class:`Clusterer` lint detector
|
21
|
+
Output class for :class:`Clusterer` lint detector.
|
22
22
|
|
23
23
|
Attributes
|
24
24
|
----------
|
@@ -131,7 +131,8 @@ class _ClusterMergeEntry:
|
|
131
131
|
|
132
132
|
class Clusterer:
|
133
133
|
"""
|
134
|
-
Uses hierarchical clustering to flag dataset properties of interest like
|
134
|
+
Uses hierarchical clustering to flag dataset properties of interest like outliers \
|
135
|
+
and :term:`duplicates<Duplicates>`.
|
135
136
|
|
136
137
|
Parameters
|
137
138
|
----------
|
@@ -19,7 +19,7 @@ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateG
|
|
19
19
|
@dataclass(frozen=True)
|
20
20
|
class DuplicatesOutput(Generic[TIndexCollection], Output):
|
21
21
|
"""
|
22
|
-
Output class for :class:`Duplicates` lint detector
|
22
|
+
Output class for :class:`Duplicates` lint detector.
|
23
23
|
|
24
24
|
Attributes
|
25
25
|
----------
|
@@ -39,8 +39,8 @@ class DuplicatesOutput(Generic[TIndexCollection], Output):
|
|
39
39
|
|
40
40
|
class Duplicates:
|
41
41
|
"""
|
42
|
-
Finds the duplicate images in a dataset using xxhash for exact
|
43
|
-
and pchash for near duplicates
|
42
|
+
Finds the duplicate images in a dataset using xxhash for exact \
|
43
|
+
:term:`duplicates<Duplicates>` and pchash for near duplicates.
|
44
44
|
|
45
45
|
Attributes
|
46
46
|
----------
|
@@ -92,7 +92,7 @@ class Duplicates:
|
|
92
92
|
|
93
93
|
Parameters
|
94
94
|
----------
|
95
|
-
|
95
|
+
hashes : HashStatsOutput | Sequence[HashStatsOutput]
|
96
96
|
The output(s) from a hashstats analysis
|
97
97
|
|
98
98
|
Returns
|
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
+
import contextlib
|
5
6
|
from dataclasses import dataclass
|
6
7
|
from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
|
7
8
|
|
@@ -12,19 +13,78 @@ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_s
|
|
12
13
|
from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
|
13
14
|
from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
|
14
15
|
from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
|
16
|
+
from dataeval.metrics.stats.labelstats import LabelStatsOutput
|
15
17
|
from dataeval.metrics.stats.pixelstats import PixelStatsOutput
|
16
18
|
from dataeval.metrics.stats.visualstats import VisualStatsOutput
|
17
19
|
from dataeval.output import Output, set_metadata
|
18
20
|
|
21
|
+
with contextlib.suppress(ImportError):
|
22
|
+
import pandas as pd
|
23
|
+
|
24
|
+
|
19
25
|
IndexIssueMap = dict[int, dict[str, float]]
|
20
26
|
OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
|
21
27
|
TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
|
22
28
|
|
23
29
|
|
30
|
+
def _reorganize_by_class_and_metric(result, lstats):
    """
    Flip result from grouping by image to grouping by class and metric.

    Parameters
    ----------
    result : dict
        Mapping of image index to a mapping whose keys are the metric names
        flagged for that image. Only the keys are used here.
    lstats : LabelStatsOutput
        Label statistics; only ``image_indices_per_label`` is read, and it is
        expected to map each label to a container of image indices.

    Returns
    -------
    tuple[dict, dict]
        ``metrics`` maps each metric name to the list of image indices flagged
        for it. ``class_wise`` maps every label to a dict of per-metric counts;
        labels with no flagged images map to an empty dict.
    """
    indices_per_label = lstats.image_indices_per_label
    metrics = {}
    class_wise = {label: {} for label in indices_per_label}

    # Group metrics and calculate class-wise counts
    for img, group in result.items():
        # Label membership depends only on the image, so resolve it once per
        # image instead of rescanning every label for every flagged metric.
        labels_with_img = [label for label, images in indices_per_label.items() if img in images]
        for extreme in group:
            metrics.setdefault(extreme, []).append(img)
            for label in labels_with_img:
                counts = class_wise[label]
                counts[extreme] = counts.get(extreme, 0) + 1

    return metrics, class_wise
|
44
|
+
|
45
|
+
|
46
|
+
def _create_table(metrics, class_wise):
    """
    Create table for displaying the results.

    Parameters
    ----------
    metrics : dict
        Mapping of metric name to the list of image indices flagged for it.
        Metric names become the table columns, in sorted order.
    class_wise : dict
        Mapping of class label to a dict of per-metric counts. Each label
        becomes one row; metrics missing from a label's dict count as 0.

    Returns
    -------
    list[str]
        The header line followed by one formatted line per class.
    """
    groups = sorted(metrics)

    # ``default=0`` keeps the table renderable when nothing was flagged or no
    # classes exist; a bare ``max()`` raises ValueError on an empty sequence.
    max_class_length = max((len(str(label)) for label in class_wise), default=0) + 2
    # NOTE(review): this width comes from the number of flagged images per
    # metric, not from the printed width of the totals - confirm intended.
    max_total = max((len(images) for images in metrics.values()), default=0) + 2
    # Column widths depend only on the metric name, so compute them once.
    group_widths = {group: max(5, len(str(group))) + 2 for group in groups}

    header_cells = [f"{'Class':>{max_class_length}}"]
    header_cells += [f"{group:^{group_widths[group]}}" for group in groups]
    header_cells.append(f"{'Total':<{max_total}}")
    table = [" | ".join(header_cells)]

    for class_cat, results in class_wise.items():
        counts = [results.get(group, 0) for group in groups]
        cells = [f"{class_cat:>{max_class_length}}"]
        cells += [f"{count:^{group_widths[group]}}" for group, count in zip(groups, counts)]
        cells.append(f"{sum(counts):^{max_total}}")
        table.append(" | ".join(cells))

    return table
|
70
|
+
|
71
|
+
|
72
|
+
def _create_pandas_dataframe(class_wise):
    """
    Create data for pandas dataframe.

    Parameters
    ----------
    class_wise : dict
        Mapping of class label to a dict of per-metric counts.

    Returns
    -------
    list[dict]
        One row per class with the keys ``"Class"``, every metric name seen in
        any class (sorted), and ``"Total"`` last. Metrics a class has no count
        for are filled with 0, matching the text table, so every row has the
        same keys in the same order.
    """
    # Collect the full metric set first so each row carries every column.
    metric_names = sorted({metric for counts in class_wise.values() for metric in counts})

    data = []
    for label, metrics_dict in class_wise.items():
        row = {"Class": label}
        row.update((metric, metrics_dict.get(metric, 0)) for metric in metric_names)
        row["Total"] = sum(metrics_dict.values())
        data.append(row)
    return data
|
82
|
+
|
83
|
+
|
24
84
|
@dataclass(frozen=True)
|
25
85
|
class OutliersOutput(Generic[TIndexIssueMap], Output):
|
26
86
|
"""
|
27
|
-
Output class for :class:`Outliers` lint detector
|
87
|
+
Output class for :class:`Outliers` lint detector.
|
28
88
|
|
29
89
|
Attributes
|
30
90
|
----------
|
@@ -45,6 +105,39 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
|
|
45
105
|
else:
|
46
106
|
return sum(len(d) for d in self.issues)
|
47
107
|
|
108
|
+
def to_table(self, labelstats: LabelStatsOutput) -> str:
    """
    Format the outlier counts per class and metric as a text table.

    Parameters
    ----------
    labelstats : LabelStatsOutput
        Label statistics used to attribute each flagged image to its classes.

    Returns
    -------
    str
        A single table when ``issues`` is a dict. Otherwise one table per
        entry of ``issues``, separated by a blank line.
    """

    def _render(issues) -> str:
        # One newline-joined table for a single image-to-issues mapping.
        metrics, classwise = _reorganize_by_class_and_metric(issues, labelstats)
        return "\n".join(_create_table(metrics, classwise))

    if isinstance(self.issues, dict):
        return _render(self.issues)
    return "\n\n".join(_render(per_dataset) for per_dataset in self.issues)
|
122
|
+
|
123
|
+
def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
    """
    Build a pandas DataFrame of outlier counts per class and metric.

    Parameters
    ----------
    labelstats : LabelStatsOutput
        Label statistics used to attribute each flagged image to its classes.

    Returns
    -------
    pd.DataFrame
        One frame when ``issues`` is a dict. Otherwise the per-entry frames
        concatenated, each tagged with its position in a ``"Dataset"`` column.

    Notes
    -----
    pandas is imported inside the method, so it is only needed when this
    method is called.
    """
    import pandas as pd

    def _frame(issues):
        # Only the class-wise counts feed the frame; the metric grouping is unused.
        _, classwise = _reorganize_by_class_and_metric(issues, labelstats)
        return pd.DataFrame(_create_pandas_dataframe(classwise))

    if isinstance(self.issues, dict):
        return _frame(self.issues)

    frames = []
    for dataset_index, per_dataset in enumerate(self.issues):
        frame = _frame(per_dataset)
        frame["Dataset"] = dataset_index
        frames.append(frame)
    return pd.concat(frames)
|
140
|
+
|
48
141
|
|
49
142
|
def _get_outlier_mask(
|
50
143
|
values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
|
@@ -71,7 +164,7 @@ def _get_outlier_mask(
|
|
71
164
|
|
72
165
|
class Outliers:
|
73
166
|
r"""
|
74
|
-
Calculates statistical
|
167
|
+
Calculates statistical outliers of a dataset using various statistical tests applied to each image.
|
75
168
|
|
76
169
|
Parameters
|
77
170
|
----------
|
@@ -164,7 +257,7 @@ class Outliers:
|
|
164
257
|
self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
|
165
258
|
) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
|
166
259
|
"""
|
167
|
-
Returns indices of Outliers with the issues identified for each
|
260
|
+
Returns indices of Outliers with the issues identified for each.
|
168
261
|
|
169
262
|
Parameters
|
170
263
|
----------
|
@@ -1,5 +1,5 @@
|
|
1
1
|
"""
|
2
|
-
Out-of-distribution (OOD)
|
2
|
+
Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
|
3
3
|
"""
|
4
4
|
|
5
5
|
__all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
|