dataeval 0.75.0__tar.gz → 0.76.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {dataeval-0.75.0 → dataeval-0.76.1}/LICENSE.txt +2 -2
  2. {dataeval-0.75.0 → dataeval-0.76.1}/PKG-INFO +57 -30
  3. dataeval-0.76.1/README.md +123 -0
  4. {dataeval-0.75.0 → dataeval-0.76.1}/pyproject.toml +10 -7
  5. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/__init__.py +3 -3
  6. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/base.py +2 -2
  7. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/ks.py +2 -1
  8. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/mmd.py +3 -2
  9. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/uncertainty.py +2 -2
  10. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/updates.py +1 -1
  11. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/clusterer.py +3 -2
  12. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/duplicates.py +4 -4
  13. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/outliers.py +96 -3
  14. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/__init__.py +1 -1
  15. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/base.py +1 -17
  16. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/output.py +1 -1
  17. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/interop.py +1 -1
  18. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/__init__.py +1 -1
  19. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/__init__.py +1 -1
  20. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/balance.py +3 -3
  21. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/coverage.py +1 -1
  22. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/diversity.py +14 -10
  23. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/bias/parity.py +7 -9
  24. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/ber.py +4 -3
  25. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/divergence.py +3 -3
  26. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/uap.py +3 -3
  27. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/__init__.py +1 -1
  28. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/base.py +24 -8
  29. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/boxratiostats.py +5 -5
  30. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/datasetstats.py +39 -6
  31. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/dimensionstats.py +4 -4
  32. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/hashstats.py +2 -2
  33. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/labelstats.py +89 -6
  34. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/pixelstats.py +7 -5
  35. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/stats/visualstats.py +6 -4
  36. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/output.py +23 -14
  37. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/__init__.py +2 -2
  38. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/read.py +1 -1
  39. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/split.py +1 -1
  40. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/metadata.py +255 -110
  41. dataeval-0.76.1/src/dataeval/utils/plot.py +249 -0
  42. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/workflows/sufficiency.py +2 -2
  43. dataeval-0.75.0/README.md +0 -97
  44. dataeval-0.75.0/src/dataeval/utils/plot.py +0 -126
  45. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/__init__.py +0 -0
  46. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/__init__.py +0 -0
  47. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/cvm.py +0 -0
  48. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/drift/torch.py +0 -0
  49. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/__init__.py +0 -0
  50. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/linters/merged_stats.py +0 -0
  51. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/ae.py +0 -0
  52. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -0
  53. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
  54. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
  55. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/detectors/ood/mixin.py +0 -0
  56. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/log.py +0 -0
  57. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/metrics/estimators/__init__.py +0 -0
  58. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/py.typed +0 -0
  59. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/__init__.py +0 -0
  60. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/dataset/datasets.py +0 -0
  61. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/image.py +0 -0
  62. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/shared.py +0 -0
  63. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/__init__.py +0 -0
  64. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/blocks.py +0 -0
  65. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/gmm.py +0 -0
  66. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/internal.py +0 -0
  67. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/models.py +0 -0
  68. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/utils/torch/trainer.py +0 -0
  69. {dataeval-0.75.0 → dataeval-0.76.1}/src/dataeval/workflows/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2024 ARiA
3
+ Copyright (c) 2025 ARiA
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
18
  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
19
  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
20
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: dataeval
3
- Version: 0.75.0
3
+ Version: 0.76.1
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Home-page: https://dataeval.ai/
6
6
  License: MIT
@@ -21,8 +21,9 @@ Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Programming Language :: Python :: 3 :: Only
22
22
  Classifier: Topic :: Scientific/Engineering
23
23
  Provides-Extra: all
24
- Requires-Dist: matplotlib ; extra == "all"
25
- Requires-Dist: numpy (>=1.24.3)
24
+ Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
25
+ Requires-Dist: numpy (>=1.24.2)
26
+ Requires-Dist: pandas (>=2.0) ; extra == "all"
26
27
  Requires-Dist: pillow (>=10.3.0)
27
28
  Requires-Dist: requests
28
29
  Requires-Dist: scikit-learn (>=1.5.0)
@@ -38,13 +39,17 @@ Description-Content-Type: text/markdown
38
39
 
39
40
  # DataEval
40
41
 
41
- To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
42
+ To view our extensive collection of tutorials, how-to's, explanation guides,
43
+ and reference material, please visit our documentation on
44
+ **[Read the Docs](https://dataeval.readthedocs.io/)**
42
45
 
43
46
  ## About DataEval
44
47
 
45
48
  <!-- start tagline -->
46
49
 
47
- DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
50
+ DataEval curates datasets to train and test performant, robust, unbiased and
51
+ reliable AI models and monitors for data shifts that impact performance of
52
+ deployed models.
48
53
 
49
54
  <!-- end tagline -->
50
55
 
@@ -52,65 +57,86 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
52
57
 
53
58
  <!-- start needs -->
54
59
 
55
- DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports **model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
60
+ DataEval is an effective, powerful, and reliable set of tools for any T&E
61
+ engineer. Throughout all stages of the machine learning lifecycle, DataEval
62
+ supports model development, data analysis, and monitoring with state-of-the-art
63
+ algorithms to help you solve difficult problems. With a focus on computer
64
+ vision tasks, DataEval provides simple, but effective metrics for performance
65
+ estimation, bias detection, and dataset linting.
56
66
 
57
67
  <!-- end needs -->
58
68
 
59
69
  <!-- start JATIC interop -->
60
- DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
61
- DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
70
+ DataEval is easy to install, supports a wide range of Python versions, and is
71
+ compatible with many of the most popular packages in the scientific and T&E
72
+ communities.
73
+
74
+ DataEval also has native interoperability between JATIC's suite of tools when
75
+ using MAITE-compliant datasets and models.
62
76
  <!-- end JATIC interop -->
63
77
 
64
78
  ## Getting Started
65
79
 
66
80
  **Python versions:** 3.9 - 3.12
67
81
 
68
- **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
82
+ **Supported packages**: *NumPy*, *Pandas*, *scikit-learn*, *MAITE*, *NRTK*,
83
+ *Gradient*
69
84
 
70
- Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
85
+ Choose your preferred method of installation below or follow our
86
+ [installation guide](https://dataeval.readthedocs.io/en/v0.76.1/installation.html).
71
87
 
72
88
  * [Installing with pip](#installing-with-pip)
73
89
  * [Installing with conda/mamba](#installing-with-conda)
74
90
  * [Installing from GitHub](#installing-from-github)
75
91
 
76
92
  ### **Installing with pip**
77
- You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
78
93
 
79
- ```
94
+ You can install DataEval directly from pypi.org using the following command.
95
+ The optional dependencies of DataEval are `all`.
96
+
97
+ ```bash
80
98
  pip install dataeval[all]
81
99
  ```
82
100
 
83
101
  ### **Installing with conda**
84
102
 
85
- DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
86
- are installed from the `pytorch` channel, the channel is specified in the below example.
103
+ DataEval can be installed in a Conda/Mamba environment using the provided
104
+ `environment.yaml` file. As some dependencies are installed from the `pytorch`
105
+ channel, the channel is specified in the below example.
87
106
 
88
- ```
107
+ ```bash
89
108
  micromamba create -f environment\environment.yaml -c pytorch
90
109
  ```
91
110
 
92
111
  ### **Installing from GitHub**
93
112
 
94
- To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
113
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
114
+ download larger, binary source files and `poetry` for project dependency
115
+ management.
95
116
 
96
- ```
117
+ ```bash
97
118
  sudo apt-get install git-lfs
98
119
  pip install poetry
99
120
  ```
100
121
 
101
122
  Pull the source down and change to the DataEval project directory.
102
- ```
123
+
124
+ ```bash
103
125
  git clone https://github.com/aria-ml/dataeval.git
104
126
  cd dataeval
105
127
  ```
106
128
 
107
129
  Install DataEval with optional dependencies for development.
108
- ```
130
+
131
+ ```bash
109
132
  poetry install --all-extras --with dev
110
133
  ```
111
134
 
112
- Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
113
- ```
135
+ Now that DataEval is installed, you can run commands in the poetry virtual
136
+ environment by prefixing shell commands with `poetry run`, or activate the
137
+ virtual environment directly in the shell.
138
+
139
+ ```bash
114
140
  poetry shell
115
141
  ```
116
142
 
@@ -118,19 +144,20 @@ poetry shell
118
144
 
119
145
  If you have any questions, feel free to reach out to the people below:
120
146
 
121
- - **POC**: Scott Swan @scott.swan
122
- - **DPOC**: Andrew Weng @aweng
147
+ * **POC**: Scott Swan @scott.swan
148
+ * **DPOC**: Andrew Weng @aweng
123
149
 
124
150
  ## Acknowledgement
125
151
 
126
- <!-- start attribution -->
127
-
128
- ### Alibi-Detect
129
- This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
130
- Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
152
+ <!-- start acknowledgement -->
131
153
 
132
154
  ### CDAO Funding Acknowledgement
133
- This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
134
155
 
135
- <!-- end attribution -->
156
+ This material is based upon work supported by the Chief Digital and Artificial
157
+ Intelligence Office under Contract No. W519TC-23-9-2033. The views and
158
+ conclusions contained herein are those of the author(s) and should not be
159
+ interpreted as necessarily representing the official policies or endorsements,
160
+ either expressed or implied, of the U.S. Government.
161
+
162
+ <!-- end acknowledgement -->
136
163
 
@@ -0,0 +1,123 @@
1
+ # DataEval
2
+
3
+ To view our extensive collection of tutorials, how-to's, explanation guides,
4
+ and reference material, please visit our documentation on
5
+ **[Read the Docs](https://dataeval.readthedocs.io/)**
6
+
7
+ ## About DataEval
8
+
9
+ <!-- start tagline -->
10
+
11
+ DataEval curates datasets to train and test performant, robust, unbiased and
12
+ reliable AI models and monitors for data shifts that impact performance of
13
+ deployed models.
14
+
15
+ <!-- end tagline -->
16
+
17
+ ### Our mission
18
+
19
+ <!-- start needs -->
20
+
21
+ DataEval is an effective, powerful, and reliable set of tools for any T&E
22
+ engineer. Throughout all stages of the machine learning lifecycle, DataEval
23
+ supports model development, data analysis, and monitoring with state-of-the-art
24
+ algorithms to help you solve difficult problems. With a focus on computer
25
+ vision tasks, DataEval provides simple, but effective metrics for performance
26
+ estimation, bias detection, and dataset linting.
27
+
28
+ <!-- end needs -->
29
+
30
+ <!-- start JATIC interop -->
31
+ DataEval is easy to install, supports a wide range of Python versions, and is
32
+ compatible with many of the most popular packages in the scientific and T&E
33
+ communities.
34
+
35
+ DataEval also has native interopability between JATIC's suite of tools when
36
+ using MAITE-compliant datasets and models.
37
+ <!-- end JATIC interop -->
38
+
39
+ ## Getting Started
40
+
41
+ **Python versions:** 3.9 - 3.12
42
+
43
+ **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
44
+ *Gradient*
45
+
46
+ Choose your preferred method of installation below or follow our
47
+ [installation guide](https://dataeval.readthedocs.io/en/v0.76.1/installation.html).
48
+
49
+ * [Installing with pip](#installing-with-pip)
50
+ * [Installing with conda/mamba](#installing-with-conda)
51
+ * [Installing from GitHub](#installing-from-github)
52
+
53
+ ### **Installing with pip**
54
+
55
+ You can install DataEval directly from pypi.org using the following command.
56
+ The optional dependencies of DataEval are `all`.
57
+
58
+ ```bash
59
+ pip install dataeval[all]
60
+ ```
61
+
62
+ ### **Installing with conda**
63
+
64
+ DataEval can be installed in a Conda/Mamba environment using the provided
65
+ `environment.yaml` file. As some dependencies are installed from the `pytorch`
66
+ channel, the channel is specified in the below example.
67
+
68
+ ```bash
69
+ micromamba create -f environment/environment.yaml -c pytorch
70
+ ```
71
+
72
+ ### **Installing from GitHub**
73
+
74
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
75
+ download larger, binary source files and `poetry` for project dependency
76
+ management.
77
+
78
+ ```bash
79
+ sudo apt-get install git-lfs
80
+ pip install poetry
81
+ ```
82
+
83
+ Pull the source down and change to the DataEval project directory.
84
+
85
+ ```bash
86
+ git clone https://github.com/aria-ml/dataeval.git
87
+ cd dataeval
88
+ ```
89
+
90
+ Install DataEval with optional dependencies for development.
91
+
92
+ ```bash
93
+ poetry install --all-extras --with dev
94
+ ```
95
+
96
+ Now that DataEval is installed, you can run commands in the poetry virtual
97
+ environment by prefixing shell commands with `poetry run`, or activate the
98
+ virtual environment directly in the shell.
99
+
100
+ ```bash
101
+ poetry shell
102
+ ```
103
+
104
+ ## Contact Us
105
+
106
+ If you have any questions, feel free to reach out to the people below:
107
+
108
+ * **POC**: Scott Swan @scott.swan
109
+ * **DPOC**: Andrew Weng @aweng
110
+
111
+ ## Acknowledgement
112
+
113
+ <!-- start acknowledgement -->
114
+
115
+ ### CDAO Funding Acknowledgement
116
+
117
+ This material is based upon work supported by the Chief Digital and Artificial
118
+ Intelligence Office under Contract No. W519TC-23-9-2033. The views and
119
+ conclusions contained herein are those of the author(s) and should not be
120
+ interpreted as necessarily representing the official policies or endorsements,
121
+ either expressed or implied, of the U.S. Government.
122
+
123
+ <!-- end acknowledgement -->
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dataeval"
3
- version = "0.75.0" # dynamic
3
+ version = "0.76.1" # dynamic
4
4
  description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
5
5
  license = "MIT"
6
6
  readme = "README.md"
@@ -42,7 +42,7 @@ packages = [
42
42
  [tool.poetry.dependencies]
43
43
  # required
44
44
  python = ">=3.9,<3.13"
45
- numpy = {version = ">=1.24.3"}
45
+ numpy = {version = ">=1.24.2"}
46
46
  pillow = {version = ">=10.3.0"}
47
47
  requests = {version = "*"}
48
48
  scipy = {version = ">=1.10"}
@@ -54,10 +54,11 @@ typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
54
54
  xxhash = {version = ">=3.3"}
55
55
 
56
56
  # optional
57
- matplotlib = {version = "*", optional = true}
57
+ matplotlib = {version = ">=3.7.1", optional = true}
58
+ pandas = {version = ">=2.0", optional = true}
58
59
 
59
60
  [tool.poetry.extras]
60
- all = ["matplotlib"]
61
+ all = ["matplotlib", "pandas"]
61
62
 
62
63
  [tool.poetry.group.dev]
63
64
  optional = true
@@ -81,19 +82,20 @@ coverage = {version = "*", extras = ["toml"]}
81
82
  pyright = {version = "*", extras = ["nodejs"]}
82
83
  # prototype
83
84
  maite = {version = "*"}
84
- pandas = {version = "*"}
85
85
  seaborn = {version = "*"}
86
86
  # docs
87
87
  certifi = {version = ">=2024.07.04"}
88
88
  enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
89
89
  ipykernel = {version = ">=6.26.0"}
90
90
  ipywidgets = {version = ">=8.1.1"}
91
+ jinja2 = {version = ">=3.1.5"}
91
92
  jupyter-client = {version = ">=8.6.0"}
92
93
  jupyter-cache = {version = "*"}
93
94
  myst-nb = {version = ">=1.0.0"}
94
- pydata-sphinx-theme = {version = ">=0.15.4"}
95
95
  sphinx-autoapi = {version = "*"}
96
96
  sphinx-design = {version = "*"}
97
+ sphinx-immaterial = {version = "*"}
98
+ sphinx-new-tab-link = {version = "*"}
97
99
  sphinx-tabs = {version = "*"}
98
100
  Sphinx = {version = ">=7.2.6"}
99
101
  torchmetrics = {version = ">=1.0.0", source = "pytorch"}
@@ -137,6 +139,7 @@ parallel = true
137
139
  [tool.coverage.report]
138
140
  exclude_also = [
139
141
  "raise NotImplementedError",
142
+ ": \\.\\.\\."
140
143
  ]
141
144
  include = ["*/src/dataeval/*"]
142
145
  omit = [
@@ -184,7 +187,7 @@ docstring-code-format = true
184
187
  docstring-code-line-length = "dynamic"
185
188
 
186
189
  [tool.codespell]
187
- skip = './*env*,./prototype,./output,./docs/build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
190
+ skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
188
191
  ignore-words-list = ["Hart"]
189
192
 
190
193
  [build-system]
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
8
8
  from __future__ import annotations
9
9
 
10
10
  __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
11
- __version__ = "0.75.0"
11
+ __version__ = "0.76.1"
12
12
 
13
13
  import logging
14
14
 
@@ -24,10 +24,10 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
24
24
  Parameters
25
25
  ----------
26
26
  level : int, default logging.DEBUG(10)
27
- Set the logging level for the logger
27
+ Set the logging level for the logger.
28
28
  handler : logging.Handler, optional
29
29
  Sets the logging handler for the logger if provided, otherwise logger will be
30
- provided with a StreamHandler
30
+ provided with a StreamHandler.
31
31
  """
32
32
  import logging
33
33
 
@@ -45,7 +45,7 @@ class UpdateStrategy(ABC):
45
45
  @dataclass(frozen=True)
46
46
  class DriftBaseOutput(Output):
47
47
  """
48
- Base output class for Drift detector classes
48
+ Base output class for Drift Detector classes
49
49
 
50
50
  Attributes
51
51
  ----------
@@ -64,7 +64,7 @@ class DriftBaseOutput(Output):
64
64
  @dataclass(frozen=True)
65
65
  class DriftOutput(DriftBaseOutput):
66
66
  """
67
- Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors
67
+ Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
68
68
 
69
69
  Attributes
70
70
  ----------
@@ -22,7 +22,8 @@ from dataeval.interop import to_numpy
22
22
 
23
23
  class DriftKS(BaseDriftUnivariate):
24
24
  """
25
- :term:`Drift` detector employing the Kolmogorov-Smirnov (KS) distribution test.
25
+ :term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
26
+ distribution<Kolmogorov-Smirnov (K-S) test>` test.
26
27
 
27
28
  The KS test detects changes in the maximum distance between two data
28
29
  distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction
@@ -26,7 +26,7 @@ from dataeval.utils.torch.internal import get_device
26
26
  @dataclass(frozen=True)
27
27
  class DriftMMDOutput(DriftBaseOutput):
28
28
  """
29
- Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
29
+ Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.
30
30
 
31
31
  Attributes
32
32
  ----------
@@ -51,7 +51,8 @@ class DriftMMDOutput(DriftBaseOutput):
51
51
 
52
52
  class DriftMMD(BaseDrift):
53
53
  """
54
- :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm using a permutation test.
54
+ :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
55
+ using a permutation test.
55
56
 
56
57
  Parameters
57
58
  ----------
@@ -66,8 +66,8 @@ def classifier_uncertainty(
66
66
 
67
67
  class DriftUncertainty:
68
68
  """
69
- Test for a change in the number of instances falling into regions on which the
70
- model is uncertain.
69
+ Test for a change in the number of instances falling into regions on which \
70
+ the model is uncertain.
71
71
 
72
72
  Performs a K-S test on prediction entropies.
73
73
 
@@ -1,5 +1,5 @@
1
1
  """
2
- Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
2
+ Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
3
3
  for drift.
4
4
  """
5
5
 
@@ -18,7 +18,7 @@ from dataeval.utils.shared import flatten
18
18
  @dataclass(frozen=True)
19
19
  class ClustererOutput(Output):
20
20
  """
21
- Output class for :class:`Clusterer` lint detector
21
+ Output class for :class:`Clusterer` lint detector.
22
22
 
23
23
  Attributes
24
24
  ----------
@@ -131,7 +131,8 @@ class _ClusterMergeEntry:
131
131
 
132
132
  class Clusterer:
133
133
  """
134
- Uses hierarchical clustering to flag dataset properties of interest like Outliers and :term:`duplicates<Duplicates>`
134
+ Uses hierarchical clustering to flag dataset properties of interest like outliers \
135
+ and :term:`duplicates<Duplicates>`.
135
136
 
136
137
  Parameters
137
138
  ----------
@@ -19,7 +19,7 @@ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateG
19
19
  @dataclass(frozen=True)
20
20
  class DuplicatesOutput(Generic[TIndexCollection], Output):
21
21
  """
22
- Output class for :class:`Duplicates` lint detector
22
+ Output class for :class:`Duplicates` lint detector.
23
23
 
24
24
  Attributes
25
25
  ----------
@@ -39,8 +39,8 @@ class DuplicatesOutput(Generic[TIndexCollection], Output):
39
39
 
40
40
  class Duplicates:
41
41
  """
42
- Finds the duplicate images in a dataset using xxhash for exact :term:`duplicates<Duplicates>`
43
- and pchash for near duplicates
42
+ Finds the duplicate images in a dataset using xxhash for exact \
43
+ :term:`duplicates<Duplicates>` and pchash for near duplicates.
44
44
 
45
45
  Attributes
46
46
  ----------
@@ -92,7 +92,7 @@ class Duplicates:
92
92
 
93
93
  Parameters
94
94
  ----------
95
- data : HashStatsOutput | Sequence[HashStatsOutput]
95
+ hashes : HashStatsOutput | Sequence[HashStatsOutput]
96
96
  The output(s) from a hashstats analysis
97
97
 
98
98
  Returns
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
+ import contextlib
5
6
  from dataclasses import dataclass
6
7
  from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
7
8
 
@@ -12,19 +13,78 @@ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_s
12
13
  from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
13
14
  from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
14
15
  from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
16
+ from dataeval.metrics.stats.labelstats import LabelStatsOutput
15
17
  from dataeval.metrics.stats.pixelstats import PixelStatsOutput
16
18
  from dataeval.metrics.stats.visualstats import VisualStatsOutput
17
19
  from dataeval.output import Output, set_metadata
18
20
 
21
+ with contextlib.suppress(ImportError):
22
+ import pandas as pd
23
+
24
+
19
25
  IndexIssueMap = dict[int, dict[str, float]]
20
26
  OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
21
27
  TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
22
28
 
23
29
 
30
+ def _reorganize_by_class_and_metric(result, lstats):
31
+ """Flip result from grouping by image to grouping by class and metric"""
32
+ metrics = {}
33
+ class_wise = {label: {} for label in lstats.image_indices_per_label}
34
+
35
+ # Group metrics and calculate class-wise counts
36
+ for img, group in result.items():
37
+ for extreme in group:
38
+ metrics.setdefault(extreme, []).append(img)
39
+ for label, images in lstats.image_indices_per_label.items():
40
+ if img in images:
41
+ class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
42
+
43
+ return metrics, class_wise
44
+
45
+
46
+ def _create_table(metrics, class_wise):
47
+ """Create table for displaying the results"""
48
+ max_class_length = max(len(str(label)) for label in class_wise) + 2
49
+ max_total = max(len(metrics[group]) for group in metrics) + 2
50
+
51
+ table_header = " | ".join(
52
+ [f"{'Class':>{max_class_length}}"]
53
+ + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
54
+ + [f"{'Total':<{max_total}}"]
55
+ )
56
+ table_rows = []
57
+
58
+ for class_cat, results in class_wise.items():
59
+ table_value = [f"{class_cat:>{max_class_length}}"]
60
+ total = 0
61
+ for group in sorted(metrics.keys()):
62
+ count = results.get(group, 0)
63
+ table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
64
+ total += count
65
+ table_value.append(f"{total:^{max_total}}")
66
+ table_rows.append(" | ".join(table_value))
67
+
68
+ table = [table_header] + table_rows
69
+ return table
70
+
71
+
72
+ def _create_pandas_dataframe(class_wise):
73
+ """Create data for pandas dataframe"""
74
+ data = []
75
+ for label, metrics_dict in class_wise.items():
76
+ row = {"Class": label}
77
+ total = sum(metrics_dict.values())
78
+ row.update(metrics_dict) # Add metric counts
79
+ row["Total"] = total
80
+ data.append(row)
81
+ return data
82
+
83
+
24
84
  @dataclass(frozen=True)
25
85
  class OutliersOutput(Generic[TIndexIssueMap], Output):
26
86
  """
27
- Output class for :class:`Outliers` lint detector
87
+ Output class for :class:`Outliers` lint detector.
28
88
 
29
89
  Attributes
30
90
  ----------
@@ -45,6 +105,39 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
45
105
  else:
46
106
  return sum(len(d) for d in self.issues)
47
107
 
108
+ def to_table(self, labelstats: LabelStatsOutput) -> str:
109
+ if isinstance(self.issues, dict):
110
+ metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
111
+ listed_table = _create_table(metrics, classwise)
112
+ table = "\n".join(listed_table)
113
+ else:
114
+ outertable = []
115
+ for d in self.issues:
116
+ metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
117
+ listed_table = _create_table(metrics, classwise)
118
+ str_table = "\n".join(listed_table)
119
+ outertable.append(str_table)
120
+ table = "\n\n".join(outertable)
121
+ return table
122
+
123
+ def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
124
+ import pandas as pd
125
+
126
+ if isinstance(self.issues, dict):
127
+ _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
128
+ data = _create_pandas_dataframe(classwise)
129
+ df = pd.DataFrame(data)
130
+ else:
131
+ df_list = []
132
+ for i, d in enumerate(self.issues):
133
+ _, classwise = _reorganize_by_class_and_metric(d, labelstats)
134
+ data = _create_pandas_dataframe(classwise)
135
+ single_df = pd.DataFrame(data)
136
+ single_df["Dataset"] = i
137
+ df_list.append(single_df)
138
+ df = pd.concat(df_list)
139
+ return df
140
+
48
141
 
49
142
  def _get_outlier_mask(
50
143
  values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
@@ -71,7 +164,7 @@ def _get_outlier_mask(
71
164
 
72
165
  class Outliers:
73
166
  r"""
74
- Calculates statistical Outliers of a dataset using various statistical tests applied to each image
167
+ Calculates statistical outliers of a dataset using various statistical tests applied to each image.
75
168
 
76
169
  Parameters
77
170
  ----------
@@ -164,7 +257,7 @@ class Outliers:
164
257
  self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
165
258
  ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
166
259
  """
167
- Returns indices of Outliers with the issues identified for each
260
+ Returns indices of Outliers with the issues identified for each.
168
261
 
169
262
  Parameters
170
263
  ----------
@@ -1,5 +1,5 @@
1
1
  """
2
- Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
2
+ Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
3
3
  """
4
4
 
5
5
  __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]