dataeval 0.76.0__tar.gz → 0.81.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. {dataeval-0.76.0 → dataeval-0.81.0}/PKG-INFO +44 -15
  2. {dataeval-0.76.0 → dataeval-0.81.0}/README.md +38 -13
  3. {dataeval-0.76.0 → dataeval-0.81.0}/pyproject.toml +22 -9
  4. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/__init__.py +3 -3
  5. dataeval-0.76.0/src/dataeval/output.py → dataeval-0.81.0/src/dataeval/_output.py +14 -0
  6. dataeval-0.81.0/src/dataeval/config.py +77 -0
  7. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/__init__.py +1 -1
  8. dataeval-0.81.0/src/dataeval/detectors/drift/__init__.py +22 -0
  9. dataeval-0.76.0/src/dataeval/detectors/drift/base.py → dataeval-0.81.0/src/dataeval/detectors/drift/_base.py +41 -30
  10. dataeval-0.76.0/src/dataeval/detectors/drift/cvm.py → dataeval-0.81.0/src/dataeval/detectors/drift/_cvm.py +21 -28
  11. dataeval-0.76.0/src/dataeval/detectors/drift/ks.py → dataeval-0.81.0/src/dataeval/detectors/drift/_ks.py +20 -26
  12. dataeval-0.76.0/src/dataeval/detectors/drift/mmd.py → dataeval-0.81.0/src/dataeval/detectors/drift/_mmd.py +33 -19
  13. dataeval-0.76.0/src/dataeval/detectors/drift/torch.py → dataeval-0.81.0/src/dataeval/detectors/drift/_torch.py +2 -1
  14. dataeval-0.76.0/src/dataeval/detectors/drift/uncertainty.py → dataeval-0.81.0/src/dataeval/detectors/drift/_uncertainty.py +23 -7
  15. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/drift/updates.py +1 -1
  16. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/__init__.py +0 -3
  17. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/duplicates.py +17 -8
  18. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/linters/outliers.py +52 -43
  19. dataeval-0.81.0/src/dataeval/detectors/ood/ae.py +93 -0
  20. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/base.py +5 -4
  21. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/metadata_ks_compare.py +1 -1
  22. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/mixin.py +20 -5
  23. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/output.py +1 -1
  24. dataeval-0.76.0/src/dataeval/detectors/ood/ae.py → dataeval-0.81.0/src/dataeval/detectors/ood/vae.py +13 -12
  25. dataeval-0.81.0/src/dataeval/metadata/__init__.py +5 -0
  26. dataeval-0.81.0/src/dataeval/metadata/_ood.py +238 -0
  27. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/metrics/__init__.py +1 -1
  28. dataeval-0.81.0/src/dataeval/metrics/bias/__init__.py +22 -0
  29. dataeval-0.76.0/src/dataeval/metrics/bias/balance.py → dataeval-0.81.0/src/dataeval/metrics/bias/_balance.py +67 -17
  30. dataeval-0.76.0/src/dataeval/metrics/bias/coverage.py → dataeval-0.81.0/src/dataeval/metrics/bias/_coverage.py +41 -35
  31. dataeval-0.76.0/src/dataeval/metrics/bias/diversity.py → dataeval-0.81.0/src/dataeval/metrics/bias/_diversity.py +17 -12
  32. dataeval-0.76.0/src/dataeval/metrics/bias/parity.py → dataeval-0.81.0/src/dataeval/metrics/bias/_parity.py +89 -63
  33. dataeval-0.81.0/src/dataeval/metrics/estimators/__init__.py +19 -0
  34. dataeval-0.76.0/src/dataeval/metrics/estimators/ber.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_ber.py +42 -11
  35. dataeval-0.81.0/src/dataeval/metrics/estimators/_clusterer.py +104 -0
  36. dataeval-0.76.0/src/dataeval/metrics/estimators/divergence.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_divergence.py +18 -13
  37. dataeval-0.76.0/src/dataeval/metrics/estimators/uap.py → dataeval-0.81.0/src/dataeval/metrics/estimators/_uap.py +4 -4
  38. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval-0.76.0/src/dataeval/metrics/stats/base.py → dataeval-0.81.0/src/dataeval/metrics/stats/_base.py +52 -16
  40. dataeval-0.76.0/src/dataeval/metrics/stats/boxratiostats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_boxratiostats.py +6 -9
  41. dataeval-0.76.0/src/dataeval/metrics/stats/datasetstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_datasetstats.py +10 -14
  42. dataeval-0.76.0/src/dataeval/metrics/stats/dimensionstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_dimensionstats.py +6 -5
  43. dataeval-0.76.0/src/dataeval/metrics/stats/hashstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_hashstats.py +6 -6
  44. dataeval-0.76.0/src/dataeval/metrics/stats/labelstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_labelstats.py +25 -25
  45. dataeval-0.76.0/src/dataeval/metrics/stats/pixelstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_pixelstats.py +5 -4
  46. dataeval-0.76.0/src/dataeval/metrics/stats/visualstats.py → dataeval-0.81.0/src/dataeval/metrics/stats/_visualstats.py +9 -8
  47. dataeval-0.81.0/src/dataeval/typing.py +54 -0
  48. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/__init__.py +2 -2
  49. dataeval-0.81.0/src/dataeval/utils/_array.py +169 -0
  50. dataeval-0.81.0/src/dataeval/utils/_bin.py +199 -0
  51. dataeval-0.81.0/src/dataeval/utils/_clusterer.py +144 -0
  52. dataeval-0.81.0/src/dataeval/utils/_fast_mst.py +189 -0
  53. dataeval-0.76.0/src/dataeval/utils/image.py → dataeval-0.81.0/src/dataeval/utils/_image.py +6 -4
  54. dataeval-0.81.0/src/dataeval/utils/_method.py +18 -0
  55. dataeval-0.76.0/src/dataeval/utils/shared.py → dataeval-0.81.0/src/dataeval/utils/_mst.py +3 -65
  56. dataeval-0.76.0/src/dataeval/utils/plot.py → dataeval-0.81.0/src/dataeval/utils/_plot.py +4 -4
  57. dataeval-0.81.0/src/dataeval/utils/data/__init__.py +22 -0
  58. dataeval-0.81.0/src/dataeval/utils/data/_embeddings.py +105 -0
  59. dataeval-0.81.0/src/dataeval/utils/data/_images.py +65 -0
  60. dataeval-0.81.0/src/dataeval/utils/data/_metadata.py +352 -0
  61. dataeval-0.81.0/src/dataeval/utils/data/_selection.py +119 -0
  62. dataeval-0.76.0/src/dataeval/utils/dataset/split.py → dataeval-0.81.0/src/dataeval/utils/data/_split.py +13 -14
  63. dataeval-0.81.0/src/dataeval/utils/data/_targets.py +73 -0
  64. dataeval-0.81.0/src/dataeval/utils/data/_types.py +58 -0
  65. dataeval-0.81.0/src/dataeval/utils/data/collate.py +103 -0
  66. dataeval-0.81.0/src/dataeval/utils/data/datasets/__init__.py +17 -0
  67. dataeval-0.81.0/src/dataeval/utils/data/datasets/_base.py +254 -0
  68. dataeval-0.81.0/src/dataeval/utils/data/datasets/_cifar10.py +134 -0
  69. dataeval-0.81.0/src/dataeval/utils/data/datasets/_fileio.py +168 -0
  70. dataeval-0.81.0/src/dataeval/utils/data/datasets/_milco.py +153 -0
  71. dataeval-0.81.0/src/dataeval/utils/data/datasets/_mixin.py +56 -0
  72. dataeval-0.81.0/src/dataeval/utils/data/datasets/_mnist.py +183 -0
  73. dataeval-0.81.0/src/dataeval/utils/data/datasets/_ships.py +123 -0
  74. dataeval-0.81.0/src/dataeval/utils/data/datasets/_voc.py +352 -0
  75. dataeval-0.81.0/src/dataeval/utils/data/selections/__init__.py +15 -0
  76. dataeval-0.81.0/src/dataeval/utils/data/selections/_classfilter.py +60 -0
  77. dataeval-0.81.0/src/dataeval/utils/data/selections/_indices.py +26 -0
  78. dataeval-0.81.0/src/dataeval/utils/data/selections/_limit.py +26 -0
  79. dataeval-0.81.0/src/dataeval/utils/data/selections/_reverse.py +18 -0
  80. dataeval-0.81.0/src/dataeval/utils/data/selections/_shuffle.py +29 -0
  81. dataeval-0.81.0/src/dataeval/utils/metadata.py +403 -0
  82. dataeval-0.76.0/src/dataeval/utils/torch/gmm.py → dataeval-0.81.0/src/dataeval/utils/torch/_gmm.py +4 -2
  83. dataeval-0.76.0/src/dataeval/utils/torch/internal.py → dataeval-0.81.0/src/dataeval/utils/torch/_internal.py +21 -51
  84. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/models.py +43 -2
  85. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/workflows/sufficiency.py +10 -9
  86. dataeval-0.76.0/src/dataeval/detectors/drift/__init__.py +0 -22
  87. dataeval-0.76.0/src/dataeval/detectors/linters/clusterer.py +0 -512
  88. dataeval-0.76.0/src/dataeval/detectors/linters/merged_stats.py +0 -49
  89. dataeval-0.76.0/src/dataeval/detectors/ood/metadata_least_likely.py +0 -119
  90. dataeval-0.76.0/src/dataeval/interop.py +0 -69
  91. dataeval-0.76.0/src/dataeval/metrics/bias/__init__.py +0 -21
  92. dataeval-0.76.0/src/dataeval/metrics/estimators/__init__.py +0 -9
  93. dataeval-0.76.0/src/dataeval/utils/dataset/__init__.py +0 -7
  94. dataeval-0.76.0/src/dataeval/utils/dataset/datasets.py +0 -412
  95. dataeval-0.76.0/src/dataeval/utils/dataset/read.py +0 -63
  96. dataeval-0.76.0/src/dataeval/utils/metadata.py +0 -581
  97. {dataeval-0.76.0 → dataeval-0.81.0}/LICENSE.txt +0 -0
  98. /dataeval-0.76.0/src/dataeval/log.py → /dataeval-0.81.0/src/dataeval/_log.py +0 -0
  99. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/__init__.py +0 -0
  100. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
  101. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/py.typed +0 -0
  102. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/__init__.py +0 -0
  103. /dataeval-0.76.0/src/dataeval/utils/torch/blocks.py → /dataeval-0.81.0/src/dataeval/utils/torch/_blocks.py +0 -0
  104. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/utils/torch/trainer.py +0 -0
  105. {dataeval-0.76.0 → dataeval-0.81.0}/src/dataeval/workflows/__init__.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.76.0
+ Version: 0.81.0
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
@@ -21,8 +21,12 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3 :: Only
  Classifier: Topic :: Scientific/Engineering
  Provides-Extra: all
- Requires-Dist: matplotlib ; extra == "all"
+ Requires-Dist: defusedxml (>=0.7.1)
+ Requires-Dist: fast_hdbscan (==0.2.0)
+ Requires-Dist: matplotlib (>=3.7.1) ; extra == "all"
+ Requires-Dist: numba (>=0.59.1)
  Requires-Dist: numpy (>=1.24.2)
+ Requires-Dist: pandas (>=2.0) ; extra == "all"
  Requires-Dist: pillow (>=10.3.0)
  Requires-Dist: requests
  Requires-Dist: scikit-learn (>=1.5.0)
@@ -38,13 +42,17 @@ Description-Content-Type: text/markdown

  # DataEval

- To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+ To view our extensive collection of tutorials, how-to's, explanation guides,
+ and reference material, please visit our documentation on
+ **[Read the Docs](https://dataeval.readthedocs.io/)**

  ## About DataEval

  <!-- start tagline -->

- DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+ DataEval curates datasets to train and test performant, robust, unbiased and
+ reliable AI models and monitors for data shifts that impact performance of
+ deployed models.

  <!-- end tagline -->

@@ -52,22 +60,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel

  <!-- start needs -->

- DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+ DataEval is an effective, powerful, and reliable set of tools for any T&E
+ engineer. Throughout all stages of the machine learning lifecycle, DataEval
+ supports model development, data analysis, and monitoring with state-of-the-art
+ algorithms to help you solve difficult problems. With a focus on computer
+ vision tasks, DataEval provides simple, but effective metrics for performance
+ estimation, bias detection, and dataset linting.

  <!-- end needs -->

  <!-- start JATIC interop -->
- DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
- DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+ DataEval is easy to install, supports a wide range of Python versions, and is
+ compatible with many of the most popular packages in the scientific and T&E
+ communities.
+
+ DataEval also has native interopability between JATIC's suite of tools when
+ using MAITE-compliant datasets and models.
  <!-- end JATIC interop -->

  ## Getting Started

  **Python versions:** 3.9 - 3.12

- **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+ **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
+ *Gradient*

- Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+ Choose your preferred method of installation below or follow our
+ [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).

  * [Installing with pip](#installing-with-pip)
  * [Installing with conda/mamba](#installing-with-conda)
@@ -75,7 +94,8 @@ Choose your preferred method of installation below or follow our [installation g

  ### **Installing with pip**

- You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
+ You can install DataEval directly from pypi.org using the following command.
+ The optional dependencies of DataEval are `all`.

  ```bash
  pip install dataeval[all]
@@ -83,8 +103,9 @@ pip install dataeval[all]

  ### **Installing with conda**

- DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
- are installed from the `pytorch` channel, the channel is specified in the below example.
+ DataEval can be installed in a Conda/Mamba environment using the provided
+ `environment.yaml` file. As some dependencies are installed from the `pytorch`
+ channel, the channel is specified in the below example.

  ```bash
  micromamba create -f environment\environment.yaml -c pytorch
@@ -92,7 +113,9 @@ micromamba create -f environment\environment.yaml -c pytorch

  ### **Installing from GitHub**

- To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
+ download larger, binary source files and `poetry` for project dependency
+ management.

  ```bash
  sudo apt-get install git-lfs
@@ -112,7 +135,9 @@ Install DataEval with optional dependencies for development.
  poetry install --all-extras --with dev
  ```

- Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+ Now that DataEval is installed, you can run commands in the poetry virtual
+ environment by prefixing shell commands with `poetry run`, or activate the
+ virtual environment directly in the shell.

  ```bash
  poetry shell
@@ -131,7 +156,11 @@ If you have any questions, feel free to reach out to the people below:

  ### CDAO Funding Acknowledgement

- This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+ This material is based upon work supported by the Chief Digital and Artificial
+ Intelligence Office under Contract No. W519TC-23-9-2033. The views and
+ conclusions contained herein are those of the author(s) and should not be
+ interpreted as necessarily representing the official policies or endorsements,
+ either expressed or implied, of the U.S. Government.

  <!-- end acknowledgement -->

@@ -1,12 +1,16 @@
  # DataEval

- To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
+ To view our extensive collection of tutorials, how-to's, explanation guides,
+ and reference material, please visit our documentation on
+ **[Read the Docs](https://dataeval.readthedocs.io/)**

  ## About DataEval

  <!-- start tagline -->

- DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
+ DataEval curates datasets to train and test performant, robust, unbiased and
+ reliable AI models and monitors for data shifts that impact performance of
+ deployed models.

  <!-- end tagline -->

@@ -14,22 +18,33 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel

  <!-- start needs -->

- DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+ DataEval is an effective, powerful, and reliable set of tools for any T&E
+ engineer. Throughout all stages of the machine learning lifecycle, DataEval
+ supports model development, data analysis, and monitoring with state-of-the-art
+ algorithms to help you solve difficult problems. With a focus on computer
+ vision tasks, DataEval provides simple, but effective metrics for performance
+ estimation, bias detection, and dataset linting.

  <!-- end needs -->

  <!-- start JATIC interop -->
- DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
- DataEval also has native interopability between JATIC's suite of tools when using MAITE-compliant datasets and models.
+ DataEval is easy to install, supports a wide range of Python versions, and is
+ compatible with many of the most popular packages in the scientific and T&E
+ communities.
+
+ DataEval also has native interopability between JATIC's suite of tools when
+ using MAITE-compliant datasets and models.
  <!-- end JATIC interop -->

  ## Getting Started

  **Python versions:** 3.9 - 3.12

- **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*, *Gradient*
+ **Supported packages**: *NumPy*, *Pandas*, *Sci-kit learn*, *MAITE*, *NRTK*,
+ *Gradient*

- Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
+ Choose your preferred method of installation below or follow our
+ [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).

  * [Installing with pip](#installing-with-pip)
  * [Installing with conda/mamba](#installing-with-conda)
@@ -37,7 +52,8 @@ Choose your preferred method of installation below or follow our [installation g

  ### **Installing with pip**

- You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
+ You can install DataEval directly from pypi.org using the following command.
+ The optional dependencies of DataEval are `all`.

  ```bash
  pip install dataeval[all]
@@ -45,8 +61,9 @@ pip install dataeval[all]

  ### **Installing with conda**

- DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
- are installed from the `pytorch` channel, the channel is specified in the below example.
+ DataEval can be installed in a Conda/Mamba environment using the provided
+ `environment.yaml` file. As some dependencies are installed from the `pytorch`
+ channel, the channel is specified in the below example.

  ```bash
  micromamba create -f environment\environment.yaml -c pytorch
@@ -54,7 +71,9 @@ micromamba create -f environment\environment.yaml -c pytorch

  ### **Installing from GitHub**

- To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to
+ download larger, binary source files and `poetry` for project dependency
+ management.

  ```bash
  sudo apt-get install git-lfs
@@ -74,7 +93,9 @@ Install DataEval with optional dependencies for development.
  poetry install --all-extras --with dev
  ```

- Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
+ Now that DataEval is installed, you can run commands in the poetry virtual
+ environment by prefixing shell commands with `poetry run`, or activate the
+ virtual environment directly in the shell.

  ```bash
  poetry shell
@@ -93,6 +114,10 @@ If you have any questions, feel free to reach out to the people below:

  ### CDAO Funding Acknowledgement

- This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
+ This material is based upon work supported by the Chief Digital and Artificial
+ Intelligence Office under Contract No. W519TC-23-9-2033. The views and
+ conclusions contained herein are those of the author(s) and should not be
+ interpreted as necessarily representing the official policies or endorsements,
+ either expressed or implied, of the U.S. Government.

  <!-- end acknowledgement -->
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "dataeval"
- version = "0.76.0" # dynamic
+ version = "0.81.0" # dynamic
  description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
  license = "MIT"
  readme = "README.md"
@@ -42,6 +42,9 @@ packages = [
  [tool.poetry.dependencies]
  # required
  python = ">=3.9,<3.13"
+ defusedxml = {version = ">=0.7.1"}
+ fast_hdbscan = {version = "0.2.0"} # 0.2.1 hits a bug in condense_tree comparing float to none
+ numba = {version = ">=0.59.1"}
  numpy = {version = ">=1.24.2"}
  pillow = {version = ">=10.3.0"}
  requests = {version = "*"}
@@ -54,10 +57,11 @@ typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
  xxhash = {version = ">=3.3"}

  # optional
- matplotlib = {version = "*", optional = true}
+ matplotlib = {version = ">=3.7.1", optional = true}
+ pandas = {version = ">=2.0", optional = true}

  [tool.poetry.extras]
- all = ["matplotlib"]
+ all = ["matplotlib", "pandas"]

  [tool.poetry.group.dev]
  optional = true
@@ -81,20 +85,20 @@ coverage = {version = "*", extras = ["toml"]}
  pyright = {version = "*", extras = ["nodejs"]}
  # prototype
  maite = {version = "*"}
- pandas = {version = "*"}
  seaborn = {version = "*"}
  # docs
  certifi = {version = ">=2024.07.04"}
  enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
  ipykernel = {version = ">=6.26.0"}
  ipywidgets = {version = ">=8.1.1"}
- jinja2 = {version = ">=3.1.5"}
+ jinja2 = {version = ">=3.1.6"}
  jupyter-client = {version = ">=8.6.0"}
  jupyter-cache = {version = "*"}
  myst-nb = {version = ">=1.0.0"}
- sphinx-immaterial = {version = "*"}
  sphinx-autoapi = {version = "*"}
  sphinx-design = {version = "*"}
+ sphinx-immaterial = {version = "*"}
+ sphinx-new-tab-link = {version = "*"}
  sphinx-tabs = {version = "*"}
  Sphinx = {version = ">=7.2.6"}
  torchmetrics = {version = ">=1.0.0", source = "pytorch"}
@@ -128,6 +132,11 @@ reportMissingImports = false
  norecursedirs = ["prototype"]
  testpaths = ["tests"]
  addopts = ["--pythonwarnings=ignore::DeprecationWarning", "--verbose", "--durations=20", "--durations-min=1.0"]
+ markers = [
+     "required: marks tests for required features",
+     "optional: marks tests for optional features",
+     "requires_all: marks tests that require the all extras",
+ ]

  [tool.coverage.run]
  source = ["src/dataeval"]
@@ -142,8 +151,9 @@ exclude_also = [
  ]
  include = ["*/src/dataeval/*"]
  omit = [
-     "*/torch/blocks.py",
-     "*/torch/utils.py",
+     "*/torch/_blocks.py",
+     "*/_clusterer.py",
+     "*/_fast_mst.py",
  ]
  fail_under = 90

@@ -177,6 +187,9 @@ per-file-ignores = { "*.ipynb" = ["E402"] }
  [tool.ruff.lint.isort]
  known-first-party = ["dataeval"]

+ [tool.ruff.lint.flake8-builtins]
+ builtins-strict-checking = false
+
  [tool.ruff.format]
  quote-style = "double"
  indent-style = "space"
@@ -186,7 +199,7 @@ docstring-code-format = true
  docstring-code-line-length = "dynamic"

  [tool.codespell]
- skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
+ skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html,./docs/source/*/data'
  ignore-words-list = ["Hart"]

  [build-system]
@@ -7,12 +7,12 @@ shifts that impact performance of deployed models.

  from __future__ import annotations

- __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
- __version__ = "0.76.0"
+ __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
+ __version__ = "0.81.0"

  import logging

- from dataeval import detectors, metrics, utils, workflows
+ from dataeval import config, detectors, metrics, typing, utils, workflows

  logging.getLogger(__name__).addHandler(logging.NullHandler())

@@ -32,9 +32,23 @@ class Output:
          return f"{self.__class__.__name__}: {str(self.dict())}"

      def dict(self) -> dict[str, Any]:
+         """
+         Output attributes as a dictionary.
+
+         Returns
+         -------
+         dict[str, Any]
+         """
          return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}

      def meta(self) -> dict[str, Any]:
+         """
+         Execution metadata as a dictionary.
+
+         Returns
+         -------
+         dict[str, Any]
+         """
          return {k.removeprefix("_"): v for k, v in self.__dict__.items() if k.startswith("_")}

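The newly documented split is that `dict()` returns the public result attributes while `meta()` collects the underscore-prefixed execution metadata. A minimal sketch of that behavior, using a hypothetical subclass and a hypothetical metadata key for illustration (real outputs and their metadata are produced by DataEval functions decorated with `set_metadata`):

```python
from dataclasses import dataclass

from dataeval._output import Output


@dataclass(frozen=True)
class ToyOutput(Output):  # hypothetical subclass for illustration only
    drifted: bool = True
    threshold: float = 0.05


out = ToyOutput()
# "_duration" is an assumed key; real keys are written by the @set_metadata decorator
object.__setattr__(out, "_duration", 0.01)
print(out.dict())  # {'drifted': True, 'threshold': 0.05}
print(out.meta())  # {'duration': 0.01}
```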
@@ -0,0 +1,77 @@
+ """
+ Global configuration settings for DataEval.
+ """
+
+ from __future__ import annotations
+
+ __all__ = ["get_device", "set_device", "get_max_processes", "set_max_processes"]
+
+ import torch
+ from torch import device
+
+ _device: device | None = None
+ _processes: int | None = None
+
+
+ def set_device(device: str | device | int) -> None:
+     """
+     Sets the default device to use when executing against a PyTorch backend.
+
+     Parameters
+     ----------
+     device : str or int or `torch.device`
+         The default device to use. See `torch.device <https://pytorch.org/docs/stable/tensor_attributes.html#torch.device>`_
+         documentation for more information.
+     """
+     global _device
+     _device = torch.device(device)
+
+
+ def get_device(override: str | device | int | None = None) -> torch.device:
+     """
+     Returns the PyTorch device to use.
+
+     Parameters
+     ----------
+     override : str or int or `torch.device` or None, default None
+         The user specified override if provided, otherwise returns the default device.
+
+     Returns
+     -------
+     `torch.device`
+     """
+     if override is None:
+         global _device
+         return torch.get_default_device() if _device is None else _device
+     else:
+         return torch.device(override)
+
+
+ def set_max_processes(processes: int | None) -> None:
+     """
+     Sets the maximum number of worker processes to use when running tasks that support parallel processing.
+
+     Parameters
+     ----------
+     processes : int or None
+         The maximum number of worker processes to use, or None to use
+         `os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
+         to determine the number of worker processes.
+     """
+     global _processes
+     _processes = processes
+
+
+ def get_max_processes() -> int | None:
+     """
+     Returns the maximum number of worker processes to use when running tasks that support parallel processing.
+
+     Returns
+     -------
+     int or None
+         The maximum number of worker processes to use, or None to use
+         `os.process_cpu_count <https://docs.python.org/3/library/os.html#os.process_cpu_count>`_
+         to determine the number of worker processes.
+     """
+     global _processes
+     return _processes
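Since `config` is new in this release, a short usage sketch of the four exported functions may help; the CUDA device string is only an example, and constructing a `torch.device` does not require a CUDA build:

```python
import torch

from dataeval import config

config.set_device("cuda:0")  # package-wide default device for torch-backed code
assert config.get_device() == torch.device("cuda:0")
assert config.get_device("cpu") == torch.device("cpu")  # an explicit override wins

config.set_max_processes(4)  # cap worker processes for parallelizable tasks
assert config.get_max_processes() == 4
config.set_max_processes(None)  # fall back to os.process_cpu_count()
```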
@@ -4,4 +4,4 @@ Detectors can determine if a dataset or individual images in a dataset are indic

  __all__ = ["drift", "linters", "ood"]

- from dataeval.detectors import drift, linters, ood
+ from . import drift, linters, ood
@@ -0,0 +1,22 @@
+ """
+ :term:`Drift` detectors identify if the statistical properties of the data has changed.
+ """
+
+ __all__ = [
+     "DriftCVM",
+     "DriftKS",
+     "DriftMMD",
+     "DriftMMDOutput",
+     "DriftOutput",
+     "DriftUncertainty",
+     "preprocess_drift",
+     "updates",
+ ]
+
+ from dataeval.detectors.drift import updates
+ from dataeval.detectors.drift._base import DriftOutput
+ from dataeval.detectors.drift._cvm import DriftCVM
+ from dataeval.detectors.drift._ks import DriftKS
+ from dataeval.detectors.drift._mmd import DriftMMD, DriftMMDOutput
+ from dataeval.detectors.drift._torch import preprocess_drift
+ from dataeval.detectors.drift._uncertainty import DriftUncertainty
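A hedged sketch of the re-exported surface, using only names from the `__all__` above; the `DriftKS` constructor signature is an assumption (the detector modules are not shown in full here), while `score` matches the base-class method in `_base.py` below:

```python
import numpy as np

from dataeval.detectors.drift import DriftKS

rng = np.random.default_rng(0)
x_ref = rng.normal(size=(500, 16)).astype(np.float32)  # reference batch
x_test = rng.normal(size=(100, 16)).astype(np.float32)  # batch to check for drift

detector = DriftKS(x_ref)  # assumed: reference data as the first argument
p_vals, distances = detector.score(x_test)  # per-feature p-values and test statistics
```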
@@ -10,16 +10,18 @@ from __future__ import annotations

  __all__ = []

+ import math
  from abc import ABC, abstractmethod
  from dataclasses import dataclass
  from functools import wraps
  from typing import Any, Callable, Literal, TypeVar

  import numpy as np
- from numpy.typing import ArrayLike, NDArray
+ from numpy.typing import NDArray

- from dataeval.interop import as_numpy
- from dataeval.output import Output, set_metadata
+ from dataeval._output import Output, set_metadata
+ from dataeval.typing import Array, ArrayLike
+ from dataeval.utils._array import as_numpy, to_numpy

  R = TypeVar("R")

@@ -46,16 +48,9 @@ class UpdateStrategy(ABC):
  class DriftBaseOutput(Output):
      """
      Base output class for Drift Detector classes
-
-     Attributes
-     ----------
-     is_drift : bool
-         Drift prediction for the images
-     threshold : float
-         Threshold after multivariate correction if needed
      """

-     is_drift: bool
+     drifted: bool
      threshold: float
      p_val: float
      distance: float
@@ -64,14 +59,18 @@
  @dataclass(frozen=True)
  class DriftOutput(DriftBaseOutput):
      """
-     Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
+     Output class for :class:`.DriftCVM`, :class:`.DriftKS`, and :class:`.DriftUncertainty` drift detectors.

      Attributes
      ----------
-     is_drift : bool
+     drifted : bool
          :term:`Drift` prediction for the images
      threshold : float
          Threshold after multivariate correction if needed
+     p_val : float
+         Instance-level p-value
+     distance : float
+         Instance-level distance
      feature_drift : NDArray
          Feature-level array of images detected to have drifted
      feature_threshold : float
@@ -82,7 +81,7 @@ class DriftOutput(DriftBaseOutput):
          Feature-level distances
      """

-     # is_drift: bool
+     # drifted: bool
      # threshold: float
      # p_val: float
      # distance: float
@@ -196,7 +195,7 @@ class BaseDrift:
          if correction not in ["bonferroni", "fdr"]:
              raise ValueError("`correction` must be `bonferroni` or `fdr`.")

-         self._x_ref = as_numpy(x_ref)
+         self._x_ref = x_ref
          self.x_ref_preprocessed: bool = x_ref_preprocessed

          # Other attributes
@@ -204,25 +203,25 @@ class BaseDrift:
          self.update_x_ref = update_x_ref
          self.preprocess_fn = preprocess_fn
          self.correction = correction
-         self.n: int = len(self._x_ref)
+         self.n: int = len(x_ref)

          # Ref counter for preprocessed x
          self._x_refcount = 0

      @property
-     def x_ref(self) -> NDArray[Any]:
+     def x_ref(self) -> ArrayLike:
          """
          Retrieve the reference data, applying preprocessing if not already done.

          Returns
          -------
-         NDArray
+         ArrayLike
              The reference dataset (`x_ref`), preprocessed if needed.
          """
          if not self.x_ref_preprocessed:
              self.x_ref_preprocessed = True
              if self.preprocess_fn is not None:
-                 self._x_ref = as_numpy(self.preprocess_fn(self._x_ref))
+                 self._x_ref = self.preprocess_fn(self._x_ref)

          return self._x_ref

@@ -323,32 +322,44 @@ class BaseDriftUnivariate(BaseDrift):
          # lazy process n_features as needed
          if not isinstance(self._n_features, int):
              # compute number of features for the univariate tests
-             if not isinstance(self.preprocess_fn, Callable) or self.x_ref_preprocessed:
-                 # infer features from preprocessed reference data
-                 self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
-             else:
-                 # infer number of features after applying preprocessing step
-                 x = as_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
-                 self._n_features = x.reshape(x.shape[0], -1).shape[-1]
+             x_ref = (
+                 self.x_ref
+                 if self.preprocess_fn is None or self.x_ref_preprocessed
+                 else self.preprocess_fn(self._x_ref[0:1])
+             )
+             # infer features from preprocessed reference data
+             shape = x_ref.shape if isinstance(x_ref, Array) else as_numpy(x_ref).shape
+             self._n_features = int(math.prod(shape[1:]))  # Multiplies all channel sizes after first

          return self._n_features

      @preprocess_x
-     @abstractmethod
      def score(self, x: ArrayLike) -> tuple[NDArray[np.float32], NDArray[np.float32]]:
          """
-         Abstract method to calculate feature scores after preprocessing.
+         Calculates p-values and test statistics per feature.

          Parameters
          ----------
          x : ArrayLike
-             The batch of data to calculate univariate :term:`drift<Drift>` scores for each feature.
+             Batch of instances

          Returns
          -------
          tuple[NDArray, NDArray]
-             A tuple containing p-values and distance :term:`statistics<Statistics>` for each feature.
+             Feature level p-values and test statistics
          """
+         x_np = to_numpy(x)
+         x_np = x_np.reshape(x_np.shape[0], -1)
+         x_ref_np = as_numpy(self.x_ref)
+         x_ref_np = x_ref_np.reshape(x_ref_np.shape[0], -1)
+         p_val = np.zeros(self.n_features, dtype=np.float32)
+         dist = np.zeros_like(p_val)
+         for f in range(self.n_features):
+             dist[f], p_val[f] = self._score_fn(x_ref_np[:, f], x_np[:, f])
+         return p_val, dist
+
+     @abstractmethod
+     def _score_fn(self, x: NDArray[np.float32], y: NDArray[np.float32]) -> tuple[np.float32, np.float32]: ...

      def _apply_correction(self, p_vals: NDArray) -> tuple[bool, float]:
          """