dataeval 0.74.2__tar.gz → 0.75.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. dataeval-0.75.0/PKG-INFO +136 -0
  2. dataeval-0.75.0/README.md +97 -0
  3. {dataeval-0.74.2 → dataeval-0.75.0}/pyproject.toml +12 -13
  4. dataeval-0.75.0/src/dataeval/__init__.py +40 -0
  5. dataeval-0.75.0/src/dataeval/detectors/drift/__init__.py +22 -0
  6. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/base.py +1 -1
  7. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/cvm.py +1 -1
  8. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/ks.py +1 -1
  9. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/mmd.py +6 -5
  10. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/torch.py +12 -12
  11. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/uncertainty.py +3 -2
  12. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/linters/clusterer.py +2 -7
  13. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/linters/duplicates.py +6 -10
  14. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/linters/outliers.py +4 -2
  15. dataeval-0.75.0/src/dataeval/detectors/ood/__init__.py +8 -0
  16. dataeval-0.74.2/src/dataeval/detectors/ood/ae_torch.py → dataeval-0.75.0/src/dataeval/detectors/ood/ae.py +6 -4
  17. dataeval-0.74.2/src/dataeval/detectors/ood/base_torch.py → dataeval-0.75.0/src/dataeval/detectors/ood/base.py +6 -5
  18. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/ood/metadata_ks_compare.py +34 -42
  19. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/ood/metadata_least_likely.py +3 -3
  20. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +6 -5
  21. dataeval-0.74.2/src/dataeval/detectors/ood/base.py → dataeval-0.75.0/src/dataeval/detectors/ood/mixin.py +11 -72
  22. dataeval-0.75.0/src/dataeval/detectors/ood/output.py +63 -0
  23. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/interop.py +6 -5
  24. dataeval-0.74.2/src/dataeval/logging.py → dataeval-0.75.0/src/dataeval/log.py +2 -0
  25. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/bias/__init__.py +9 -12
  26. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/bias/balance.py +10 -8
  27. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/bias/coverage.py +52 -4
  28. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/bias/diversity.py +42 -14
  29. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/bias/parity.py +15 -12
  30. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/estimators/ber.py +3 -1
  31. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/estimators/divergence.py +1 -1
  32. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/estimators/uap.py +1 -1
  33. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/base.py +4 -4
  34. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/boxratiostats.py +8 -9
  35. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/datasetstats.py +10 -14
  36. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/dimensionstats.py +4 -4
  37. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/hashstats.py +12 -8
  38. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/labelstats.py +5 -5
  39. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/pixelstats.py +4 -9
  40. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/visualstats.py +4 -9
  41. dataeval-0.75.0/src/dataeval/utils/__init__.py +9 -0
  42. dataeval-0.75.0/src/dataeval/utils/dataset/__init__.py +7 -0
  43. {dataeval-0.74.2/src/dataeval/utils/torch → dataeval-0.75.0/src/dataeval/utils/dataset}/datasets.py +2 -0
  44. dataeval-0.75.0/src/dataeval/utils/dataset/read.py +63 -0
  45. dataeval-0.74.2/src/dataeval/utils/split_dataset.py → dataeval-0.75.0/src/dataeval/utils/dataset/split.py +38 -30
  46. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/image.py +2 -2
  47. dataeval-0.74.2/src/dataeval/metrics/bias/metadata_preprocessing.py → dataeval-0.75.0/src/dataeval/utils/metadata.py +309 -11
  48. dataeval-0.74.2/src/dataeval/metrics/bias/metadata_utils.py → dataeval-0.75.0/src/dataeval/utils/plot.py +1 -104
  49. dataeval-0.75.0/src/dataeval/utils/torch/__init__.py +10 -0
  50. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/torch/gmm.py +29 -6
  51. dataeval-0.74.2/src/dataeval/utils/torch/utils.py → dataeval-0.75.0/src/dataeval/utils/torch/internal.py +82 -58
  52. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/torch/models.py +10 -8
  53. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/torch/trainer.py +6 -85
  54. dataeval-0.75.0/src/dataeval/workflows/__init__.py +7 -0
  55. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/workflows/sufficiency.py +16 -6
  56. dataeval-0.74.2/PKG-INFO +0 -120
  57. dataeval-0.74.2/README.md +0 -81
  58. dataeval-0.74.2/src/dataeval/__init__.py +0 -36
  59. dataeval-0.74.2/src/dataeval/detectors/drift/__init__.py +0 -20
  60. dataeval-0.74.2/src/dataeval/detectors/ood/__init__.py +0 -15
  61. dataeval-0.74.2/src/dataeval/utils/__init__.py +0 -18
  62. dataeval-0.74.2/src/dataeval/utils/gmm.py +0 -26
  63. dataeval-0.74.2/src/dataeval/utils/metadata.py +0 -278
  64. dataeval-0.74.2/src/dataeval/utils/torch/__init__.py +0 -25
  65. dataeval-0.74.2/src/dataeval/workflows/__init__.py +0 -10
  66. {dataeval-0.74.2 → dataeval-0.75.0}/LICENSE.txt +0 -0
  67. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/__init__.py +2 -2
  68. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/drift/updates.py +0 -0
  69. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/linters/__init__.py +4 -4
  70. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/detectors/linters/merged_stats.py +0 -0
  71. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/__init__.py +2 -2
  72. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/estimators/__init__.py +2 -2
  73. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/metrics/stats/__init__.py +18 -18
  74. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/output.py +0 -0
  75. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/py.typed +0 -0
  76. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/shared.py +0 -0
  77. {dataeval-0.74.2 → dataeval-0.75.0}/src/dataeval/utils/torch/blocks.py +0 -0
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.1
2
+ Name: dataeval
3
+ Version: 0.75.0
4
+ Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
+ Home-page: https://dataeval.ai/
6
+ License: MIT
7
+ Author: Andrew Weng
8
+ Author-email: andrew.weng@ariacoustics.com
9
+ Maintainer: ARiA
10
+ Maintainer-email: dataeval@ariacoustics.com
11
+ Requires-Python: >=3.9,<3.13
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3 :: Only
22
+ Classifier: Topic :: Scientific/Engineering
23
+ Provides-Extra: all
24
+ Requires-Dist: matplotlib ; extra == "all"
25
+ Requires-Dist: numpy (>=1.24.3)
26
+ Requires-Dist: pillow (>=10.3.0)
27
+ Requires-Dist: requests
28
+ Requires-Dist: scikit-learn (>=1.5.0)
29
+ Requires-Dist: scipy (>=1.10)
30
+ Requires-Dist: torch (>=2.2.0)
31
+ Requires-Dist: torchvision (>=0.17.0)
32
+ Requires-Dist: tqdm
33
+ Requires-Dist: typing-extensions (>=4.12) ; python_version >= "3.9" and python_version < "4.0"
34
+ Requires-Dist: xxhash (>=3.3)
35
+ Project-URL: Documentation, https://dataeval.readthedocs.io/
36
+ Project-URL: Repository, https://github.com/aria-ml/dataeval/
37
+ Description-Content-Type: text/markdown
38
+
39
+ # DataEval
40
+
41
+ To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
42
+
43
+ ## About DataEval
44
+
45
+ <!-- start tagline -->
46
+
47
+ DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
48
+
49
+ <!-- end tagline -->
50
+
51
+ ### Our mission
52
+
53
+ <!-- start needs -->
54
+
55
+ DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports **model development, data analysis, and monitoring** with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple but effective metrics for performance estimation, bias detection, and dataset linting.
56
+
57
+ <!-- end needs -->
58
+
59
+ <!-- start JATIC interop -->
60
+ DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
61
+ DataEval also has native interoperability with JATIC's suite of tools when using MAITE-compliant datasets and models.
62
+ <!-- end JATIC interop -->
63
+
64
+ ## Getting Started
65
+
66
+ **Python versions:** 3.9 - 3.12
67
+
68
+ **Supported packages**: *NumPy*, *Pandas*, *scikit-learn*, *MAITE*, *NRTK*, *Gradient*
69
+
70
+ Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
71
+
72
+ * [Installing with pip](#installing-with-pip)
73
+ * [Installing with conda/mamba](#installing-with-conda)
74
+ * [Installing from GitHub](#installing-from-github)
75
+
76
+ ### **Installing with pip**
77
+ You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are grouped under the `all` extra.
78
+
79
+ ```
80
+ pip install dataeval[all]
81
+ ```
82
+
83
+ ### **Installing with conda**
84
+
85
+ DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
86
+ are installed from the `pytorch` channel, the channel is specified in the below example.
87
+
88
+ ```
89
+ micromamba create -f environment/environment.yaml -c pytorch
90
+ ```
91
+
92
+ ### **Installing from GitHub**
93
+
94
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
95
+
96
+ ```
97
+ sudo apt-get install git-lfs
98
+ pip install poetry
99
+ ```
100
+
101
+ Pull the source down and change to the DataEval project directory.
102
+ ```
103
+ git clone https://github.com/aria-ml/dataeval.git
104
+ cd dataeval
105
+ ```
106
+
107
+ Install DataEval with optional dependencies for development.
108
+ ```
109
+ poetry install --all-extras --with dev
110
+ ```
111
+
112
+ Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
113
+ ```
114
+ poetry shell
115
+ ```
116
+
117
+ ## Contact Us
118
+
119
+ If you have any questions, feel free to reach out to the people below:
120
+
121
+ - **POC**: Scott Swan @scott.swan
122
+ - **DPOC**: Andrew Weng @aweng
123
+
124
+ ## Acknowledgement
125
+
126
+ <!-- start attribution -->
127
+
128
+ ### Alibi-Detect
129
+ This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
130
+ Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
131
+
132
+ ### CDAO Funding Acknowledgement
133
+ This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
134
+
135
+ <!-- end attribution -->
136
+
@@ -0,0 +1,97 @@
1
+ # DataEval
2
+
3
+ To view our extensive collection of tutorials, how-to's, explanation guides, and reference material, please visit our documentation on **[Read the Docs](https://dataeval.readthedocs.io/)**
4
+
5
+ ## About DataEval
6
+
7
+ <!-- start tagline -->
8
+
9
+ DataEval curates datasets to train and test performant, robust, unbiased and reliable AI models and monitors for data shifts that impact performance of deployed models.
10
+
11
+ <!-- end tagline -->
12
+
13
+ ### Our mission
14
+
15
+ <!-- start needs -->
16
+
17
+ DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports **model development, data analysis, and monitoring** with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple but effective metrics for performance estimation, bias detection, and dataset linting.
18
+
19
+ <!-- end needs -->
20
+
21
+ <!-- start JATIC interop -->
22
+ DataEval is easy to install, supports a wide range of Python versions, and is compatible with many of the most popular packages in the scientific and T&E communities.
23
+ DataEval also has native interoperability with JATIC's suite of tools when using MAITE-compliant datasets and models.
24
+ <!-- end JATIC interop -->
25
+
26
+ ## Getting Started
27
+
28
+ **Python versions:** 3.9 - 3.12
29
+
30
+ **Supported packages**: *NumPy*, *Pandas*, *scikit-learn*, *MAITE*, *NRTK*, *Gradient*
31
+
32
+ Choose your preferred method of installation below or follow our [installation guide](https://dataeval.readthedocs.io/en/v0.74.2/installation.html).
33
+
34
+ * [Installing with pip](#installing-with-pip)
35
+ * [Installing with conda/mamba](#installing-with-conda)
36
+ * [Installing from GitHub](#installing-from-github)
37
+
38
+ ### **Installing with pip**
39
+ You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are grouped under the `all` extra.
40
+
41
+ ```
42
+ pip install dataeval[all]
43
+ ```
44
+
45
+ ### **Installing with conda**
46
+
47
+ DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
48
+ are installed from the `pytorch` channel, the channel is specified in the below example.
49
+
50
+ ```
51
+ micromamba create -f environment/environment.yaml -c pytorch
52
+ ```
53
+
54
+ ### **Installing from GitHub**
55
+
56
+ To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
57
+
58
+ ```
59
+ sudo apt-get install git-lfs
60
+ pip install poetry
61
+ ```
62
+
63
+ Pull the source down and change to the DataEval project directory.
64
+ ```
65
+ git clone https://github.com/aria-ml/dataeval.git
66
+ cd dataeval
67
+ ```
68
+
69
+ Install DataEval with optional dependencies for development.
70
+ ```
71
+ poetry install --all-extras --with dev
72
+ ```
73
+
74
+ Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
75
+ ```
76
+ poetry shell
77
+ ```
78
+
79
+ ## Contact Us
80
+
81
+ If you have any questions, feel free to reach out to the people below:
82
+
83
+ - **POC**: Scott Swan @scott.swan
84
+ - **DPOC**: Andrew Weng @aweng
85
+
86
+ ## Acknowledgement
87
+
88
+ <!-- start attribution -->
89
+
90
+ ### Alibi-Detect
91
+ This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
92
+ Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
93
+
94
+ ### CDAO Funding Acknowledgement
95
+ This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
96
+
97
+ <!-- end attribution -->
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dataeval"
3
- version = "0.74.2" # dynamic
3
+ version = "0.75.0" # dynamic
4
4
  description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
5
5
  license = "MIT"
6
6
  readme = "README.md"
@@ -44,20 +44,20 @@ packages = [
44
44
  python = ">=3.9,<3.13"
45
45
  numpy = {version = ">=1.24.3"}
46
46
  pillow = {version = ">=10.3.0"}
47
+ requests = {version = "*"}
47
48
  scipy = {version = ">=1.10"}
48
49
  scikit-learn = {version = ">=1.5.0"}
50
+ torch = {version = ">=2.2.0", source = "pytorch"}
51
+ torchvision = {version = ">=0.17.0", source = "pytorch"}
49
52
  tqdm = {version = "*"}
50
- typing-extensions = {version = ">=4.12", python = ">=3.9,<3.10"} # ParamSpec
53
+ typing-extensions = {version = ">=4.12", python = "^3.9"} # ParamSpec
51
54
  xxhash = {version = ">=3.3"}
52
55
 
53
56
  # optional
54
57
  matplotlib = {version = "*", optional = true}
55
- torch = {version = ">=2.2.0", source = "pytorch", optional = true}
56
- torchvision = {version = ">=0.17.0", source = "pytorch", optional = true}
57
58
 
58
59
  [tool.poetry.extras]
59
- torch = ["torch", "torchvision"]
60
- all = ["matplotlib", "torch", "torchvision"]
60
+ all = ["matplotlib"]
61
61
 
62
62
  [tool.poetry.group.dev]
63
63
  optional = true
@@ -65,9 +65,10 @@ optional = true
65
65
  [tool.poetry.group.dev.dependencies]
66
66
  nox = {version = "*", extras = ["uv"]}
67
67
  uv = {version = "*"}
68
- poetry = {version = "*"}
68
+ poetry = {version = "<2"}
69
69
  poetry-lock-groups-plugin = {version = "*"}
70
70
  poetry2conda = {version = "*"}
71
+ numpy = {version = ">=2.0.2"}
71
72
  # lint
72
73
  ruff = {version = "*"}
73
74
  codespell = {version = "*", extras = ["toml"]}
@@ -76,14 +77,12 @@ pytest = {version = "*"}
76
77
  pytest-cov = {version = "*"}
77
78
  pytest-xdist = {version = "*"}
78
79
  coverage = {version = "*", extras = ["toml"]}
79
- torchmetrics = {version = ">=1.0.0", source = "pytorch"}
80
80
  # type
81
81
  pyright = {version = "*", extras = ["nodejs"]}
82
82
  # prototype
83
83
  maite = {version = "*"}
84
84
  pandas = {version = "*"}
85
85
  seaborn = {version = "*"}
86
- numpy = {version = ">=2.0.2"}
87
86
  # docs
88
87
  certifi = {version = ">=2024.07.04"}
89
88
  enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
@@ -93,9 +92,11 @@ jupyter-client = {version = ">=8.6.0"}
93
92
  jupyter-cache = {version = "*"}
94
93
  myst-nb = {version = ">=1.0.0"}
95
94
  pydata-sphinx-theme = {version = ">=0.15.4"}
95
+ sphinx-autoapi = {version = "*"}
96
96
  sphinx-design = {version = "*"}
97
97
  sphinx-tabs = {version = "*"}
98
98
  Sphinx = {version = ">=7.2.6"}
99
+ torchmetrics = {version = ">=1.0.0", source = "pytorch"}
99
100
  markupsafe = {version = "<3.0.2", optional = true}
100
101
 
101
102
  [[tool.poetry.source]]
@@ -136,8 +137,6 @@ parallel = true
136
137
  [tool.coverage.report]
137
138
  exclude_also = [
138
139
  "raise NotImplementedError",
139
- "if _IS_TORCH_AVAILABLE",
140
- "if _IS_TORCHVISION_AVAILABLE",
141
140
  ]
142
141
  include = ["*/src/dataeval/*"]
143
142
  omit = [
@@ -155,7 +154,7 @@ exclude = [
155
154
  ".jupyter_cache",
156
155
  "*env*",
157
156
  "output",
158
- "_build",
157
+ "build",
159
158
  ".nox",
160
159
  ".tox",
161
160
  "prototype",
@@ -185,7 +184,7 @@ docstring-code-format = true
185
184
  docstring-code-line-length = "dynamic"
186
185
 
187
186
  [tool.codespell]
188
- skip = './*env*,./prototype,./output,./docs/_build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
187
+ skip = './*env*,./prototype,./output,./docs/build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
189
188
  ignore-words-list = ["Hart"]
190
189
 
191
190
  [build-system]
@@ -0,0 +1,40 @@
1
+ """
2
+ DataEval provides a simple interface to characterize image data and its impact on model performance
3
+ across classification and object-detection tasks. It also provides capabilities to select and curate
4
+ datasets to test and train performant, robust, unbiased and reliable AI models and monitor for data
5
+ shifts that impact performance of deployed models.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
11
+ __version__ = "0.75.0"
12
+
13
+ import logging
14
+
15
+ from dataeval import detectors, metrics, utils, workflows
16
+
17
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
18
+
19
+
20
+ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> None:
21
+ """
22
+ Helper for quickly adding a StreamHandler to the logger. Useful for debugging.
23
+
24
+ Parameters
25
+ ----------
26
+ level : int, default logging.DEBUG(10)
27
+ Set the logging level for the logger
28
+ handler : logging.Handler, optional
29
+ Sets the logging handler for the logger if provided, otherwise logger will be
30
+ provided with a StreamHandler
31
+ """
32
+ import logging
33
+
34
+ logger = logging.getLogger(__name__)
35
+ if handler is None:
36
+ handler = logging.StreamHandler() if handler is None else handler
37
+ handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
38
+ logger.addHandler(handler)
39
+ logger.setLevel(level)
40
+ logger.debug(f"Added logging handler {handler} to logger: {__name__}")
@@ -0,0 +1,22 @@
1
+ """
2
+ :term:`Drift` detectors identify whether the statistical properties of the data have changed.
3
+ """
4
+
5
+ __all__ = [
6
+ "DriftCVM",
7
+ "DriftKS",
8
+ "DriftMMD",
9
+ "DriftMMDOutput",
10
+ "DriftOutput",
11
+ "DriftUncertainty",
12
+ "preprocess_drift",
13
+ "updates",
14
+ ]
15
+
16
+ from dataeval.detectors.drift import updates
17
+ from dataeval.detectors.drift.base import DriftOutput
18
+ from dataeval.detectors.drift.cvm import DriftCVM
19
+ from dataeval.detectors.drift.ks import DriftKS
20
+ from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
21
+ from dataeval.detectors.drift.torch import preprocess_drift
22
+ from dataeval.detectors.drift.uncertainty import DriftUncertainty
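Editor's note: a rough usage sketch of the drift detectors re-exported above. It assumes the alibi-detect-style interface these classes expose (reference data in the constructor, `predict()` on new batches); the `p_val` keyword, the array shapes, and the `is_drift` output field are assumptions, not confirmed by this diff.

```python
# Sketch only: drift detection on feature arrays with the 0.75.0 public API.
import numpy as np

from dataeval.detectors.drift import DriftKS

x_ref = np.random.default_rng(0).normal(size=(200, 16))           # reference features
x_new = np.random.default_rng(1).normal(loc=0.5, size=(200, 16))  # possibly shifted batch

detector = DriftKS(x_ref, p_val=0.05)   # keyword name assumed
result = detector.predict(x_new)        # returns a DriftOutput
print(result.is_drift)                  # True if drift was detected (field name assumed)
```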
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- __all__ = ["DriftOutput"]
11
+ __all__ = []
12
12
 
13
13
  from abc import ABC, abstractmethod
14
14
  from dataclasses import dataclass
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- __all__ = ["DriftCVM"]
11
+ __all__ = []
12
12
 
13
13
  from typing import Callable, Literal
14
14
 
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- __all__ = ["DriftKS"]
11
+ __all__ = []
12
12
 
13
13
  from typing import Callable, Literal
14
14
 
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- __all__ = ["DriftMMD", "DriftMMDOutput"]
11
+ __all__ = []
12
12
 
13
13
  from dataclasses import dataclass
14
14
  from typing import Callable
@@ -17,9 +17,10 @@ import torch
17
17
  from numpy.typing import ArrayLike
18
18
 
19
19
  from dataeval.detectors.drift.base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
20
- from dataeval.detectors.drift.torch import _GaussianRBF, _mmd2_from_kernel_matrix, get_device
20
+ from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix
21
21
  from dataeval.interop import as_numpy
22
22
  from dataeval.output import set_metadata
23
+ from dataeval.utils.torch.internal import get_device
23
24
 
24
25
 
25
26
  @dataclass(frozen=True)
@@ -109,7 +110,7 @@ class DriftMMD(BaseDrift):
109
110
 
110
111
  # initialize kernel
111
112
  sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
112
- self._kernel = _GaussianRBF(sigma_tensor).to(self.device)
113
+ self._kernel = GaussianRBF(sigma_tensor).to(self.device)
113
114
 
114
115
  # compute kernel matrix for the reference data
115
116
  if self._infer_sigma or isinstance(sigma_tensor, torch.Tensor):
@@ -150,9 +151,9 @@ class DriftMMD(BaseDrift):
150
151
  n = x.shape[0]
151
152
  kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
152
153
  kernel_mat = kernel_mat - torch.diag(kernel_mat.diag()) # zero diagonal
153
- mmd2 = _mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
154
+ mmd2 = mmd2_from_kernel_matrix(kernel_mat, n, permute=False, zero_diag=False)
154
155
  mmd2_permuted = torch.Tensor(
155
- [_mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
156
+ [mmd2_from_kernel_matrix(kernel_mat, n, permute=True, zero_diag=False) for _ in range(self.n_permutations)]
156
157
  )
157
158
  mmd2, mmd2_permuted = mmd2.detach().cpu(), mmd2_permuted.detach().cpu()
158
159
  p_val = (mmd2 <= mmd2_permuted).float().mean()
@@ -17,10 +17,10 @@ import torch
17
17
  import torch.nn as nn
18
18
  from numpy.typing import NDArray
19
19
 
20
- from dataeval.utils.torch.utils import get_device, predict_batch
20
+ from dataeval.utils.torch.internal import get_device, predict_batch
21
21
 
22
22
 
23
- def _mmd2_from_kernel_matrix(
23
+ def mmd2_from_kernel_matrix(
24
24
  kernel_mat: torch.Tensor, m: int, permute: bool = False, zero_diag: bool = True
25
25
  ) -> torch.Tensor:
26
26
  """
@@ -127,7 +127,7 @@ def _squared_pairwise_distance(
127
127
 
128
128
  def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.Tensor:
129
129
  """
130
- Bandwidth estimation using the median heuristic :cite:t:`Gretton2012`.
130
+ Bandwidth estimation using the median heuristic `Gretton2012`
131
131
 
132
132
  Parameters
133
133
  ----------
@@ -151,7 +151,7 @@ def sigma_median(x: torch.Tensor, y: torch.Tensor, dist: torch.Tensor) -> torch.
151
151
  return sigma
152
152
 
153
153
 
154
- class _GaussianRBF(nn.Module):
154
+ class GaussianRBF(nn.Module):
155
155
  """
156
156
  Gaussian RBF kernel: k(x,y) = exp(-(1/(2*sigma^2)||x-y||^2).
157
157
 
@@ -179,18 +179,18 @@ class _GaussianRBF(nn.Module):
179
179
  ) -> None:
180
180
  super().__init__()
181
181
  init_sigma_fn = sigma_median if init_sigma_fn is None else init_sigma_fn
182
- self.config = {
182
+ self.config: dict[str, Any] = {
183
183
  "sigma": sigma,
184
184
  "trainable": trainable,
185
185
  "init_sigma_fn": init_sigma_fn,
186
186
  }
187
187
  if sigma is None:
188
- self.log_sigma = nn.Parameter(torch.empty(1), requires_grad=trainable)
189
- self.init_required = True
188
+ self.log_sigma: nn.Parameter = nn.Parameter(torch.empty(1), requires_grad=trainable)
189
+ self.init_required: bool = True
190
190
  else:
191
191
  sigma = sigma.reshape(-1) # [Ns,]
192
- self.log_sigma = nn.Parameter(sigma.log(), requires_grad=trainable)
193
- self.init_required = False
192
+ self.log_sigma: nn.Parameter = nn.Parameter(sigma.log(), requires_grad=trainable)
193
+ self.init_required: bool = False
194
194
  self.init_sigma_fn = init_sigma_fn
195
195
  self.trainable = trainable
196
196
 
@@ -200,8 +200,8 @@ class _GaussianRBF(nn.Module):
200
200
 
201
201
  def forward(
202
202
  self,
203
- x: np.ndarray | torch.Tensor,
204
- y: np.ndarray | torch.Tensor,
203
+ x: np.ndarray[Any, Any] | torch.Tensor,
204
+ y: np.ndarray[Any, Any] | torch.Tensor,
205
205
  infer_sigma: bool = False,
206
206
  ) -> torch.Tensor:
207
207
  x, y = torch.as_tensor(x), torch.as_tensor(y)
@@ -213,7 +213,7 @@ class _GaussianRBF(nn.Module):
213
213
  sigma = self.init_sigma_fn(x, y, dist)
214
214
  with torch.no_grad():
215
215
  self.log_sigma.copy_(sigma.log().clone())
216
- self.init_required = False
216
+ self.init_required: bool = False
217
217
 
218
218
  gamma = 1.0 / (2.0 * self.sigma**2) # [Ns,]
219
219
  # TODO: do matrix multiplication after all?
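Editor's note: `GaussianRBF` and `mmd2_from_kernel_matrix` are now public names in `dataeval.detectors.drift.torch` (previously `_GaussianRBF` and `_mmd2_from_kernel_matrix`). The sketch below mirrors how `DriftMMD` uses them in the hunks above; treat the exact call pattern as an assumption rather than documented API.

```python
# Sketch: compute a squared MMD between two samples with the now-public helpers.
import torch

from dataeval.detectors.drift.torch import GaussianRBF, mmd2_from_kernel_matrix

x = torch.randn(100, 8)  # reference sample
y = torch.randn(100, 8)  # test sample

kernel = GaussianRBF()                     # sigma unset; inferred via the median heuristic
xy = torch.cat([x, y], dim=0)
kernel_mat = kernel(xy, xy, infer_sigma=True)

# Squared MMD between the two halves of the concatenated sample
mmd2 = mmd2_from_kernel_matrix(kernel_mat, m=y.shape[0], permute=False, zero_diag=True)
print(float(mmd2))
```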
@@ -8,7 +8,7 @@ Licensed under Apache Software License (Apache 2.0)
8
8
 
9
9
  from __future__ import annotations
10
10
 
11
- __all__ = ["DriftUncertainty"]
11
+ __all__ = []
12
12
 
13
13
  from functools import partial
14
14
  from typing import Callable, Literal
@@ -20,7 +20,8 @@ from scipy.stats import entropy
20
20
 
21
21
  from dataeval.detectors.drift.base import DriftOutput, UpdateStrategy
22
22
  from dataeval.detectors.drift.ks import DriftKS
23
- from dataeval.detectors.drift.torch import get_device, preprocess_drift
23
+ from dataeval.detectors.drift.torch import preprocess_drift
24
+ from dataeval.utils.torch.internal import get_device
24
25
 
25
26
 
26
27
  def classifier_uncertainty(
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["ClustererOutput", "Clusterer"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Any, Iterable, NamedTuple, cast
@@ -147,12 +147,6 @@ class Clusterer:
147
147
  ----
148
148
  The Clusterer works best when the length of the feature dimension, P, is less than 500.
149
149
  If flattening a CxHxW image results in a dimension larger than 500, then it is recommended to reduce the dimensions.
150
-
151
- Example
152
- -------
153
- Initialize the Clusterer class:
154
-
155
- >>> cluster = Clusterer(dataset)
156
150
  """
157
151
 
158
152
  def __init__(self, dataset: ArrayLike) -> None:
@@ -506,6 +500,7 @@ class Clusterer:
506
500
 
507
501
  Example
508
502
  -------
503
+ >>> cluster = Clusterer(clusterer_images)
509
504
  >>> cluster.evaluate()
510
505
  ClustererOutput(outliers=[18, 21, 34, 35, 45], potential_outliers=[13, 15, 42], duplicates=[[9, 24], [23, 48]], potential_duplicates=[[1, 11]])
511
506
  """ # noqa: E501
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["DuplicatesOutput", "Duplicates"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Generic, Iterable, Sequence, TypeVar, overload
@@ -51,13 +51,6 @@ class Duplicates:
51
51
  ----------
52
52
  only_exact : bool, default False
53
53
  Only inspect the dataset for exact image matches
54
-
55
- Example
56
- -------
57
- Initialize the Duplicates class:
58
-
59
- >>> all_dupes = Duplicates()
60
- >>> exact_dupes = Duplicates(only_exact=True)
61
54
  """
62
55
 
63
56
  def __init__(self, only_exact: bool = False) -> None:
@@ -73,7 +66,8 @@ class Duplicates:
73
66
  if not self.only_exact:
74
67
  near_dict: dict[int, list] = {}
75
68
  for i, value in enumerate(stats["pchash"]):
76
- near_dict.setdefault(value, []).append(i)
69
+ if value:
70
+ near_dict.setdefault(value, []).append(i)
77
71
  near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
78
72
  else:
79
73
  near = []
@@ -112,6 +106,7 @@ class Duplicates:
112
106
 
113
107
  Example
114
108
  -------
109
+ >>> exact_dupes = Duplicates(only_exact=True)
115
110
  >>> exact_dupes.from_stats([hashes1, hashes2])
116
111
  DuplicatesOutput(exact=[{0: [3, 20]}, {0: [16], 1: [12]}], near=[])
117
112
  """
@@ -159,7 +154,8 @@ class Duplicates:
159
154
 
160
155
  Example
161
156
  -------
162
- >>> all_dupes.evaluate(images)
157
+ >>> all_dupes = Duplicates()
158
+ >>> all_dupes.evaluate(duplicate_images)
163
159
  DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
164
160
  """ # noqa: E501
165
161
  self.stats = hashstats(data)
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ["OutliersOutput", "Outliers"]
3
+ __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
@@ -188,6 +188,7 @@ class Outliers:
188
188
  -------
189
189
  Evaluate the dataset:
190
190
 
191
+ >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
191
192
  >>> results = outliers.from_stats([stats1, stats2])
192
193
  >>> len(results)
193
194
  2
@@ -248,7 +249,8 @@ class Outliers:
248
249
  -------
249
250
  Evaluate the dataset:
250
251
 
251
- >>> results = outliers.evaluate(images)
252
+ >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
253
+ >>> results = outliers.evaluate(outlier_images)
252
254
  >>> list(results.issues)
253
255
  [10, 12]
254
256
  >>> results.issues[10]
@@ -0,0 +1,8 @@
1
+ """
2
+ :term:`Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
3
+ """
4
+
5
+ __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
6
+
7
+ from dataeval.detectors.ood.ae import OOD_AE
8
+ from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
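Editor's note: a very rough sketch of the `OOD_AE` detector exposed above. The fit/predict pattern mirrors the pre-0.75 API, but the toy autoencoder, the `threshold_perc`/`epochs` keywords, and the `is_ood` output field are assumptions, not confirmed by this diff.

```python
# Sketch only: autoencoder-based OOD detection with the 0.75.0 public API.
import numpy as np
import torch.nn as nn

from dataeval.detectors.ood import OOD_AE

# Toy convolutional autoencoder standing in for your own model (hypothetical;
# it reconstructs 3x32x32 inputs at the same shape).
model = nn.Sequential(
    nn.Conv2d(3, 8, 3, stride=2, padding=1), nn.ReLU(),
    nn.ConvTranspose2d(8, 3, 3, stride=2, padding=1, output_padding=1),
)

train_images = np.random.default_rng(0).random((128, 3, 32, 32)).astype(np.float32)
test_images = np.random.default_rng(1).random((16, 3, 32, 32)).astype(np.float32)

detector = OOD_AE(model)
detector.fit(train_images, threshold_perc=95.0, epochs=3)  # keyword names assumed
result = detector.predict(test_images)                     # returns an OODOutput
print(result.is_ood)                                       # per-image flags (field name assumed)
```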