dataeval 0.75.0.tar.gz → 0.76.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. {dataeval-0.75.0 → dataeval-0.76.0}/LICENSE.txt +2 -2
  2. {dataeval-0.75.0 → dataeval-0.76.0}/PKG-INFO +18 -17
  3. {dataeval-0.75.0 → dataeval-0.76.0}/README.md +16 -15
  4. {dataeval-0.75.0 → dataeval-0.76.0}/pyproject.toml +6 -4
  5. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/__init__.py +3 -3
  6. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/base.py +2 -2
  7. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/ks.py +2 -1
  8. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/mmd.py +3 -2
  9. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/uncertainty.py +2 -2
  10. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/updates.py +1 -1
  11. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/linters/clusterer.py +3 -2
  12. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/linters/duplicates.py +4 -4
  13. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/linters/outliers.py +96 -3
  14. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/__init__.py +1 -1
  15. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/base.py +1 -17
  16. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/output.py +1 -1
  17. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/interop.py +1 -1
  18. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/__init__.py +1 -1
  19. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/bias/__init__.py +1 -1
  20. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/bias/balance.py +3 -3
  21. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/bias/coverage.py +1 -1
  22. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/bias/diversity.py +14 -10
  23. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/bias/parity.py +5 -5
  24. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/estimators/ber.py +4 -3
  25. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/estimators/divergence.py +3 -3
  26. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/estimators/uap.py +3 -3
  27. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/__init__.py +1 -1
  28. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/base.py +24 -8
  29. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/boxratiostats.py +5 -5
  30. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/datasetstats.py +39 -6
  31. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/dimensionstats.py +4 -4
  32. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/hashstats.py +2 -2
  33. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/labelstats.py +89 -6
  34. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/pixelstats.py +7 -5
  35. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/stats/visualstats.py +6 -4
  36. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/output.py +23 -14
  37. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/__init__.py +2 -2
  38. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/dataset/read.py +1 -1
  39. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/dataset/split.py +1 -1
  40. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/metadata.py +42 -44
  41. dataeval-0.76.0/src/dataeval/utils/plot.py +249 -0
  42. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/workflows/sufficiency.py +2 -2
  43. dataeval-0.75.0/src/dataeval/utils/plot.py +0 -126
  44. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/__init__.py +0 -0
  45. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/__init__.py +0 -0
  46. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/cvm.py +0 -0
  47. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/drift/torch.py +0 -0
  48. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/linters/__init__.py +0 -0
  49. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/linters/merged_stats.py +0 -0
  50. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/ae.py +0 -0
  51. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -0
  52. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
  53. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
  54. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/detectors/ood/mixin.py +0 -0
  55. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/log.py +0 -0
  56. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/metrics/estimators/__init__.py +0 -0
  57. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/py.typed +0 -0
  58. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/dataset/__init__.py +0 -0
  59. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/dataset/datasets.py +0 -0
  60. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/image.py +0 -0
  61. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/shared.py +0 -0
  62. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/__init__.py +0 -0
  63. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/blocks.py +0 -0
  64. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/gmm.py +0 -0
  65. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/internal.py +0 -0
  66. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/models.py +0 -0
  67. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/utils/torch/trainer.py +0 -0
  68. {dataeval-0.75.0 → dataeval-0.76.0}/src/dataeval/workflows/__init__.py +0 -0
--- dataeval-0.75.0/LICENSE.txt
+++ dataeval-0.76.0/LICENSE.txt
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 ARiA
+Copyright (c) 2025 ARiA
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+SOFTWARE.

--- dataeval-0.75.0/PKG-INFO
+++ dataeval-0.76.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.75.0
+Version: 0.76.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -22,7 +22,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
 Classifier: Topic :: Scientific/Engineering
 Provides-Extra: all
 Requires-Dist: matplotlib ; extra == "all"
-Requires-Dist: numpy (>=1.24.3)
+Requires-Dist: numpy (>=1.24.2)
 Requires-Dist: pillow (>=10.3.0)
 Requires-Dist: requests
 Requires-Dist: scikit-learn (>=1.5.0)
@@ -52,7 +52,7 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
 
 <!-- start needs -->
 
-DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports **model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
 
 <!-- end needs -->
 
@@ -74,9 +74,10 @@ Choose your preferred method of installation below or follow our [installation g
 * [Installing from GitHub](#installing-from-github)
 
 ### **Installing with pip**
+
 You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
 
-```
+```bash
 pip install dataeval[all]
 ```
 
@@ -85,7 +86,7 @@ pip install dataeval[all]
 DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
 are installed from the `pytorch` channel, the channel is specified in the below example.
 
-```
+```bash
 micromamba create -f environment\environment.yaml -c pytorch
 ```
 
@@ -93,24 +94,27 @@ micromamba create -f environment\environment.yaml -c pytorch
 
 To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
 
-```
+```bash
 sudo apt-get install git-lfs
 pip install poetry
 ```
 
 Pull the source down and change to the DataEval project directory.
-```
+
+```bash
 git clone https://github.com/aria-ml/dataeval.git
 cd dataeval
 ```
 
 Install DataEval with optional dependencies for development.
-```
+
+```bash
 poetry install --all-extras --with dev
 ```
 
 Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
-```
+
+```bash
 poetry shell
 ```
 
@@ -118,19 +122,16 @@ poetry shell
 
 If you have any questions, feel free to reach out to the people below:
 
-- **POC**: Scott Swan @scott.swan
-- **DPOC**: Andrew Weng @aweng
+* **POC**: Scott Swan @scott.swan
+* **DPOC**: Andrew Weng @aweng
 
 ## Acknowledgement
 
-<!-- start attribution -->
-
-### Alibi-Detect
-This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
-Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
+<!-- start acknowledgement -->
 
 ### CDAO Funding Acknowledgement
+
 This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
 
-<!-- end attribution -->
+<!-- end acknowledgement -->
 

--- dataeval-0.75.0/README.md
+++ dataeval-0.76.0/README.md
@@ -14,7 +14,7 @@ DataEval curates datasets to train and test performant, robust, unbiased and rel
 
 <!-- start needs -->
 
-DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports **model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
+DataEval is an effective, powerful, and reliable set of tools for any T&E engineer. Throughout all stages of the machine learning lifecycle, DataEval supports model development, data analysis, and monitoring with state-of-the-art algorithms to help you solve difficult problems. With a focus on computer vision tasks, DataEval provides simple, but effective metrics for performance estimation, bias detection, and dataset linting.
 
 <!-- end needs -->
 
@@ -36,9 +36,10 @@ Choose your preferred method of installation below or follow our [installation g
 * [Installing from GitHub](#installing-from-github)
 
 ### **Installing with pip**
+
 You can install DataEval directly from pypi.org using the following command. The optional dependencies of DataEval are `all`.
 
-```
+```bash
 pip install dataeval[all]
 ```
 
@@ -47,7 +48,7 @@ pip install dataeval[all]
 DataEval can be installed in a Conda/Mamba environment using the provided `environment.yaml` file. As some dependencies
 are installed from the `pytorch` channel, the channel is specified in the below example.
 
-```
+```bash
 micromamba create -f environment\environment.yaml -c pytorch
 ```
 
@@ -55,24 +56,27 @@ micromamba create -f environment\environment.yaml -c pytorch
 
 To install DataEval from source locally on Ubuntu, you will need `git-lfs` to download larger, binary source files and `poetry` for project dependency management.
 
-```
+```bash
 sudo apt-get install git-lfs
 pip install poetry
 ```
 
 Pull the source down and change to the DataEval project directory.
-```
+
+```bash
 git clone https://github.com/aria-ml/dataeval.git
 cd dataeval
 ```
 
 Install DataEval with optional dependencies for development.
-```
+
+```bash
 poetry install --all-extras --with dev
 ```
 
 Now that DataEval is installed, you can run commands in the poetry virtual environment by prefixing shell commands with `poetry run`, or activate the virtual environment directly in the shell.
-```
+
+```bash
 poetry shell
 ```
 
@@ -80,18 +84,15 @@ poetry shell
 
 If you have any questions, feel free to reach out to the people below:
 
-- **POC**: Scott Swan @scott.swan
-- **DPOC**: Andrew Weng @aweng
+* **POC**: Scott Swan @scott.swan
+* **DPOC**: Andrew Weng @aweng
 
 ## Acknowledgement
 
-<!-- start attribution -->
-
-### Alibi-Detect
-This project uses code from the [Alibi-Detect](https://github.com/SeldonIO/alibi-detect) Python library developed by SeldonIO.\
-Additional documentation from their developers is available on the [Alibi-Detect documentation page](https://docs.seldon.io/projects/alibi-detect/en/stable/).
+<!-- start acknowledgement -->
 
 ### CDAO Funding Acknowledgement
+
 This material is based upon work supported by the Chief Digital and Artificial Intelligence Office under Contract No. W519TC-23-9-2033. The views and conclusions contained herein are those of the author(s) and should not be interpreted as necessarily representing the official policies or endorsements, either expressed or implied, of the U.S. Government.
 
-<!-- end attribution -->
+<!-- end acknowledgement -->

--- dataeval-0.75.0/pyproject.toml
+++ dataeval-0.76.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.75.0" # dynamic
+version = "0.76.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -42,7 +42,7 @@ packages = [
 [tool.poetry.dependencies]
 # required
 python = ">=3.9,<3.13"
-numpy = {version = ">=1.24.3"}
+numpy = {version = ">=1.24.2"}
 pillow = {version = ">=10.3.0"}
 requests = {version = "*"}
 scipy = {version = ">=1.10"}
@@ -88,10 +88,11 @@ certifi = {version = ">=2024.07.04"}
 enum_tools = {version = ">=0.12.0", extras = ["sphinx"]}
 ipykernel = {version = ">=6.26.0"}
 ipywidgets = {version = ">=8.1.1"}
+jinja2 = {version = ">=3.1.5"}
 jupyter-client = {version = ">=8.6.0"}
 jupyter-cache = {version = "*"}
 myst-nb = {version = ">=1.0.0"}
-pydata-sphinx-theme = {version = ">=0.15.4"}
+sphinx-immaterial = {version = "*"}
 sphinx-autoapi = {version = "*"}
 sphinx-design = {version = "*"}
 sphinx-tabs = {version = "*"}
@@ -137,6 +138,7 @@ parallel = true
 [tool.coverage.report]
 exclude_also = [
   "raise NotImplementedError",
+  ": \\.\\.\\."
 ]
 include = ["*/src/dataeval/*"]
 omit = [
@@ -184,7 +186,7 @@ docstring-code-format = true
 docstring-code-line-length = "dynamic"
 
 [tool.codespell]
-skip = './*env*,./prototype,./output,./docs/build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
+skip = './*env*,./prototype,./output,./docs/build,./docs/source/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
 ignore-words-list = ["Hart"]
 
 [build-system]

--- dataeval-0.75.0/src/dataeval/__init__.py
+++ dataeval-0.76.0/src/dataeval/__init__.py
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations
 
 __all__ = ["detectors", "log", "metrics", "utils", "workflows"]
-__version__ = "0.75.0"
+__version__ = "0.76.0"
 
 import logging
 
@@ -24,10 +24,10 @@ def log(level: int = logging.DEBUG, handler: logging.Handler | None = None) -> N
     Parameters
     ----------
     level : int, default logging.DEBUG(10)
-        Set the logging level for the logger
+        Set the logging level for the logger.
     handler : logging.Handler, optional
         Sets the logging handler for the logger if provided, otherwise logger will be
-        provided with a StreamHandler
+        provided with a StreamHandler.
     """
     import logging
 
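
For orientation, the hunk above documents the top-level `log` helper. Below is a minimal usage sketch based only on the signature shown in this diff; routing to a `FileHandler` and the log file name are illustrative assumptions:

```python
import logging

import dataeval

# Enable DataEval's internal logging at INFO verbosity and send records to a file.
# Signature per the diff: log(level: int = logging.DEBUG, handler: logging.Handler | None = None)
dataeval.log(logging.INFO, logging.FileHandler("dataeval.log"))
```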

--- dataeval-0.75.0/src/dataeval/detectors/drift/base.py
+++ dataeval-0.76.0/src/dataeval/detectors/drift/base.py
@@ -45,7 +45,7 @@ class UpdateStrategy(ABC):
 @dataclass(frozen=True)
 class DriftBaseOutput(Output):
     """
-    Base output class for Drift detector classes
+    Base output class for Drift Detector classes
 
     Attributes
     ----------
@@ -64,7 +64,7 @@ class DriftBaseOutput(Output):
 @dataclass(frozen=True)
 class DriftOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors
+    Output class for :class:`DriftCVM`, :class:`DriftKS`, and :class:`DriftUncertainty` drift detectors.
 
     Attributes
     ----------

--- dataeval-0.75.0/src/dataeval/detectors/drift/ks.py
+++ dataeval-0.76.0/src/dataeval/detectors/drift/ks.py
@@ -22,7 +22,8 @@ from dataeval.interop import to_numpy
 
 class DriftKS(BaseDriftUnivariate):
     """
-    :term:`Drift` detector employing the Kolmogorov-Smirnov (KS) distribution test.
+    :term:`Drift` detector employing the :term:`Kolmogorov-Smirnov (KS) \
+    distribution<Kolmogorov-Smirnov (K-S) test>` test.
 
     The KS test detects changes in the maximum distance between two data
     distributions with Bonferroni or :term:`False Discovery Rate (FDR)` correction

--- dataeval-0.75.0/src/dataeval/detectors/drift/mmd.py
+++ dataeval-0.76.0/src/dataeval/detectors/drift/mmd.py
@@ -26,7 +26,7 @@ from dataeval.utils.torch.internal import get_device
 @dataclass(frozen=True)
 class DriftMMDOutput(DriftBaseOutput):
     """
-    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector
+    Output class for :class:`DriftMMD` :term:`drift<Drift>` detector.
 
     Attributes
     ----------
@@ -51,7 +51,8 @@ class DriftMMDOutput(DriftBaseOutput):
 
 class DriftMMD(BaseDrift):
     """
-    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm using a permutation test.
+    :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
+    using a permutation test.
 
     Parameters
     ----------

--- dataeval-0.75.0/src/dataeval/detectors/drift/uncertainty.py
+++ dataeval-0.76.0/src/dataeval/detectors/drift/uncertainty.py
@@ -66,8 +66,8 @@ def classifier_uncertainty(
 
 class DriftUncertainty:
     """
-    Test for a change in the number of instances falling into regions on which the
-    model is uncertain.
+    Test for a change in the number of instances falling into regions on which \
+    the model is uncertain.
 
     Performs a K-S test on prediction entropies.
 

--- dataeval-0.75.0/src/dataeval/detectors/drift/updates.py
+++ dataeval-0.76.0/src/dataeval/detectors/drift/updates.py
@@ -1,5 +1,5 @@
 """
-Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring
+Update strategies inform how the :term:`drift<Drift>` detector classes update the reference data when monitoring.
 for drift.
 """
 

--- dataeval-0.75.0/src/dataeval/detectors/linters/clusterer.py
+++ dataeval-0.76.0/src/dataeval/detectors/linters/clusterer.py
@@ -18,7 +18,7 @@ from dataeval.utils.shared import flatten
 @dataclass(frozen=True)
 class ClustererOutput(Output):
     """
-    Output class for :class:`Clusterer` lint detector
+    Output class for :class:`Clusterer` lint detector.
 
     Attributes
     ----------
@@ -131,7 +131,8 @@ class _ClusterMergeEntry:
 
 class Clusterer:
     """
-    Uses hierarchical clustering to flag dataset properties of interest like Outliers and :term:`duplicates<Duplicates>`
+    Uses hierarchical clustering to flag dataset properties of interest like outliers \
+    and :term:`duplicates<Duplicates>`.
 
     Parameters
     ----------

--- dataeval-0.75.0/src/dataeval/detectors/linters/duplicates.py
+++ dataeval-0.76.0/src/dataeval/detectors/linters/duplicates.py
@@ -19,7 +19,7 @@ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateG
 @dataclass(frozen=True)
 class DuplicatesOutput(Generic[TIndexCollection], Output):
     """
-    Output class for :class:`Duplicates` lint detector
+    Output class for :class:`Duplicates` lint detector.
 
     Attributes
     ----------
@@ -39,8 +39,8 @@ class DuplicatesOutput(Generic[TIndexCollection], Output):
 
 class Duplicates:
     """
-    Finds the duplicate images in a dataset using xxhash for exact :term:`duplicates<Duplicates>`
-    and pchash for near duplicates
+    Finds the duplicate images in a dataset using xxhash for exact \
+    :term:`duplicates<Duplicates>` and pchash for near duplicates.
 
     Attributes
     ----------
@@ -92,7 +92,7 @@ class Duplicates:
 
         Parameters
         ----------
-        data : HashStatsOutput | Sequence[HashStatsOutput]
+        hashes : HashStatsOutput | Sequence[HashStatsOutput]
             The output(s) from a hashstats analysis
 
         Returns
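
The last hunk above documents the renamed `hashes` parameter. A hedged sketch of feeding precomputed hash stats to `Duplicates` follows; the `from_stats` method name and the `hashstats` export are assumed from the file list rather than shown in this hunk, and the random images are placeholder data:

```python
import numpy as np

from dataeval.detectors.linters import Duplicates
from dataeval.metrics.stats import hashstats  # export name assumed; module appears in the file list

# Placeholder data: 8 random RGB images standing in for a real dataset.
images = np.random.default_rng(0).random((8, 3, 64, 64))

hashes = hashstats(images)  # "the output(s) from a hashstats analysis" per the docstring above
results = Duplicates().from_stats(hashes)  # method name assumed; parameter is documented as `hashes`
print(results)
```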

--- dataeval-0.75.0/src/dataeval/detectors/linters/outliers.py
+++ dataeval-0.76.0/src/dataeval/detectors/linters/outliers.py
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 __all__ = []
 
+# import contextlib
 from dataclasses import dataclass
 from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
 
@@ -12,19 +13,78 @@ from dataeval.detectors.linters.merged_stats import combine_stats, get_dataset_s
 from dataeval.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
 from dataeval.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
 from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval.metrics.stats.labelstats import LabelStatsOutput
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput
 from dataeval.metrics.stats.visualstats import VisualStatsOutput
 from dataeval.output import Output, set_metadata
 
+# with contextlib.suppress(ImportError):
+#     import pandas as pd
+
+
 IndexIssueMap = dict[int, dict[str, float]]
 OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
 TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
 
 
+def _reorganize_by_class_and_metric(result, lstats):
+    """Flip result from grouping by image to grouping by class and metric"""
+    metrics = {}
+    class_wise = {label: {} for label in lstats.image_indices_per_label}
+
+    # Group metrics and calculate class-wise counts
+    for img, group in result.items():
+        for extreme in group:
+            metrics.setdefault(extreme, []).append(img)
+            for label, images in lstats.image_indices_per_label.items():
+                if img in images:
+                    class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
+
+    return metrics, class_wise
+
+
+def _create_table(metrics, class_wise):
+    """Create table for displaying the results"""
+    max_class_length = max(len(str(label)) for label in class_wise) + 2
+    max_total = max(len(metrics[group]) for group in metrics) + 2
+
+    table_header = " | ".join(
+        [f"{'Class':>{max_class_length}}"]
+        + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
+        + [f"{'Total':<{max_total}}"]
+    )
+    table_rows = []
+
+    for class_cat, results in class_wise.items():
+        table_value = [f"{class_cat:>{max_class_length}}"]
+        total = 0
+        for group in sorted(metrics.keys()):
+            count = results.get(group, 0)
+            table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
+            total += count
+        table_value.append(f"{total:^{max_total}}")
+        table_rows.append(" | ".join(table_value))
+
+    table = [table_header] + table_rows
+    return table
+
+
+# def _create_pandas_dataframe(class_wise):
+#     """Create data for pandas dataframe"""
+#     data = []
+#     for label, metrics_dict in class_wise.items():
+#         row = {"Class": label}
+#         total = sum(metrics_dict.values())
+#         row.update(metrics_dict)  # Add metric counts
+#         row["Total"] = total
+#         data.append(row)
+#     return data
+
+
 @dataclass(frozen=True)
 class OutliersOutput(Generic[TIndexIssueMap], Output):
     """
-    Output class for :class:`Outliers` lint detector
+    Output class for :class:`Outliers` lint detector.
 
     Attributes
     ----------
@@ -45,6 +105,39 @@ class OutliersOutput(Generic[TIndexIssueMap], Output):
         else:
             return sum(len(d) for d in self.issues)
 
+    def to_table(self, labelstats: LabelStatsOutput) -> str:
+        if isinstance(self.issues, dict):
+            metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
+            listed_table = _create_table(metrics, classwise)
+            table = "\n".join(listed_table)
+        else:
+            outertable = []
+            for d in self.issues:
+                metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
+                listed_table = _create_table(metrics, classwise)
+                str_table = "\n".join(listed_table)
+                outertable.append(str_table)
+            table = "\n\n".join(outertable)
+        return table
+
+    # def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
+    #     import pandas as pd
+
+    #     if isinstance(self.issues, dict):
+    #         _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
+    #         data = _create_pandas_dataframe(classwise)
+    #         df = pd.DataFrame(data)
+    #     else:
+    #         df_list = []
+    #         for i, d in enumerate(self.issues):
+    #             _, classwise = _reorganize_by_class_and_metric(d, labelstats)
+    #             data = _create_pandas_dataframe(classwise)
+    #             single_df = pd.DataFrame(data)
+    #             single_df["Dataset"] = i
+    #             df_list.append(single_df)
+    #         df = pd.concat(df_list)
+    #     return df
+
 
 def _get_outlier_mask(
     values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
@@ -71,7 +164,7 @@ def _get_outlier_mask(
 
 class Outliers:
     r"""
-    Calculates statistical Outliers of a dataset using various statistical tests applied to each image
+    Calculates statistical outliers of a dataset using various statistical tests applied to each image.
 
     Parameters
     ----------
@@ -164,7 +257,7 @@ class Outliers:
         self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
-        Returns indices of Outliers with the issues identified for each
+        Returns indices of Outliers with the issues identified for each.
 
         Parameters
         ----------
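
Taken together, the outliers.py additions give `OutliersOutput` a per-class text-table view. A hedged sketch of exercising the new `to_table` follows; the `labelstats` export, the `from_stats` method name, and the placeholder images/labels are assumptions beyond what this diff shows:

```python
import numpy as np

from dataeval.detectors.linters import Outliers
from dataeval.metrics.stats.datasetstats import datasetstats  # imported by outliers.py in this diff
from dataeval.metrics.stats.labelstats import labelstats      # export name assumed

# Placeholder data: 16 random grayscale images split across two classes.
rng = np.random.default_rng(0)
images = rng.random((16, 1, 32, 32))
labels = rng.integers(0, 2, 16).tolist()

stats = datasetstats(images)            # DatasetStatsOutput, accepted by the signature shown above
results = Outliers().from_stats(stats)  # method name assumed; returns OutliersOutput

# to_table() groups the flagged metrics by class using a LabelStatsOutput.
print(results.to_table(labelstats(labels)))
```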

--- dataeval-0.75.0/src/dataeval/detectors/ood/__init__.py
+++ dataeval-0.76.0/src/dataeval/detectors/ood/__init__.py
@@ -1,5 +1,5 @@
 """
-Out-of-distribution (OOD)` detectors identify data that is different from the data used to train a particular model.
+Out-of-distribution (OOD) detectors identify data that is different from the data used to train a particular model.
 """
 
 __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]

--- dataeval-0.75.0/src/dataeval/detectors/ood/base.py
+++ dataeval-0.76.0/src/dataeval/detectors/ood/base.py
@@ -87,24 +87,8 @@ class OODBaseGMM(OODBase, OODGMMMixin[GaussianMixtureModelParams]):
         batch_size: int,
         verbose: bool,
     ) -> None:
-        # Train the model
-        trainer(
-            model=self.model,
-            x_train=to_numpy(x_ref),
-            y_train=None,
-            loss_fn=loss_fn,
-            optimizer=optimizer,
-            preprocess_fn=None,
-            epochs=epochs,
-            batch_size=batch_size,
-            device=self.device,
-            verbose=verbose,
-        )
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
 
         # Calculate the GMM parameters
         _, z, gamma = cast(tuple[torch.Tensor, torch.Tensor, torch.Tensor], self.model(x_ref))
         self._gmm_params = gmm_params(z, gamma)
-
-        # Infer the threshold values
-        self._ref_score = self.score(x_ref, batch_size)
-        self._threshold_perc = threshold_perc

--- dataeval-0.75.0/src/dataeval/detectors/ood/output.py
+++ dataeval-0.76.0/src/dataeval/detectors/ood/output.py
@@ -36,7 +36,7 @@ class OODScoreOutput(Output):
     """
     Output class for instance and feature scores from out-of-distribution detectors.
 
-    Parameters
+    Attributes
     ----------
     instance_score : NDArray
         Instance score of the evaluated dataset.

--- dataeval-0.75.0/src/dataeval/interop.py
+++ dataeval-0.76.0/src/dataeval/interop.py
@@ -46,7 +46,7 @@ def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
     if isinstance(array, np.ndarray):
         return array.copy() if copy else array
 
-    if array.__class__.__module__.startswith("tensorflow"):
+    if array.__class__.__module__.startswith("tensorflow"):  # pragma: no cover - removed tf from deps
         tf = _try_import("tensorflow")
         if tf and tf.is_tensor(array):
             _logger.log(logging.INFO, "Converting Tensorflow array to NumPy array.")

--- dataeval-0.75.0/src/dataeval/metrics/__init__.py
+++ dataeval-0.76.0/src/dataeval/metrics/__init__.py
@@ -1,5 +1,5 @@
 """
-Metrics are a way to measure the performance of your models or datasets that
+Metrics are a way to measure the performance of your models or datasets that \
 can then be analyzed in the context of a given problem.
 """
 

--- dataeval-0.75.0/src/dataeval/metrics/bias/__init__.py
+++ dataeval-0.76.0/src/dataeval/metrics/bias/__init__.py
@@ -1,5 +1,5 @@
 """
-Bias metrics check for skewed or imbalanced datasets and incomplete feature
+Bias metrics check for skewed or imbalanced datasets and incomplete feature \
 representation which may impact model performance.
 """
 

--- dataeval-0.75.0/src/dataeval/metrics/bias/balance.py
+++ dataeval-0.76.0/src/dataeval/metrics/bias/balance.py
@@ -23,8 +23,8 @@ with contextlib.suppress(ImportError):
 @dataclass(frozen=True)
 class BalanceOutput(Output):
     """
-    Output class for :func:`balance` bias metric
-
+    Output class for :func:`balance` :term:`bias<Bias>` metric.
+
     Attributes
     ----------
     balance : NDArray[np.float64]
@@ -123,7 +123,7 @@ def balance(
     num_neighbors: int = 5,
 ) -> BalanceOutput:
     """
-    Mutual information (MI) between factors (class label, metadata, label/image properties)
+    Mutual information (MI) between factors (class label, metadata, label/image properties).
 
     Parameters
     ----------

--- dataeval-0.75.0/src/dataeval/metrics/bias/coverage.py
+++ dataeval-0.76.0/src/dataeval/metrics/bias/coverage.py
@@ -71,7 +71,7 @@ def _plot(images: NDArray[Any], num_images: int) -> Figure:
 @dataclass(frozen=True)
 class CoverageOutput(Output):
     """
-    Output class for :func:`coverage` :term:`bias<Bias>` metric
+    Output class for :func:`coverage` :term:`bias<Bias>` metric.
 
     Attributes
     ----------