dataeval 0.73.0__tar.gz → 0.73.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-0.73.0 → dataeval-0.73.1}/PKG-INFO +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/pyproject.toml +6 -4
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/__init__.py +3 -3
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/__init__.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/__init__.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/base.py +2 -2
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/clusterer.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/__init__.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/balance.py +29 -19
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/coverage.py +11 -11
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/diversity.py +79 -50
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/metadata.py +133 -51
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/parity.py +30 -24
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/__init__.py +2 -2
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/shared.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/split_dataset.py +12 -6
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/datasets.py +2 -2
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/workflows/__init__.py +1 -1
- {dataeval-0.73.0 → dataeval-0.73.1}/LICENSE.txt +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/README.md +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/cvm.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/ks.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/mmd.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/torch.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/uncertainty.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/updates.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/duplicates.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/merged_stats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/outliers.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/ae.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/aegmm.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/base.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/llr.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/vae.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/vaegmm.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/interop.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/ber.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/divergence.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/uap.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/base.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/boxratiostats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/datasetstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/dimensionstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/hashstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/labelstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/pixelstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/visualstats.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/output.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/py.typed +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/image.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/lazy.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/metadata.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/gmm.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/loss.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/models.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/trainer.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/utils.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/loss/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/blocks.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/trainer.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/utils.py +0 -0
- {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/workflows/sufficiency.py +0 -0
{dataeval-0.73.0 → dataeval-0.73.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.73.0
+Version: 0.73.1
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
{dataeval-0.73.0 → dataeval-0.73.1}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.73.0" # dynamic
+version = "0.73.1" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -69,8 +69,7 @@ all = ["matplotlib", "markupsafe", "tensorflow", "tensorflow_probability", "tf-k
 optional = true

 [tool.poetry.group.dev.dependencies]
-
-tox-uv = {version = "*"}
+nox = {version = "*", extras = ["uv"]}
 uv = {version = "*"}
 poetry = {version = "*"}
 poetry-lock-groups-plugin = {version = "*"}
@@ -122,7 +121,6 @@ files = ["src/dataeval/__init__.py"]
 name = "dataeval"

 [tool.poetry2conda.dependencies]
-nvidia-cudnn-cu11 = { name = "cudnn" }
 tensorflow_probability = { name = "tensorflow-probability" }
 torch = { name = "pytorch" }
 xxhash = { name = "python-xxhash" }
@@ -145,6 +143,9 @@ parallel = true
 exclude_also = [
   "raise NotImplementedError",
   "if TYPE_CHECKING:",
+  "if _IS_TENSORFLOW_AVAILABLE",
+  "if _IS_TORCH_AVAILABLE",
+  "if _IS_TORCHVISION_AVAILABLE",
 ]
 include = ["*/src/dataeval/*"]
 omit = [
@@ -164,6 +165,7 @@ exclude = [
   "*env*",
   "output",
   "_build",
+  ".nox",
   ".tox",
   "prototype",
 ]
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.73.0"
+__version__ = "0.73.1"

 from importlib.util import find_spec

@@ -12,12 +12,12 @@ from dataeval import detectors, metrics # noqa: E402

 __all__ = ["detectors", "metrics"]

-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval import workflows

     __all__ += ["workflows"]

-if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
     from dataeval import utils

     __all__ += ["utils"]
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/__init__.py

@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS

 __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]

-if _IS_TORCH_AVAILABLE:  # pragma: no cover
+if _IS_TORCH_AVAILABLE:
     from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
     from dataeval.detectors.drift.torch import preprocess_drift
     from dataeval.detectors.drift.uncertainty import DriftUncertainty
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/base.py

@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
 import numpy as np
 from numpy.typing import ArrayLike, NDArray

-from dataeval.interop import as_numpy, to_numpy
+from dataeval.interop import as_numpy
 from dataeval.output import OutputMetadata, set_metadata

 R = TypeVar("R")
@@ -196,7 +196,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")

-        self._x_ref = to_numpy(x_ref)
+        self._x_ref = as_numpy(x_ref)
         self.x_ref_preprocessed: bool = x_ref_preprocessed

         # Other attributes
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/clusterer.py

@@ -480,7 +480,7 @@ class Clusterer:
             samples = self.clusters[level][cluster_id].samples
             if len(samples) >= self._min_num_samples_per_cluster:
                 duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-        diag_mask = np.ones_like(self._sqdmat, dtype=
+        diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
         np.fill_diagonal(diag_mask, 0)
         diag_mask = np.triu(diag_mask)

{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/__init__.py

@@ -4,7 +4,7 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da

 from dataeval import _IS_TENSORFLOW_AVAILABLE

-if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+if _IS_TENSORFLOW_AVAILABLE:
     from dataeval.detectors.ood.ae import OOD_AE
     from dataeval.detectors.ood.aegmm import OOD_AEGMM
     from dataeval.detectors.ood.base import OODOutput, OODScoreOutput
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/balance.py

@@ -11,7 +11,7 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

-from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
+from dataeval.metrics.bias.metadata import CLASS_LABEL, entropy, heatmap, preprocess_metadata
 from dataeval.output import OutputMetadata, set_metadata

 with contextlib.suppress(ImportError):
@@ -31,9 +31,9 @@ class BalanceOutput(OutputMetadata):
         Estimate of inter/intra-factor mutual information
     classwise : NDArray[np.float64]
         Estimate of mutual information between metadata factors and individual class labels
-    class_list: NDArray
+    class_list : NDArray
         Array of the class labels present in the dataset
-    metadata_names: list[str]
+    metadata_names : list[str]
         Names of each metadata factor
     """

@@ -54,9 +54,9 @@ class BalanceOutput(OutputMetadata):

         Parameters
         ----------
-        row_labels : ArrayLike
+        row_labels : ArrayLike or None, default None
             List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike
+        col_labels : ArrayLike or None, default None
             List/Array containing the labels for columns in the histogram
         plot_classwise : bool, default False
             Whether to plot per-class balance instead of global balance
@@ -116,19 +116,29 @@ def validate_num_neighbors(num_neighbors: int) -> int:


 @set_metadata("dataeval.metrics")
-def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
+def balance(
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    num_neighbors: int = 5,
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+) -> BalanceOutput:
     """
     Mutual information (MI) between factors (class label, metadata, label/image properties)

     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of lists of metadata factors for each image
-    num_neighbors: int, default 5
+    num_neighbors : int, default 5
         Number of nearest neighbors to use for computing MI between discrete
         and continuous variables.
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.

     Returns
     -------
@@ -148,7 +158,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     -------
     Return balance (mutual information) of factors with class_labels

-    >>> bal = balance(class_labels, metadata)
+    >>> bal = balance(class_labels, metadata, continuous_factor_bincounts=continuous_factor_bincounts)
     >>> bal.balance
     array([0.99999822, 0.13363788, 0.04505382, 0.02994455])

@@ -165,6 +175,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     array([[0.99999822, 0.13363788, 0.        , 0.        ],
            [0.99999822, 0.13363788, 0.        , 0.        ]])

+
     See Also
     --------
     sklearn.feature_selection.mutual_info_classif
@@ -178,9 +189,9 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     mi[:] = np.nan

     for idx in range(num_factors):
-        tgt = data[:, idx].astype(
+        tgt = data[:, idx].astype(np.intp)

-        if
+        if continuous_factor_bincounts and names[idx] not in continuous_factor_bincounts:
             mi[idx, :] = mutual_info_classif(
                 data,
                 tgt,
@@ -197,7 +208,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
                 random_state=0,
             )

-    ent_all = entropy(data, names,
+    ent_all = entropy(data, names, continuous_factor_bincounts, normalized=False)
     norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
     # in principle MI should be symmetric, but it is not in practice.
     nmi = 0.5 * (mi + mi.T) / norm_factor
@@ -205,7 +216,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     factors = nmi[1:, 1:]

     # unique class labels
-    class_idx = names.index(
+    class_idx = names.index(CLASS_LABEL)
     u_cls = np.unique(data[:, class_idx])
     num_classes = len(u_cls)

@@ -214,12 +225,11 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
     classwise_mi[:] = np.nan

     # categorical variables, excluding class label
-    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(
+    cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(np.intp)

-    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(
-
-
-    )
+    tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(np.intp)
+    names = [str(idx) for idx in range(num_classes)]
+    ent_tgt_bin = entropy(tgt_bin, names, continuous_factor_bincounts)

     # classification MI for discrete/categorical features
     for idx in range(num_classes):
@@ -5,7 +5,7 @@ __all__ = ["CoverageOutput", "coverage"]
|
|
5
5
|
import contextlib
|
6
6
|
import math
|
7
7
|
from dataclasses import dataclass
|
8
|
-
from typing import
|
8
|
+
from typing import Literal
|
9
9
|
|
10
10
|
import numpy as np
|
11
11
|
from numpy.typing import ArrayLike, NDArray
|
@@ -27,9 +27,9 @@ class CoverageOutput(OutputMetadata):
|
|
27
27
|
|
28
28
|
Attributes
|
29
29
|
----------
|
30
|
-
indices : NDArray
|
30
|
+
indices : NDArray[np.intp]
|
31
31
|
Array of uncovered indices
|
32
|
-
radii : NDArray
|
32
|
+
radii : NDArray[np.float64]
|
33
33
|
Array of critical value radii
|
34
34
|
critical_value : float
|
35
35
|
Radius for :term:`coverage<Coverage>`
|
@@ -39,11 +39,7 @@ class CoverageOutput(OutputMetadata):
|
|
39
39
|
radii: NDArray[np.float64]
|
40
40
|
critical_value: float
|
41
41
|
|
42
|
-
def plot(
|
43
|
-
self,
|
44
|
-
images: NDArray[Any],
|
45
|
-
top_k: int = 6,
|
46
|
-
) -> Figure:
|
42
|
+
def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
|
47
43
|
"""
|
48
44
|
Plot the top k images together for visualization
|
49
45
|
|
@@ -53,6 +49,10 @@ class CoverageOutput(OutputMetadata):
|
|
53
49
|
Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
|
54
50
|
top_k : int, default 6
|
55
51
|
Number of images to plot (plotting assumes groups of 3)
|
52
|
+
|
53
|
+
Returns
|
54
|
+
-------
|
55
|
+
matplotlib.figure.Figure
|
56
56
|
"""
|
57
57
|
# Determine which images to plot
|
58
58
|
highest_uncovered_indices = self.indices[:top_k]
|
@@ -82,12 +82,12 @@ def coverage(
|
|
82
82
|
embeddings : ArrayLike, shape - (N, P)
|
83
83
|
A dataset in an ArrayLike format.
|
84
84
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
|
85
|
-
radius_type :
|
85
|
+
radius_type : {"adaptive", "naive"}, default "adaptive"
|
86
86
|
The function used to determine radius.
|
87
|
-
k: int, default 20
|
87
|
+
k : int, default 20
|
88
88
|
Number of observations required in order to be covered.
|
89
89
|
[1] suggests that a minimum of 20-50 samples is necessary.
|
90
|
-
percent: float, default 0.01
|
90
|
+
percent : float, default 0.01
|
91
91
|
Percent of observations to be considered uncovered. Only applies to adaptive radius.
|
92
92
|
|
93
93
|
Returns
|
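Since `CoverageOutput.plot` now accepts `ArrayLike` images and documents its `Figure` return, a rough usage sketch follows. It assumes `coverage` is importable from `dataeval.metrics.bias` and that matplotlib is installed; the embeddings and images are random placeholders rather than meaningful data.

```python
import numpy as np

from dataeval.metrics.bias import coverage

rng = np.random.default_rng(0)

embeddings = rng.random((500, 16))     # (N, P) observations in an embedding space
images = rng.random((500, 3, 32, 32))  # original images in (N, C, H, W) format

result = coverage(embeddings)          # "adaptive" radius, k=20, percent=0.01 defaults
print(result.indices[:5])              # indices of the most uncovered samples
print(result.critical_value)

fig = result.plot(images, top_k=3)     # returns a matplotlib Figure per the new docstring
fig.savefig("uncovered_examples.png")
```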
{dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/diversity.py

@@ -10,6 +10,7 @@ import numpy as np
 from numpy.typing import ArrayLike, NDArray

 from dataeval.metrics.bias.metadata import (
+    CLASS_LABEL,
     diversity_bar_plot,
     entropy,
     get_counts,
@@ -35,9 +36,9 @@ class DiversityOutput(OutputMetadata):
         :term:`Diversity` index for classes and factors
     classwise : NDArray[np.float64]
         Classwise diversity index [n_class x n_factor]
-    class_list: NDArray[np.int64]
+    class_list : NDArray[np.int64]
         Class labels for each value in the dataset
-    metadata_names: list[str]
+    metadata_names : list[str]
         Names of each metadata factor
     """

@@ -45,12 +46,11 @@ class DiversityOutput(OutputMetadata):
     classwise: NDArray[np.float64]
     class_list: NDArray[Any]
     metadata_names: list[str]
-    method: Literal["shannon", "simpson"]

     def plot(
         self,
-        row_labels:
-        col_labels:
+        row_labels: ArrayLike | list[Any] | None = None,
+        col_labels: ArrayLike | list[Any] | None = None,
         plot_classwise: bool = False,
     ) -> Figure:
         """
@@ -58,9 +58,9 @@ class DiversityOutput(OutputMetadata):

         Parameters
         ----------
-        row_labels : ArrayLike
+        row_labels : ArrayLike or None, default None
             List/Array containing the labels for rows in the histogram
-        col_labels : ArrayLike
+        col_labels : ArrayLike or None, default None
             List/Array containing the labels for columns in the histogram
         plot_classwise : bool, default False
             Whether to plot per-class balance instead of global balance
@@ -77,7 +77,7 @@ class DiversityOutput(OutputMetadata):
                 col_labels,
                 xlabel="Factors",
                 ylabel="Class",
-                cbarlabel=f"Normalized {self.method.title()} Index",
+                cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
             )

         else:
@@ -92,7 +92,7 @@ class DiversityOutput(OutputMetadata):
 def diversity_shannon(
     data: NDArray[Any],
     names: list[str],
-
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -106,14 +106,16 @@ def diversity_shannon(

     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-
-
-
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

     Note
@@ -122,18 +124,32 @@ def diversity_shannon(

     Returns
     -------
-    diversity_index: NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X

     See Also
     --------
     numpy.histogram
     """
+    hist_cache = {}

     # entropy computed using global auto bins so that we can properly normalize
-    ent_unnormalized = entropy(
+    ent_unnormalized = entropy(
+        data,
+        names,
+        continuous_factor_bincounts,
+        normalized=False,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     # normalize by global counts rather than classwise counts
-    num_bins = get_num_bins(
+    num_bins = get_num_bins(
+        data,
+        names,
+        continuous_factor_bincounts=continuous_factor_bincounts,
+        subset_mask=subset_mask,
+        hist_cache=hist_cache,
+    )
     ent_norm = np.empty(ent_unnormalized.shape)
     ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
     ent_norm[num_bins == 1] = 0
@@ -143,7 +159,7 @@ def diversity_shannon(
 def diversity_simpson(
     data: NDArray[Any],
     names: list[str],
-
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
     subset_mask: NDArray[np.bool_] | None = None,
 ) -> NDArray[np.float64]:
     """
@@ -157,14 +173,16 @@ def diversity_simpson(

     Parameters
     ----------
-    data: NDArray
+    data : NDArray
         Array containing numerical values for metadata factors
-    names: list[str]
+    names : list[str]
         Names of metadata factors -- keys of the metadata dictionary
-
-
-
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in names that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in names.
+    subset_mask : NDArray[np.bool_] or None, default None
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

     Note
@@ -175,35 +193,39 @@ def diversity_simpson(

     Returns
     -------
-    NDArray
+    diversity_index : NDArray[np.float64]
         Diversity index per column of X

     See Also
     --------
     numpy.histogram
     """
+    hist_cache = {}

-    hist_counts
+    hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
     # normalize by global counts, not classwise counts
-    num_bins = get_num_bins(data, names,
+    num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)

     ev_index = np.empty(len(names))
     # loop over columns for convenience
     for col, cnts in enumerate(hist_counts.values()):
         # relative frequencies
-        p_i = cnts /
+        p_i = cnts / np.sum(cnts)
         # inverse Simpson index normalized by (number of bins)
-        s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+        s_0 = 1 / np.sum(p_i**2)  # / num_bins[col]
         if num_bins[col] == 1:
             ev_index[col] = 0
         else:
-            ev_index[col] = (s_0
+            ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
     return ev_index


 @set_metadata()
 def diversity(
-    class_labels: ArrayLike,
+    class_labels: ArrayLike,
+    metadata: Mapping[str, ArrayLike],
+    continuous_factor_bincounts: Mapping[str, int] | None = None,
+    method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
     Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -216,11 +238,16 @@ def diversity(

     Parameters
     ----------
-    class_labels: ArrayLike
+    class_labels : ArrayLike
         List of class labels for each image
-    metadata: Mapping[str, ArrayLike]
+    metadata : Mapping[str, ArrayLike]
         Dict of list of metadata factors for each image
-
+    continuous_factor_bincounts : Mapping[str, int] or None, default None
+        The factors in metadata that have continuous values and the array of bin counts to
+        discretize values into. All factors are treated as having discrete values unless they
+        are specified as keys in this dictionary. Each element of this array must occur as a key
+        in metadata.
+    method : {"simpson", "shannon"}, default "simpson"
         Indicates which diversity index should be computed

     Note
@@ -239,40 +266,42 @@ def diversity(
     -------
     Compute Simpson diversity index of metadata and class labels

-    >>> div_simp = diversity(class_labels, metadata, method="simpson")
+    >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
     >>> div_simp.diversity_index
-    array([0.
+    array([0.72413793, 0.72413793, 0.88636364])

     >>> div_simp.classwise
-    array([[0.
-    [0.
+    array([[0.68965517, 0.69230769],
+           [0.8       , 1.        ]])

     Compute Shannon diversity index of metadata and class labels

-    >>> div_shan = diversity(class_labels, metadata, method="shannon")
+    >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
     >>> div_shan.diversity_index
-    array([0.
+    array([0.8812909 , 0.8812909 , 0.96748876])

     >>> div_shan.classwise
-    array([[0.
-    [0.
+    array([[0.86312057, 0.91651644],
+           [0.91829583, 1.        ]])

     See Also
     --------
     numpy.histogram
     """
     diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-    data, names,
-    diversity_index = diversity_fn(data, names,
+    data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
+    diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
+
+    class_idx = names.index(CLASS_LABEL)
+    class_lbl = data[:, class_idx]

-
-    u_classes = np.unique(data[:, class_idx])
+    u_classes = np.unique(class_lbl)
     num_factors = len(names)
     diversity = np.empty((len(u_classes), num_factors))
     diversity[:] = np.nan
     for idx, cls in enumerate(u_classes):
-        subset_mask =
-        diversity[idx, :] = diversity_fn(data, names,
+        subset_mask = class_lbl == cls
+        diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
     div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)

-    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys())
+    return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))
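To close the loop on the diversity.py changes, here is a hedged sketch of the new call shape, again assuming the `dataeval.metrics.bias` import path; the synthetic factor names and values are illustrative only.

```python
import numpy as np

from dataeval.metrics.bias import diversity

rng = np.random.default_rng(0)

class_labels = rng.integers(0, 2, size=50)
metadata = {
    "time_of_day": rng.integers(0, 4, size=50),       # discrete factor
    "temperature": rng.uniform(-5.0, 35.0, size=50),  # continuous factor, binned below
}
continuous_factor_bincounts = {"temperature": 4}

# method selects between the inverse-Simpson and Shannon-entropy based indices.
div = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
print(div.diversity_index)  # one index per factor (class label included)
print(div.classwise)        # per-class, per-factor diversity, class label excluded
```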
|