dataeval 0.74.2__py3-none-any.whl → 0.76.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +27 -23
- dataeval/detectors/__init__.py +2 -2
- dataeval/detectors/drift/__init__.py +14 -12
- dataeval/detectors/drift/base.py +3 -3
- dataeval/detectors/drift/cvm.py +1 -1
- dataeval/detectors/drift/ks.py +3 -2
- dataeval/detectors/drift/mmd.py +9 -7
- dataeval/detectors/drift/torch.py +12 -12
- dataeval/detectors/drift/uncertainty.py +5 -4
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +4 -4
- dataeval/detectors/linters/clusterer.py +5 -9
- dataeval/detectors/linters/duplicates.py +10 -14
- dataeval/detectors/linters/outliers.py +100 -5
- dataeval/detectors/ood/__init__.py +4 -11
- dataeval/detectors/ood/{ae_torch.py → ae.py} +6 -4
- dataeval/detectors/ood/base.py +47 -160
- dataeval/detectors/ood/metadata_ks_compare.py +34 -42
- dataeval/detectors/ood/metadata_least_likely.py +3 -3
- dataeval/detectors/ood/metadata_ood_mi.py +6 -5
- dataeval/detectors/ood/mixin.py +146 -0
- dataeval/detectors/ood/output.py +63 -0
- dataeval/interop.py +7 -6
- dataeval/{logging.py → log.py} +2 -0
- dataeval/metrics/__init__.py +3 -3
- dataeval/metrics/bias/__init__.py +10 -13
- dataeval/metrics/bias/balance.py +13 -11
- dataeval/metrics/bias/coverage.py +53 -5
- dataeval/metrics/bias/diversity.py +56 -24
- dataeval/metrics/bias/parity.py +20 -17
- dataeval/metrics/estimators/__init__.py +2 -2
- dataeval/metrics/estimators/ber.py +7 -4
- dataeval/metrics/estimators/divergence.py +4 -4
- dataeval/metrics/estimators/uap.py +4 -4
- dataeval/metrics/stats/__init__.py +19 -19
- dataeval/metrics/stats/base.py +28 -12
- dataeval/metrics/stats/boxratiostats.py +13 -14
- dataeval/metrics/stats/datasetstats.py +49 -20
- dataeval/metrics/stats/dimensionstats.py +8 -8
- dataeval/metrics/stats/hashstats.py +14 -10
- dataeval/metrics/stats/labelstats.py +94 -11
- dataeval/metrics/stats/pixelstats.py +11 -14
- dataeval/metrics/stats/visualstats.py +10 -13
- dataeval/output.py +23 -14
- dataeval/utils/__init__.py +5 -14
- dataeval/utils/dataset/__init__.py +7 -0
- dataeval/utils/{torch → dataset}/datasets.py +2 -0
- dataeval/utils/dataset/read.py +63 -0
- dataeval/utils/{split_dataset.py → dataset/split.py} +38 -30
- dataeval/utils/image.py +2 -2
- dataeval/utils/metadata.py +317 -14
- dataeval/{metrics/bias/metadata_utils.py → utils/plot.py} +91 -71
- dataeval/utils/torch/__init__.py +2 -17
- dataeval/utils/torch/gmm.py +29 -6
- dataeval/utils/torch/{utils.py → internal.py} +82 -58
- dataeval/utils/torch/models.py +10 -8
- dataeval/utils/torch/trainer.py +6 -85
- dataeval/workflows/__init__.py +2 -5
- dataeval/workflows/sufficiency.py +18 -8
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/LICENSE.txt +2 -2
- dataeval-0.76.0.dist-info/METADATA +137 -0
- dataeval-0.76.0.dist-info/RECORD +67 -0
- dataeval/detectors/ood/base_torch.py +0 -109
- dataeval/metrics/bias/metadata_preprocessing.py +0 -285
- dataeval/utils/gmm.py +0 -26
- dataeval-0.74.2.dist-info/METADATA +0 -120
- dataeval-0.74.2.dist-info/RECORD +0 -66
- {dataeval-0.74.2.dist-info → dataeval-0.76.0.dist-info}/WHEEL +0 -0
dataeval/metrics/bias/__init__.py CHANGED
@@ -1,24 +1,21 @@
 """
-Bias metrics check for skewed or imbalanced datasets and incomplete feature
+Bias metrics check for skewed or imbalanced datasets and incomplete feature \
 representation which may impact model performance.
 """
 
-from dataeval.metrics.bias.balance import BalanceOutput, balance
-from dataeval.metrics.bias.coverage import CoverageOutput, coverage
-from dataeval.metrics.bias.diversity import DiversityOutput, diversity
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput, metadata_preprocessing
-from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
-
 __all__ = [
+    "BalanceOutput",
+    "CoverageOutput",
+    "DiversityOutput",
+    "ParityOutput",
     "balance",
     "coverage",
     "diversity",
     "label_parity",
     "parity",
-    "metadata_preprocessing",
-    "BalanceOutput",
-    "CoverageOutput",
-    "DiversityOutput",
-    "ParityOutput",
-    "MetadataOutput",
 ]
+
+from dataeval.metrics.bias.balance import BalanceOutput, balance
+from dataeval.metrics.bias.coverage import CoverageOutput, coverage
+from dataeval.metrics.bias.diversity import DiversityOutput, diversity
+from dataeval.metrics.bias.parity import ParityOutput, label_parity, parity
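
The net effect is a conventional module layout: `__all__` is declared first and the imports that back it follow, with `metadata_preprocessing`/`MetadataOutput` dropped from the public surface (their replacements live in `dataeval.utils.metadata`, per the file list above). As a usage sketch of the resulting 0.76.0 surface:

    # Sketch: importing the bias metrics public API after this change.
    from dataeval.metrics.bias import balance, coverage, diversity, label_parity, parity
    from dataeval.metrics.bias import BalanceOutput, CoverageOutput, DiversityOutput, ParityOutput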
dataeval/metrics/bias/balance.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 import contextlib
 import warnings
@@ -12,9 +12,9 @@ import scipy as sp
 from numpy.typing import NDArray
 from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
 
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
-from dataeval.metrics.bias.metadata_utils import get_counts, heatmap
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata, get_counts
+from dataeval.utils.plot import heatmap
 
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
@@ -23,8 +23,8 @@ with contextlib.suppress(ImportError):
 @dataclass(frozen=True)
 class BalanceOutput(Output):
     """
-    Output class for :func:`balance` bias metric
-
+    Output class for :func:`balance` :term:`bias<Bias>` metric.
+
     Attributes
     ----------
     balance : NDArray[np.float64]
@@ -119,22 +119,24 @@ def _validate_num_neighbors(num_neighbors: int) -> int:
 
 @set_metadata
 def balance(
-    metadata:
+    metadata: Metadata,
     num_neighbors: int = 5,
 ) -> BalanceOutput:
     """
-    Mutual information (MI) between factors (class label, metadata, label/image properties)
+    Mutual information (MI) between factors (class label, metadata, label/image properties).
 
     Parameters
     ----------
-    metadata :
-
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
+    num_neighbors : int, default 5
+        Number of points to consider as neighbors
 
     Returns
     -------
     BalanceOutput
-        (num_factors+1) x (num_factors+1) estimate of mutual information
-
+        (num_factors+1) x (num_factors+1) estimate of mutual information \
+        between num_factors metadata factors and class label. Symmetry is enforced.
 
     Note
     ----
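
`balance` now takes the `Metadata` object produced by `dataeval.utils.metadata.preprocess` instead of the removed `MetadataOutput`. A minimal sketch mirroring the `preprocess` call shown in the parity doctest later in this diff (factor names and values here are illustrative, not from the source):

    # Sketch: the new Metadata-based balance() call; data is hypothetical.
    import numpy as np
    from dataeval.metrics.bias import balance
    from dataeval.utils.metadata import preprocess

    rng = np.random.default_rng(175)
    labels = rng.choice([0, 1, 2], (100))
    metadata_dict = [{"age": list(rng.choice([25, 30, 35, 45], (100)))}]
    metadata = preprocess(metadata_dict, labels, {"age": 4})
    result = balance(metadata, num_neighbors=5)  # (num_factors+1) x (num_factors+1) MI matrix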
dataeval/metrics/bias/coverage.py CHANGED
@@ -1,18 +1,17 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 import contextlib
 import math
 from dataclasses import dataclass
-from typing import Literal
+from typing import Any, Literal
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
 from scipy.spatial.distance import pdist, squareform
 
 from dataeval.interop import to_numpy
-from dataeval.metrics.bias.metadata_utils import coverage_plot
 from dataeval.output import Output, set_metadata
 from dataeval.utils.shared import flatten
 
@@ -20,10 +19,59 @@ with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
 
 
+def _plot(images: NDArray[Any], num_images: int) -> Figure:
+    """
+    Creates a single plot of all of the provided images
+
+    Parameters
+    ----------
+    images : NDArray
+        Array containing only the desired images to plot
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Plot of all provided images
+    """
+    import matplotlib.pyplot as plt
+
+    num_images = min(num_images, len(images))
+
+    if images.ndim == 4:
+        images = np.moveaxis(images, 1, -1)
+    elif images.ndim == 3:
+        images = np.repeat(images[:, :, :, np.newaxis], 3, axis=-1)
+    else:
+        raise ValueError(
+            f"Expected a (N,C,H,W) or a (N, H, W) set of images, but got a {images.ndim}-dimensional set of images."
+        )
+
+    rows = int(np.ceil(num_images / 3))
+    fig, axs = plt.subplots(rows, 3, figsize=(9, 3 * rows))
+
+    if rows == 1:
+        for j in range(3):
+            if j >= len(images):
+                continue
+            axs[j].imshow(images[j])
+            axs[j].axis("off")
+    else:
+        for i in range(rows):
+            for j in range(3):
+                i_j = i * 3 + j
+                if i_j >= len(images):
+                    continue
+                axs[i, j].imshow(images[i_j])
+                axs[i, j].axis("off")
+
+    fig.tight_layout()
+    return fig
+
+
 @dataclass(frozen=True)
 class CoverageOutput(Output):
     """
-    Output class for :func:`coverage` :term:`bias<Bias>` metric
+    Output class for :func:`coverage` :term:`bias<Bias>` metric.
 
     Attributes
     ----------
@@ -62,7 +110,7 @@ class CoverageOutput(Output):
         selected_images = images[highest_uncovered_indices]
 
         # Plot the images
-        fig =
+        fig = _plot(selected_images, top_k)
 
         return fig
 
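
`_plot` is the inlined replacement for the removed `coverage_plot` helper. Note the layout handling it performs: matplotlib's `imshow` expects channels-last, so channels-first batches are transposed and grayscale batches are tiled to three channels. A standalone sketch of just that conversion:

    # Sketch: the array-layout conversion _plot applies before imshow.
    import numpy as np

    chw = np.zeros((6, 3, 32, 32))                          # (N, C, H, W)
    hwc = np.moveaxis(chw, 1, -1)                           # -> (N, H, W, C)
    gray = np.zeros((6, 32, 32))                            # (N, H, W)
    rgb = np.repeat(gray[:, :, :, np.newaxis], 3, axis=-1)  # -> (N, H, W, 3)
    assert hwc.shape == (6, 32, 32, 3) and rgb.shape == (6, 32, 32, 3)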
dataeval/metrics/bias/diversity.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 import contextlib
 from dataclasses import dataclass
@@ -10,19 +10,48 @@ import numpy as np
 import scipy as sp
 from numpy.typing import ArrayLike, NDArray
 
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
-from dataeval.metrics.bias.metadata_utils import diversity_bar_plot, get_counts, heatmap
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata, get_counts
+from dataeval.utils.plot import heatmap
 from dataeval.utils.shared import get_method
 
 with contextlib.suppress(ImportError):
     from matplotlib.figure import Figure
 
 
+def _plot(labels: NDArray[Any], bar_heights: NDArray[Any]) -> Figure:
+    """
+    Plots a formatted bar plot
+
+    Parameters
+    ----------
+    labels : NDArray
+        Array containing the labels for each bar
+    bar_heights : NDArray
+        Array containing the values for each bar
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Bar plot figure
+    """
+    import matplotlib.pyplot as plt
+
+    fig, ax = plt.subplots(figsize=(10, 10))
+
+    ax.bar(labels, bar_heights)
+    ax.set_xlabel("Factors")
+
+    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
+
+    fig.tight_layout()
+    return fig
+
+
 @dataclass(frozen=True)
 class DiversityOutput(Output):
     """
-    Output class for :func:`diversity` :term:`bias<Bias>` metric
+    Output class for :func:`diversity` :term:`bias<Bias>` metric.
 
     Attributes
     ----------
@@ -77,8 +106,7 @@ class DiversityOutput(Output):
         else:
             # Creating label array for heat map axes
             heat_labels = np.concatenate((["class"], self.factor_names))
-
-            fig = diversity_bar_plot(heat_labels, self.diversity_index)
+            fig = _plot(heat_labels, self.diversity_index)
 
         return fig
 
@@ -165,27 +193,26 @@ def diversity_simpson(
 
 @set_metadata
 def diversity(
-    metadata:
+    metadata: Metadata,
     method: Literal["simpson", "shannon"] = "simpson",
 ) -> DiversityOutput:
     """
-    Compute :term:`diversity<Diversity>` and classwise diversity for
-
+    Compute :term:`diversity<Diversity>` and classwise diversity for \
+    discrete/categorical variables through standard histogram binning, \
+    for continuous variables.
 
-
+    The method specified defines diversity as the inverse Simpson diversity index linearly rescaled to
+    the unit interval, or the normalized form of the Shannon entropy.
 
     diversity = 1 implies that samples are evenly distributed across a particular factor
     diversity = 0 implies that all samples belong to one category/bin
 
     Parameters
     ----------
-    metadata :
-
-
-
-    ----
-    - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-    - If there is only one category, the diversity index takes a value of 0.
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
+    method : "simpson" or "shannon", default "simpson"
+        The methodology used for defining diversity
 
     Returns
    -------
@@ -193,27 +220,32 @@ def diversity(
         Diversity index per column of self.data or each factor in self.names and
         classwise diversity [n_class x n_factor]
 
+    Note
+    ----
+    - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+    - If there is only one category, the diversity index takes a value of 0.
+
     Example
     -------
-    Compute
+    Compute the diversity index of metadata and class labels
 
     >>> div_simp = diversity(metadata, method="simpson")
    >>> div_simp.diversity_index
-    array([0.
+    array([0.6       , 0.80882353, 1.        , 0.8       ])
 
    >>> div_simp.classwise
-    array([[0.
-    [0.
+    array([[0.5       , 0.8       , 0.8       ],
+           [0.63043478, 0.97560976, 0.52830189]])
 
     Compute Shannon diversity index of metadata and class labels
 
     >>> div_shan = diversity(metadata, method="shannon")
     >>> div_shan.diversity_index
-    array([0.
+    array([0.81127812, 0.9426312 , 1.        , 0.91829583])
 
     >>> div_shan.classwise
-    array([[0.
-    [0.
+    array([[0.68260619, 0.91829583, 0.91829583],
+           [0.81443569, 0.99107606, 0.76420451]])
 
     See Also
     --------
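
For reference, the two definitions the docstring names reduce to short formulas over a factor's category counts. A sketch of those formulas only (not dataeval's internal code path, which also handles binning and the classwise slicing), assuming the usual linear rescaling (1/sum(p^2) - 1)/(S - 1) for the inverse Simpson index:

    # Sketch: the named diversity definitions for one factor's category counts.
    import numpy as np

    def simpson_diversity(counts: np.ndarray) -> float:
        p = counts / counts.sum()
        inverse_simpson = 1.0 / np.sum(p**2)                  # in [1, S]
        return (inverse_simpson - 1.0) / (len(counts) - 1.0)  # rescaled to [0, 1]

    def shannon_diversity(counts: np.ndarray) -> float:
        p = counts / counts.sum()
        return float(-np.sum(p * np.log(p)) / np.log(len(counts)))  # normalized entropy

    print(simpson_diversity(np.array([25, 25, 25, 25])))  # 1.0: evenly distributed
    print(simpson_diversity(np.array([97, 1, 1, 1])))     # ~0.02: nearly one bin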
dataeval/metrics/bias/parity.py CHANGED
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 import warnings
 from dataclasses import dataclass
@@ -12,8 +12,8 @@ from scipy.stats import chisquare
 from scipy.stats.contingency import chi2_contingency, crosstab
 
 from dataeval.interop import as_numpy, to_numpy
-from dataeval.metrics.bias.metadata_preprocessing import MetadataOutput
 from dataeval.output import Output, set_metadata
+from dataeval.utils.metadata import Metadata
 
 TData = TypeVar("TData", np.float64, NDArray[np.float64])
 
@@ -21,7 +21,7 @@ TData = TypeVar("TData", np.float64, NDArray[np.float64])
 @dataclass(frozen=True)
 class ParityOutput(Generic[TData], Output):
     """
-    Output class for :func:`parity` and :func:`label_parity` :term:`bias<Bias>` metrics
+    Output class for :func:`parity` and :func:`label_parity` :term:`bias<Bias>` metrics.
 
     Attributes
     ----------
@@ -123,8 +123,8 @@ def label_parity(
     num_classes: int | None = None,
 ) -> ParityOutput[np.float64]:
     """
-    Calculate the chi-square statistic to assess the :term:`parity<Parity>`
-    observed label distributions.
+    Calculate the chi-square statistic to assess the :term:`parity<Parity>` \
+    between expected and observed label distributions.
 
     This function computes the frequency distribution of classes in both expected and observed labels, normalizes
     the expected distribution to match the total number of observed labels, and then calculates the chi-square
@@ -167,8 +167,9 @@ def label_parity(
     --------
     Randomly creating some label distributions using ``np.random.default_rng``
 
-    >>>
-    >>>
+    >>> rng = np.random.default_rng(175)
+    >>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
+    >>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
     >>> label_parity(expected_labels, observed_labels)
     ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
     """
@@ -205,10 +206,10 @@ def label_parity(
 
 
 @set_metadata
-def parity(metadata:
+def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
     """
-    Calculate chi-square statistics to assess the linear relationship
-    and class labels.
+    Calculate chi-square statistics to assess the linear relationship \
+    between multiple factors and class labels.
 
     This function computes the chi-square statistic for each metadata factor to determine if there is
     a significant relationship between the factor values and class labels. The chi-square statistic is
@@ -216,8 +217,8 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
 
     Parameters
     ----------
-    metadata :
-
+    metadata : Metadata
+        Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
 
     Returns
     -------
@@ -249,16 +250,18 @@ def parity(metadata: MetadataOutput) -> ParityOutput[NDArray[np.float64]]:
     --------
     Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
 
-    >>>
+    >>> from dataeval.utils.metadata import preprocess
+    >>> rng = np.random.default_rng(175)
+    >>> labels = rng.choice([0, 1, 2], (100))
     >>> metadata_dict = [
     ...     {
-    ...         "age": list(
-    ...         "income": list(
-    ...         "gender": list(
+    ...         "age": list(rng.choice([25, 30, 35, 45], (100))),
+    ...         "income": list(rng.choice([50000, 65000, 80000], (100))),
+    ...         "gender": list(rng.choice(["M", "F"], (100))),
     ...     }
     ... ]
     >>> continuous_factor_bincounts = {"age": 4, "income": 3}
-    >>> metadata =
+    >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
     >>> parity(metadata)
     ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
     """  # noqa: E501
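
The `label_parity` docstring's recipe (count classes in both label sets, rescale the expected counts to the observed total, then chi-square the two) maps directly onto `scipy.stats.chisquare`; a sketch of that computation outside dataeval, with hypothetical counts:

    # Sketch: the computation label_parity's docstring describes, via scipy directly.
    import numpy as np
    from scipy.stats import chisquare

    expected = np.array([30.0, 30.0, 40.0])      # class counts, hypothetical
    observed = np.array([25.0, 35.0, 40.0])
    expected *= observed.sum() / expected.sum()  # normalize to the observed total
    score, p_value = chisquare(f_obs=observed, f_exp=expected)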
dataeval/metrics/estimators/__init__.py CHANGED
@@ -2,8 +2,8 @@
 Estimators calculate performance bounds and the statistical distance between datasets.
 """
 
+__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
+
 from dataeval.metrics.estimators.ber import BEROutput, ber
 from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
 from dataeval.metrics.estimators.uap import UAPOutput, uap
-
-__all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
dataeval/metrics/estimators/ber.py CHANGED
@@ -5,11 +5,12 @@ KNN based estimate for the :term:`Bayes error rate<Bayes Error Rate (BER)>`
 
 Learning to Bound the Multi-class Bayes Error (Th. 3 and Th. 4)
 https://arxiv.org/abs/1811.06419
+
 """
 
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Literal
@@ -27,7 +28,7 @@ from dataeval.utils.shared import compute_neighbors, get_classes_counts, get_met
 @dataclass(frozen=True)
 class BEROutput(Output):
     """
-    Output class for :func:`ber` estimator metric
+    Output class for :func:`ber` estimator metric.
 
     Attributes
     ----------
@@ -38,11 +39,12 @@ class BEROutput(Output):
     """
 
     ber: float
+
     ber_lower: float
 
 
 def ber_mst(images: NDArray[np.float64], labels: NDArray[np.int_], k: int = 1) -> tuple[float, float]:
-    """Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree
+    """Calculates the :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree.
 
     Parameters
     ----------
@@ -117,7 +119,8 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
 @set_metadata
 def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
     """
-    An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>`
+    An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
+    using FR or KNN test statistic basis.
 
     Parameters
     ----------
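
`ber`'s signature is unchanged by this diff, and `BEROutput` exposes the `ber`/`ber_lower` bounds shown above. A usage sketch with hypothetical data:

    # Sketch: calling ber() per the signature above; data is illustrative only.
    import numpy as np
    from dataeval.metrics.estimators import ber

    rng = np.random.default_rng(0)
    images = rng.normal(size=(200, 16))    # flattened image features
    labels = rng.integers(0, 2, size=200)
    out = ber(images, labels, k=1, method="KNN")
    print(out.ber, out.ber_lower)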
dataeval/metrics/estimators/divergence.py CHANGED
@@ -5,7 +5,7 @@ using the Fast Nearest Neighbor and Minimum Spanning Tree algorithms
 
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 from typing import Literal
@@ -21,7 +21,7 @@ from dataeval.utils.shared import compute_neighbors, get_method, minimum_spannin
 @dataclass(frozen=True)
 class DivergenceOutput(Output):
     """
-    Output class for :func:`divergence` estimator metric
+    Output class for :func:`divergence` estimator metric.
 
     Attributes
     ----------
@@ -59,7 +59,7 @@ def divergence_mst(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
 
 def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
     """
-    Calculates the estimated label errors based on their nearest neighbors
+    Calculates the estimated label errors based on their nearest neighbors.
 
     Parameters
     ----------
@@ -81,7 +81,7 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
 @set_metadata
 def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
     """
-    Calculates the :term
+    Calculates the :term:`divergence` and any errors between the datasets.
 
     Parameters
     ----------
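
`divergence` likewise keeps its signature; a sketch comparing two hypothetical feature sets:

    # Sketch: calling divergence() per the signature above; data is illustrative only.
    import numpy as np
    from dataeval.metrics.estimators import divergence

    rng = np.random.default_rng(0)
    data_a = rng.normal(0.0, 1.0, size=(200, 8))
    data_b = rng.normal(0.5, 1.0, size=(200, 8))
    print(divergence(data_a, data_b, method="FNN"))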
dataeval/metrics/estimators/uap.py CHANGED
@@ -6,7 +6,7 @@ average precision<Upper-Bound Average Precision (UAP)>` using empirical mean pre
 
 from __future__ import annotations
 
-__all__ = [
+__all__ = []
 
 from dataclasses import dataclass
 
@@ -20,7 +20,7 @@ from dataeval.output import Output, set_metadata
 @dataclass(frozen=True)
 class UAPOutput(Output):
     """
-    Output class for :func:`uap` estimator metric
+    Output class for :func:`uap` estimator metric.
 
     Attributes
     ----------
@@ -34,8 +34,8 @@ class UAPOutput(Output):
 @set_metadata
 def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
     """
-    FR Test Statistic based estimate of the empirical mean precision for
-
+    FR Test Statistic based estimate of the empirical mean precision for the \
+    upperbound average precision.
 
     Parameters
     ----------
dataeval/metrics/stats/__init__.py CHANGED
@@ -1,8 +1,26 @@
 """
-Statistics metrics calculate a variety of image properties and pixel statistics
+Statistics metrics calculate a variety of image properties and pixel statistics \
 and label statistics against the images and labels of a dataset.
 """
 
+__all__ = [
+    "ChannelStatsOutput",
+    "DatasetStatsOutput",
+    "DimensionStatsOutput",
+    "HashStatsOutput",
+    "LabelStatsOutput",
+    "PixelStatsOutput",
+    "VisualStatsOutput",
+    "boxratiostats",
+    "channelstats",
+    "datasetstats",
+    "dimensionstats",
+    "hashstats",
+    "labelstats",
+    "pixelstats",
+    "visualstats",
+]
+
 from dataeval.metrics.stats.boxratiostats import boxratiostats
 from dataeval.metrics.stats.datasetstats import (
     ChannelStatsOutput,
@@ -15,21 +33,3 @@ from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
 from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
 from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
 from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
-
-__all__ = [
-    "boxratiostats",
-    "channelstats",
-    "datasetstats",
-    "dimensionstats",
-    "hashstats",
-    "labelstats",
-    "pixelstats",
-    "visualstats",
-    "ChannelStatsOutput",
-    "DatasetStatsOutput",
-    "DimensionStatsOutput",
-    "HashStatsOutput",
-    "LabelStatsOutput",
-    "PixelStatsOutput",
-    "VisualStatsOutput",
-]