dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +3 -3
- dataeval/{output.py → _output.py} +14 -0
- dataeval/config.py +77 -0
- dataeval/detectors/__init__.py +1 -1
- dataeval/detectors/drift/__init__.py +6 -6
- dataeval/detectors/drift/{base.py → _base.py} +41 -30
- dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
- dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
- dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
- dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
- dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
- dataeval/detectors/drift/updates.py +1 -1
- dataeval/detectors/linters/__init__.py +0 -3
- dataeval/detectors/linters/duplicates.py +17 -8
- dataeval/detectors/linters/outliers.py +52 -43
- dataeval/detectors/ood/ae.py +29 -8
- dataeval/detectors/ood/base.py +5 -4
- dataeval/detectors/ood/metadata_ks_compare.py +1 -1
- dataeval/detectors/ood/mixin.py +20 -5
- dataeval/detectors/ood/output.py +1 -1
- dataeval/detectors/ood/vae.py +73 -0
- dataeval/metadata/__init__.py +5 -0
- dataeval/metadata/_ood.py +238 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +5 -4
- dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
- dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
- dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
- dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
- dataeval/metrics/estimators/__init__.py +14 -4
- dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
- dataeval/metrics/estimators/_clusterer.py +104 -0
- dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
- dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/metrics/stats/{base.py → _base.py} +52 -16
- dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
- dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
- dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
- dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
- dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
- dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
- dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
- dataeval/typing.py +54 -0
- dataeval/utils/__init__.py +2 -2
- dataeval/utils/_array.py +169 -0
- dataeval/utils/_bin.py +199 -0
- dataeval/utils/_clusterer.py +144 -0
- dataeval/utils/_fast_mst.py +189 -0
- dataeval/utils/{image.py → _image.py} +6 -4
- dataeval/utils/_method.py +18 -0
- dataeval/utils/{shared.py → _mst.py} +3 -65
- dataeval/utils/{plot.py → _plot.py} +4 -4
- dataeval/utils/data/__init__.py +22 -0
- dataeval/utils/data/_embeddings.py +105 -0
- dataeval/utils/data/_images.py +65 -0
- dataeval/utils/data/_metadata.py +352 -0
- dataeval/utils/data/_selection.py +119 -0
- dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
- dataeval/utils/data/_targets.py +73 -0
- dataeval/utils/data/_types.py +58 -0
- dataeval/utils/data/collate.py +103 -0
- dataeval/utils/data/datasets/__init__.py +17 -0
- dataeval/utils/data/datasets/_base.py +254 -0
- dataeval/utils/data/datasets/_cifar10.py +134 -0
- dataeval/utils/data/datasets/_fileio.py +168 -0
- dataeval/utils/data/datasets/_milco.py +153 -0
- dataeval/utils/data/datasets/_mixin.py +56 -0
- dataeval/utils/data/datasets/_mnist.py +183 -0
- dataeval/utils/data/datasets/_ships.py +123 -0
- dataeval/utils/data/datasets/_voc.py +352 -0
- dataeval/utils/data/selections/__init__.py +15 -0
- dataeval/utils/data/selections/_classfilter.py +60 -0
- dataeval/utils/data/selections/_indices.py +26 -0
- dataeval/utils/data/selections/_limit.py +26 -0
- dataeval/utils/data/selections/_reverse.py +18 -0
- dataeval/utils/data/selections/_shuffle.py +29 -0
- dataeval/utils/metadata.py +198 -376
- dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
- dataeval/utils/torch/{internal.py → _internal.py} +21 -51
- dataeval/utils/torch/models.py +43 -2
- dataeval/workflows/sufficiency.py +10 -9
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
- dataeval-0.81.0.dist-info/RECORD +94 -0
- dataeval/detectors/linters/clusterer.py +0 -512
- dataeval/detectors/linters/merged_stats.py +0 -49
- dataeval/detectors/ood/metadata_least_likely.py +0 -119
- dataeval/interop.py +0 -69
- dataeval/utils/dataset/__init__.py +0 -7
- dataeval/utils/dataset/datasets.py +0 -412
- dataeval/utils/dataset/read.py +0 -63
- dataeval-0.76.0.dist-info/RECORD +0 -67
- /dataeval/{log.py → _log.py} +0 -0
- /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
@@ -2,40 +2,86 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
__all__ = []
|
4
4
|
|
5
|
+
import contextlib
|
5
6
|
import warnings
|
6
7
|
from dataclasses import dataclass
|
7
8
|
from typing import Any, Generic, TypeVar
|
8
9
|
|
9
10
|
import numpy as np
|
10
|
-
from numpy.typing import
|
11
|
+
from numpy.typing import NDArray
|
11
12
|
from scipy.stats import chisquare
|
12
13
|
from scipy.stats.contingency import chi2_contingency, crosstab
|
13
14
|
|
14
|
-
from dataeval.
|
15
|
-
from dataeval.
|
16
|
-
from dataeval.utils.
|
15
|
+
from dataeval._output import Output, set_metadata
|
16
|
+
from dataeval.typing import ArrayLike
|
17
|
+
from dataeval.utils._array import as_numpy
|
18
|
+
from dataeval.utils.data import Metadata
|
19
|
+
|
20
|
+
with contextlib.suppress(ImportError):
|
21
|
+
import pandas as pd
|
17
22
|
|
18
23
|
TData = TypeVar("TData", np.float64, NDArray[np.float64])
|
19
24
|
|
20
25
|
|
21
26
|
@dataclass(frozen=True)
|
22
|
-
class
|
27
|
+
class BaseParityOutput(Generic[TData], Output):
|
28
|
+
score: TData
|
29
|
+
p_value: TData
|
30
|
+
|
31
|
+
def to_dataframe(self) -> pd.DataFrame:
|
32
|
+
"""
|
33
|
+
Exports the parity output results to a pandas DataFrame.
|
34
|
+
|
35
|
+
Returns
|
36
|
+
-------
|
37
|
+
pd.DataFrame
|
38
|
+
"""
|
39
|
+
import pandas as pd
|
40
|
+
|
41
|
+
return pd.DataFrame(
|
42
|
+
index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
|
43
|
+
data={
|
44
|
+
"score": self.score.round(2),
|
45
|
+
"p-value": self.p_value.round(2),
|
46
|
+
},
|
47
|
+
)
|
48
|
+
|
49
|
+
|
50
|
+
@dataclass(frozen=True)
|
51
|
+
class LabelParityOutput(BaseParityOutput[np.float64]):
|
52
|
+
"""
|
53
|
+
Output class for :func:`.label_parity` :term:`bias<Bias>` metrics.
|
54
|
+
|
55
|
+
Attributes
|
56
|
+
----------
|
57
|
+
score : np.float64
|
58
|
+
chi-squared score(s) of the test
|
59
|
+
p_value : np.float64
|
60
|
+
p-value(s) of the test
|
61
|
+
"""
|
62
|
+
|
63
|
+
|
64
|
+
@dataclass(frozen=True)
|
65
|
+
class ParityOutput(BaseParityOutput[NDArray[np.float64]]):
|
23
66
|
"""
|
24
|
-
Output class for :func
|
67
|
+
Output class for :func:`.parity` :term:`bias<Bias>` metrics.
|
25
68
|
|
26
69
|
Attributes
|
27
70
|
----------
|
28
|
-
score :
|
71
|
+
score : NDArray[np.float64]
|
29
72
|
chi-squared score(s) of the test
|
30
|
-
p_value :
|
73
|
+
p_value : NDArray[np.float64]
|
31
74
|
p-value(s) of the test
|
32
|
-
|
75
|
+
factor_names : list[str]
|
33
76
|
Names of each metadata factor
|
77
|
+
insufficient_data: dict
|
78
|
+
Dictionary of metadata factors with less than 5 class occurrences per value
|
34
79
|
"""
|
35
80
|
|
36
|
-
score:
|
37
|
-
p_value:
|
38
|
-
|
81
|
+
# score: NDArray[np.float64]
|
82
|
+
# p_value: NDArray[np.float64]
|
83
|
+
factor_names: list[str]
|
84
|
+
insufficient_data: dict[str, dict[int, dict[str, int]]]
|
39
85
|
|
40
86
|
|
41
87
|
def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[Any]) -> NDArray[Any]:
|
@@ -109,7 +155,7 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
|
|
109
155
|
raise ValueError(f"No labels found in the {label_name} dataset")
|
110
156
|
if np.any(label_dist < 5):
|
111
157
|
warnings.warn(
|
112
|
-
f"Labels {np.where(label_dist<5)[0]} in {label_name}"
|
158
|
+
f"Labels {np.where(label_dist < 5)[0]} in {label_name}"
|
113
159
|
" dataset have frequencies less than 5. This may lead"
|
114
160
|
" to invalid chi-squared evaluation.",
|
115
161
|
UserWarning,
|
@@ -121,7 +167,7 @@ def label_parity(
|
|
121
167
|
expected_labels: ArrayLike,
|
122
168
|
observed_labels: ArrayLike,
|
123
169
|
num_classes: int | None = None,
|
124
|
-
) ->
|
170
|
+
) -> LabelParityOutput:
|
125
171
|
"""
|
126
172
|
Calculate the chi-square statistic to assess the :term:`parity<Parity>` \
|
127
173
|
between expected and observed label distributions.
|
@@ -142,7 +188,7 @@ def label_parity(
|
|
142
188
|
|
143
189
|
Returns
|
144
190
|
-------
|
145
|
-
|
191
|
+
LabelParityOutput
|
146
192
|
chi-squared score and :term:`P-Value` of the test
|
147
193
|
|
148
194
|
Raises
|
@@ -171,7 +217,7 @@ def label_parity(
|
|
171
217
|
>>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
|
172
218
|
>>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
|
173
219
|
>>> label_parity(expected_labels, observed_labels)
|
174
|
-
|
220
|
+
LabelParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
|
175
221
|
"""
|
176
222
|
|
177
223
|
# Calculate
|
@@ -179,8 +225,8 @@ def label_parity(
|
|
179
225
|
num_classes = 0
|
180
226
|
|
181
227
|
# Calculate the class frequencies associated with the datasets
|
182
|
-
observed_dist = np.bincount(
|
183
|
-
expected_dist = np.bincount(
|
228
|
+
observed_dist = np.bincount(as_numpy(observed_labels), minlength=num_classes)
|
229
|
+
expected_dist = np.bincount(as_numpy(expected_labels), minlength=num_classes)
|
184
230
|
|
185
231
|
# Validate
|
186
232
|
validate_dist(observed_dist, "observed")
|
@@ -202,11 +248,11 @@ def label_parity(
|
|
202
248
|
)
|
203
249
|
|
204
250
|
cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
|
205
|
-
return
|
251
|
+
return LabelParityOutput(cs, p)
|
206
252
|
|
207
253
|
|
208
254
|
@set_metadata
|
209
|
-
def parity(metadata: Metadata) -> ParityOutput
|
255
|
+
def parity(metadata: Metadata) -> ParityOutput:
|
210
256
|
"""
|
211
257
|
Calculate chi-square statistics to assess the linear relationship \
|
212
258
|
between multiple factors and class labels.
|
@@ -218,7 +264,7 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
|
|
218
264
|
Parameters
|
219
265
|
----------
|
220
266
|
metadata : Metadata
|
221
|
-
Preprocessed metadata
|
267
|
+
Preprocessed metadata
|
222
268
|
|
223
269
|
Returns
|
224
270
|
-------
|
@@ -250,24 +296,21 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
|
|
250
296
|
--------
|
251
297
|
Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
|
252
298
|
|
253
|
-
>>>
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
...
|
258
|
-
... "
|
259
|
-
...
|
260
|
-
...
|
261
|
-
|
262
|
-
... ]
|
263
|
-
>>> continuous_factor_bincounts = {"age": 4, "income": 3}
|
264
|
-
>>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
|
299
|
+
>>> metadata = generate_random_metadata(
|
300
|
+
... labels=["doctor", "artist", "teacher"],
|
301
|
+
... factors={
|
302
|
+
... "age": [25, 30, 35, 45],
|
303
|
+
... "income": [50000, 65000, 80000],
|
304
|
+
... "gender": ["M", "F"]},
|
305
|
+
... length=100,
|
306
|
+
... random_seed=175)
|
307
|
+
>>> metadata.continuous_factor_bins = {"age": 4, "income": 3}
|
265
308
|
>>> parity(metadata)
|
266
|
-
ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]),
|
309
|
+
ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), factor_names=['age', 'income', 'gender'], insufficient_data={'age': {3: {'artist': 4}, 4: {'artist': 4, 'teacher': 3}}, 'income': {1: {'artist': 3}}})
|
267
310
|
""" # noqa: E501
|
268
311
|
chi_scores = np.zeros(metadata.discrete_data.shape[1])
|
269
312
|
p_values = np.zeros_like(chi_scores)
|
270
|
-
|
313
|
+
insufficient_data = {}
|
271
314
|
for i, col_data in enumerate(metadata.discrete_data.T):
|
272
315
|
# Builds a contingency matrix where entry at index (r,c) represents
|
273
316
|
# the frequency of current_factor_name achieving value unique_factor_values[r]
|
@@ -281,14 +324,14 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
|
|
281
324
|
current_factor_name = metadata.discrete_factor_names[i]
|
282
325
|
for int_factor, int_class in zip(counts[0], counts[1]):
|
283
326
|
if contingency_matrix[int_factor, int_class] > 0:
|
284
|
-
factor_category = unique_factor_values[int_factor]
|
285
|
-
if current_factor_name not in
|
286
|
-
|
287
|
-
if factor_category not in
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
327
|
+
factor_category = unique_factor_values[int_factor].item()
|
328
|
+
if current_factor_name not in insufficient_data:
|
329
|
+
insufficient_data[current_factor_name] = {}
|
330
|
+
if factor_category not in insufficient_data[current_factor_name]:
|
331
|
+
insufficient_data[current_factor_name][factor_category] = {}
|
332
|
+
class_name = metadata.class_names[int_class]
|
333
|
+
class_count = contingency_matrix[int_factor, int_class].item()
|
334
|
+
insufficient_data[current_factor_name][factor_category][class_name] = class_count
|
292
335
|
|
293
336
|
# This deletes rows containing only zeros,
|
294
337
|
# because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
|
@@ -301,24 +344,7 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
|
|
301
344
|
chi_scores[i] = chi2
|
302
345
|
p_values[i] = p
|
303
346
|
|
304
|
-
if
|
305
|
-
|
306
|
-
for factor, fact_dict in not_enough_data.items():
|
307
|
-
stacked_msg = []
|
308
|
-
for key, value in fact_dict.items():
|
309
|
-
msg = []
|
310
|
-
for item in value:
|
311
|
-
msg.append(f"label {item[0]}: {item[1]} occurrences")
|
312
|
-
flat_msg = "\n\t\t".join(msg)
|
313
|
-
stacked_msg.append(f"value {key} - {flat_msg}\n\t")
|
314
|
-
factor_msg.append(factor + " - " + "".join(stacked_msg))
|
315
|
-
|
316
|
-
message = "\n".join(factor_msg)
|
317
|
-
|
318
|
-
warnings.warn(
|
319
|
-
f"The following factors did not meet the recommended 5 occurrences for each value-label combination. \n\
|
320
|
-
Recommend rerunning parity after adjusting the following factor-value-label combinations: \n{message}",
|
321
|
-
UserWarning,
|
322
|
-
)
|
347
|
+
if insufficient_data:
|
348
|
+
warnings.warn("Some factors did not meet the recommended 5 occurrences for each value-label combination.")
|
323
349
|
|
324
|
-
return ParityOutput(chi_scores, p_values, metadata.discrete_factor_names)
|
350
|
+
return ParityOutput(chi_scores, p_values, metadata.discrete_factor_names, insufficient_data)
|
@@ -2,8 +2,18 @@
|
|
2
2
|
Estimators calculate performance bounds and the statistical distance between datasets.
|
3
3
|
"""
|
4
4
|
|
5
|
-
__all__ = [
|
5
|
+
__all__ = [
|
6
|
+
"ber",
|
7
|
+
"clusterer",
|
8
|
+
"divergence",
|
9
|
+
"uap",
|
10
|
+
"BEROutput",
|
11
|
+
"ClustererOutput",
|
12
|
+
"DivergenceOutput",
|
13
|
+
"UAPOutput",
|
14
|
+
]
|
6
15
|
|
7
|
-
from dataeval.metrics.estimators.
|
8
|
-
from dataeval.metrics.estimators.
|
9
|
-
from dataeval.metrics.estimators.
|
16
|
+
from dataeval.metrics.estimators._ber import BEROutput, ber
|
17
|
+
from dataeval.metrics.estimators._clusterer import ClustererOutput, clusterer
|
18
|
+
from dataeval.metrics.estimators._divergence import DivergenceOutput, divergence
|
19
|
+
from dataeval.metrics.estimators._uap import UAPOutput, uap
|
@@ -16,19 +16,21 @@ from dataclasses import dataclass
|
|
16
16
|
from typing import Literal
|
17
17
|
|
18
18
|
import numpy as np
|
19
|
-
from numpy.typing import
|
19
|
+
from numpy.typing import NDArray
|
20
20
|
from scipy.sparse import coo_matrix
|
21
21
|
from scipy.stats import mode
|
22
22
|
|
23
|
-
from dataeval.
|
24
|
-
from dataeval.
|
25
|
-
from dataeval.utils.
|
23
|
+
from dataeval._output import Output, set_metadata
|
24
|
+
from dataeval.typing import ArrayLike
|
25
|
+
from dataeval.utils._array import as_numpy, ensure_embeddings
|
26
|
+
from dataeval.utils._method import get_method
|
27
|
+
from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
|
26
28
|
|
27
29
|
|
28
30
|
@dataclass(frozen=True)
|
29
31
|
class BEROutput(Output):
|
30
32
|
"""
|
31
|
-
Output class for :func
|
33
|
+
Output class for :func:`.ber` estimator metric.
|
32
34
|
|
33
35
|
Attributes
|
34
36
|
----------
|
@@ -116,18 +118,21 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
|
|
116
118
|
return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
|
117
119
|
|
118
120
|
|
121
|
+
_BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
|
122
|
+
|
123
|
+
|
119
124
|
@set_metadata
|
120
|
-
def ber(
|
125
|
+
def ber(embeddings: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
|
121
126
|
"""
|
122
127
|
An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
|
123
128
|
using FR or KNN test statistic basis.
|
124
129
|
|
125
130
|
Parameters
|
126
131
|
----------
|
127
|
-
|
128
|
-
Array of
|
132
|
+
embeddings : ArrayLike (N, ... )
|
133
|
+
Array of image :term:`embeddings<Embeddings>`
|
129
134
|
labels : ArrayLike (N, 1)
|
130
|
-
Array of labels for each image
|
135
|
+
Array of labels for each image
|
131
136
|
k : int, default 1
|
132
137
|
Number of nearest neighbors for KNN estimator -- ignored by MST estimator
|
133
138
|
method : Literal["KNN", "MST"], default "KNN"
|
@@ -152,8 +157,34 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
|
|
152
157
|
>>> ber(images, labels)
|
153
158
|
BEROutput(ber=0.04, ber_lower=0.020416847668728033)
|
154
159
|
"""
|
155
|
-
ber_fn = get_method(
|
156
|
-
X =
|
160
|
+
ber_fn = get_method(_BER_FN_MAP, method)
|
161
|
+
X = ensure_embeddings(embeddings, dtype=np.float64)
|
157
162
|
y = as_numpy(labels)
|
158
163
|
upper, lower = ber_fn(X, y, k)
|
159
164
|
return BEROutput(upper, lower)
|
165
|
+
|
166
|
+
|
167
|
+
def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
|
168
|
+
"""
|
169
|
+
Returns the number of unique classes and the total number of labels from an array of labels
|
170
|
+
|
171
|
+
Parameters
|
172
|
+
----------
|
173
|
+
labels : NDArray
|
174
|
+
Numpy labels array
|
175
|
+
|
176
|
+
Returns
|
177
|
+
-------
|
178
|
+
Classes and counts
|
179
|
+
|
180
|
+
Raises
|
181
|
+
------
|
182
|
+
ValueError
|
183
|
+
If the number of unique classes is less than 2
|
184
|
+
"""
|
185
|
+
classes, counts = np.unique(labels, return_counts=True)
|
186
|
+
M = len(classes)
|
187
|
+
if M < 2:
|
188
|
+
raise ValueError("Label vector contains less than 2 classes!")
|
189
|
+
N = int(np.sum(counts))
|
190
|
+
return M, N
|
@@ -0,0 +1,104 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
__all__ = []
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
|
7
|
+
import numpy as np
|
8
|
+
from numpy.typing import NDArray
|
9
|
+
|
10
|
+
from dataeval._output import Output
|
11
|
+
from dataeval.typing import ArrayLike
|
12
|
+
from dataeval.utils._array import as_numpy
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass(frozen=True)
|
16
|
+
class ClustererOutput(Output):
|
17
|
+
"""
|
18
|
+
Output class for :func:`.clusterer`.
|
19
|
+
|
20
|
+
Attributes
|
21
|
+
----------
|
22
|
+
clusters : NDArray[int]
|
23
|
+
Assigned clusters
|
24
|
+
mst : NDArray[int]
|
25
|
+
The minimum spanning tree of the data
|
26
|
+
linkage_tree : NDArray[float]
|
27
|
+
The linkage array of the data
|
28
|
+
condensed_tree : NDArray[float]
|
29
|
+
The condensed tree of the data
|
30
|
+
membership_strengths : NDArray[float]
|
31
|
+
The strength of the data point belonging to the assigned cluster
|
32
|
+
"""
|
33
|
+
|
34
|
+
clusters: NDArray[np.int_]
|
35
|
+
mst: NDArray[np.double]
|
36
|
+
linkage_tree: NDArray[np.double]
|
37
|
+
condensed_tree: NDArray[np.double]
|
38
|
+
membership_strengths: NDArray[np.double]
|
39
|
+
|
40
|
+
def find_outliers(self) -> NDArray[np.int_]:
|
41
|
+
"""
|
42
|
+
Retrieves Outliers based on when the sample was added to the cluster
|
43
|
+
and how far it was from the cluster when it was added
|
44
|
+
|
45
|
+
Returns
|
46
|
+
-------
|
47
|
+
NDArray[int]
|
48
|
+
A numpy array of the outlier indices
|
49
|
+
"""
|
50
|
+
return np.nonzero(self.clusters == -1)[0]
|
51
|
+
|
52
|
+
def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
|
53
|
+
"""
|
54
|
+
Finds duplicate and near duplicate data based on cluster average distance
|
55
|
+
|
56
|
+
Returns
|
57
|
+
-------
|
58
|
+
Tuple[List[List[int]], List[List[int]]]
|
59
|
+
The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
|
60
|
+
"""
|
61
|
+
# Delay load numba compiled functions
|
62
|
+
from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find
|
63
|
+
|
64
|
+
exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)
|
65
|
+
exact_dupes = sorted_union_find(exact_indices)
|
66
|
+
near_dupes = sorted_union_find(near_indices)
|
67
|
+
|
68
|
+
return [[int(ii) for ii in il] for il in exact_dupes], [[int(ii) for ii in il] for il in near_dupes]
|
69
|
+
|
70
|
+
|
71
|
+
def clusterer(data: ArrayLike) -> ClustererOutput:
|
72
|
+
"""
|
73
|
+
Uses hierarchical clustering on the flattened data and returns clustering
|
74
|
+
information.
|
75
|
+
|
76
|
+
Parameters
|
77
|
+
----------
|
78
|
+
data : ArrayLike, shape - (N, ...)
|
79
|
+
A dataset in an ArrayLike format. Function expects the data to have 2
|
80
|
+
or more dimensions which will flatten to (N, P) where N is the number of
|
81
|
+
observations in a P-dimensional space.
|
82
|
+
|
83
|
+
Returns
|
84
|
+
-------
|
85
|
+
:class:`.ClustererOutput`
|
86
|
+
|
87
|
+
Note
|
88
|
+
----
|
89
|
+
The clusterer works best when the length of the feature dimension, P, is
|
90
|
+
less than 500. If flattening a CxHxW image results in a dimension larger
|
91
|
+
than 500, then it is recommended to reduce the dimensions.
|
92
|
+
|
93
|
+
Example
|
94
|
+
-------
|
95
|
+
>>> clusterer(clusterer_images).clusters
|
96
|
+
array([ 2, 0, 0, 0, 0, 0, 4, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
|
97
|
+
4, 2, 0, 0, 1, 2, 0, 1, 3, 0, 3, 3, 4, 0, 0, 3, 0,
|
98
|
+
3, -1, 0, 0, 2, 4, 3, 4, 0, 1, 0, -1, 3, 0, 0, 0])
|
99
|
+
"""
|
100
|
+
# Delay load numba compiled functions
|
101
|
+
from dataeval.utils._clusterer import cluster
|
102
|
+
|
103
|
+
c = cluster(data)
|
104
|
+
return ClustererOutput(c.clusters, c.mst, c.linkage_tree, as_numpy(c.condensed_tree), c.membership_strengths)
|
@@ -11,17 +11,19 @@ from dataclasses import dataclass
|
|
11
11
|
from typing import Literal
|
12
12
|
|
13
13
|
import numpy as np
|
14
|
-
from numpy.typing import
|
14
|
+
from numpy.typing import NDArray
|
15
15
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
18
|
-
from dataeval.utils.
|
16
|
+
from dataeval._output import Output, set_metadata
|
17
|
+
from dataeval.typing import ArrayLike
|
18
|
+
from dataeval.utils._array import ensure_embeddings
|
19
|
+
from dataeval.utils._method import get_method
|
20
|
+
from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
|
19
21
|
|
20
22
|
|
21
23
|
@dataclass(frozen=True)
|
22
24
|
class DivergenceOutput(Output):
|
23
25
|
"""
|
24
|
-
Output class for :func
|
26
|
+
Output class for :func:`.divergence` estimator metric.
|
25
27
|
|
26
28
|
Attributes
|
27
29
|
----------
|
@@ -78,18 +80,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
|
|
78
80
|
return errors
|
79
81
|
|
80
82
|
|
83
|
+
_DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
|
84
|
+
|
85
|
+
|
81
86
|
@set_metadata
|
82
|
-
def divergence(
|
87
|
+
def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
|
83
88
|
"""
|
84
89
|
Calculates the :term:`divergence` and any errors between the datasets.
|
85
90
|
|
86
91
|
Parameters
|
87
92
|
----------
|
88
|
-
|
89
|
-
|
93
|
+
emb_a : ArrayLike, shape - (N, P)
|
94
|
+
Image embeddings in an ArrayLike format to compare.
|
90
95
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
91
|
-
|
92
|
-
|
96
|
+
emb_b : ArrayLike, shape - (N, P)
|
97
|
+
Image embeddings in an ArrayLike format to compare.
|
93
98
|
Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
|
94
99
|
method : Literal["MST", "FNN"], default "FNN"
|
95
100
|
Method used to estimate dataset :term:`divergence<Divergence>`
|
@@ -125,9 +130,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
|
|
125
130
|
>>> divergence(datasetA, datasetB)
|
126
131
|
DivergenceOutput(divergence=0.28, errors=36)
|
127
132
|
"""
|
128
|
-
div_fn = get_method(
|
129
|
-
a =
|
130
|
-
b =
|
133
|
+
div_fn = get_method(_DIVERGENCE_FN_MAP, method)
|
134
|
+
a = ensure_embeddings(emb_a, dtype=np.float64)
|
135
|
+
b = ensure_embeddings(emb_b, dtype=np.float64)
|
131
136
|
N = a.shape[0]
|
132
137
|
M = b.shape[0]
|
133
138
|
|
@@ -10,17 +10,17 @@ __all__ = []
|
|
10
10
|
|
11
11
|
from dataclasses import dataclass
|
12
12
|
|
13
|
-
from numpy.typing import ArrayLike
|
14
13
|
from sklearn.metrics import average_precision_score
|
15
14
|
|
16
|
-
from dataeval.
|
17
|
-
from dataeval.
|
15
|
+
from dataeval._output import Output, set_metadata
|
16
|
+
from dataeval.typing import ArrayLike
|
17
|
+
from dataeval.utils._array import as_numpy
|
18
18
|
|
19
19
|
|
20
20
|
@dataclass(frozen=True)
|
21
21
|
class UAPOutput(Output):
|
22
22
|
"""
|
23
|
-
Output class for :func
|
23
|
+
Output class for :func:`.uap` estimator metric.
|
24
24
|
|
25
25
|
Attributes
|
26
26
|
----------
|
@@ -21,15 +21,15 @@ __all__ = [
|
|
21
21
|
"visualstats",
|
22
22
|
]
|
23
23
|
|
24
|
-
from dataeval.metrics.stats.
|
25
|
-
from dataeval.metrics.stats.
|
24
|
+
from dataeval.metrics.stats._boxratiostats import boxratiostats
|
25
|
+
from dataeval.metrics.stats._datasetstats import (
|
26
26
|
ChannelStatsOutput,
|
27
27
|
DatasetStatsOutput,
|
28
28
|
channelstats,
|
29
29
|
datasetstats,
|
30
30
|
)
|
31
|
-
from dataeval.metrics.stats.
|
32
|
-
from dataeval.metrics.stats.
|
33
|
-
from dataeval.metrics.stats.
|
34
|
-
from dataeval.metrics.stats.
|
35
|
-
from dataeval.metrics.stats.
|
31
|
+
from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, dimensionstats
|
32
|
+
from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
|
33
|
+
from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
|
34
|
+
from dataeval.metrics.stats._pixelstats import PixelStatsOutput, pixelstats
|
35
|
+
from dataeval.metrics.stats._visualstats import VisualStatsOutput, visualstats
|