dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataeval/__init__.py +4 -4
- dataeval/detectors/__init__.py +4 -3
- dataeval/detectors/drift/__init__.py +9 -10
- dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
- dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
- dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
- dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
- dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
- dataeval/detectors/drift/updates.py +61 -0
- dataeval/detectors/linters/__init__.py +3 -3
- dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
- dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
- dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
- dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
- dataeval/detectors/ood/__init__.py +6 -6
- dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
- dataeval/detectors/ood/aegmm.py +66 -0
- dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
- dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
- dataeval/detectors/ood/metadata_ks_compare.py +99 -0
- dataeval/detectors/ood/metadata_least_likely.py +119 -0
- dataeval/detectors/ood/metadata_ood_mi.py +92 -0
- dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
- dataeval/detectors/ood/vaegmm.py +75 -0
- dataeval/interop.py +56 -0
- dataeval/metrics/__init__.py +1 -1
- dataeval/metrics/bias/__init__.py +4 -4
- dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
- dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
- dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
- dataeval/metrics/bias/metadata.py +358 -0
- dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
- dataeval/metrics/estimators/__init__.py +3 -3
- dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
- dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
- dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
- dataeval/metrics/stats/__init__.py +7 -7
- dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
- dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
- dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
- dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
- dataeval/metrics/stats/hashstats.py +156 -0
- dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
- dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
- dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
- dataeval/{_internal/output.py → output.py} +26 -6
- dataeval/utils/__init__.py +8 -3
- dataeval/utils/image.py +71 -0
- dataeval/utils/lazy.py +26 -0
- dataeval/utils/metadata.py +258 -0
- dataeval/utils/shared.py +151 -0
- dataeval/{_internal → utils}/split_dataset.py +98 -33
- dataeval/utils/tensorflow/__init__.py +7 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
- dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
- dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
- dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
- dataeval/utils/tensorflow/loss/__init__.py +6 -2
- dataeval/utils/torch/__init__.py +7 -3
- dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
- dataeval/{_internal → utils/torch}/datasets.py +48 -42
- dataeval/utils/torch/models.py +138 -0
- dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
- dataeval/{_internal → utils/torch}/utils.py +3 -1
- dataeval/workflows/__init__.py +1 -1
- dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
- dataeval-0.73.0.dist-info/RECORD +73 -0
- dataeval/_internal/detectors/__init__.py +0 -0
- dataeval/_internal/detectors/drift/__init__.py +0 -0
- dataeval/_internal/detectors/ood/__init__.py +0 -0
- dataeval/_internal/detectors/ood/aegmm.py +0 -78
- dataeval/_internal/detectors/ood/vaegmm.py +0 -89
- dataeval/_internal/interop.py +0 -49
- dataeval/_internal/metrics/__init__.py +0 -0
- dataeval/_internal/metrics/stats/hashstats.py +0 -75
- dataeval/_internal/metrics/utils.py +0 -447
- dataeval/_internal/models/__init__.py +0 -0
- dataeval/_internal/models/pytorch/__init__.py +0 -0
- dataeval/_internal/models/pytorch/utils.py +0 -67
- dataeval/_internal/models/tensorflow/__init__.py +0 -0
- dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
- dataeval/_internal/workflows/__init__.py +0 -0
- dataeval/detectors/drift/kernels/__init__.py +0 -10
- dataeval/detectors/drift/updates/__init__.py +0 -8
- dataeval/utils/tensorflow/models/__init__.py +0 -9
- dataeval/utils/tensorflow/recon/__init__.py +0 -3
- dataeval/utils/torch/datasets/__init__.py +0 -12
- dataeval/utils/torch/models/__init__.py +0 -11
- dataeval/utils/torch/trainer/__init__.py +0 -7
- dataeval-0.72.1.dist-info/RECORD +0 -81
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
- {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
@@ -1,78 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
from typing import Callable
|
12
|
-
|
13
|
-
import tensorflow as tf
|
14
|
-
import tf_keras as keras
|
15
|
-
from numpy.typing import ArrayLike
|
16
|
-
|
17
|
-
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScoreOutput
|
18
|
-
from dataeval._internal.interop import to_numpy
|
19
|
-
from dataeval._internal.models.tensorflow.autoencoder import AEGMM
|
20
|
-
from dataeval._internal.models.tensorflow.gmm import gmm_energy
|
21
|
-
from dataeval._internal.models.tensorflow.losses import LossGMM
|
22
|
-
from dataeval._internal.models.tensorflow.utils import predict_batch
|
23
|
-
from dataeval._internal.output import set_metadata
|
24
|
-
|
25
|
-
|
26
|
-
class OOD_AEGMM(OODGMMBase):
    """
    Outlier detector based on an autoencoder combined with a Gaussian Mixture Model.

    Parameters
    ----------
    model : AEGMM
        The AEGMM model instance handed to the base detector.
    """

    def __init__(self, model: AEGMM) -> None:
        super().__init__(model)

    def fit(
        self,
        x_ref: ArrayLike,
        threshold_perc: float = 100.0,
        loss_fn: Callable[..., tf.Tensor] | None = None,
        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
    ) -> None:
        """
        Fit the detector by delegating to the base class ``fit``.

        If ``loss_fn`` is None, a default-constructed ``LossGMM`` is substituted.
        Every argument is then forwarded positionally, in signature order, to
        ``OODGMMBase.fit``.
        """
        selected_loss = LossGMM() if loss_fn is None else loss_fn
        super().fit(x_ref, threshold_perc, selected_loss, optimizer, epochs, batch_size, verbose)

    @set_metadata("dataeval.detectors")
    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
        """
        Compute the :term:`out of distribution<Out-of-distribution (OOD)>` score for a given dataset.

        Parameters
        ----------
        X : ArrayLike
            Input data to score.
        batch_size : int, default 1e10
            Number of instances processed per batch. Lower this value for large
            datasets or when running into memory issues.

        Returns
        -------
        OODScoreOutput
            An object containing the instance-level OOD score.

        Note
        ----
        This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
        """
        # Convert first, then validate the converted array.
        X = to_numpy(X)
        self._validate(X)

        # Only the second element of the model prediction is used for the energy.
        _, latent, _ = predict_batch(X, self.model, batch_size=batch_size)
        sample_energy, _ = gmm_energy(latent, self.gmm_params, return_mean=False)
        return OODScoreOutput(sample_energy.numpy())  # type: ignore
|
@@ -1,89 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Source code derived from Alibi-Detect 0.11.4
|
3
|
-
https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
|
4
|
-
|
5
|
-
Original code Copyright (c) 2023 Seldon Technologies Ltd
|
6
|
-
Licensed under Apache Software License (Apache 2.0)
|
7
|
-
"""
|
8
|
-
|
9
|
-
from __future__ import annotations
|
10
|
-
|
11
|
-
from typing import Callable
|
12
|
-
|
13
|
-
import numpy as np
|
14
|
-
import tensorflow as tf
|
15
|
-
import tf_keras as keras
|
16
|
-
from numpy.typing import ArrayLike
|
17
|
-
|
18
|
-
from dataeval._internal.detectors.ood.base import OODGMMBase, OODScoreOutput
|
19
|
-
from dataeval._internal.interop import to_numpy
|
20
|
-
from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
|
21
|
-
from dataeval._internal.models.tensorflow.gmm import gmm_energy
|
22
|
-
from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
|
23
|
-
from dataeval._internal.models.tensorflow.utils import predict_batch
|
24
|
-
from dataeval._internal.output import set_metadata
|
25
|
-
|
26
|
-
|
27
|
-
class OOD_VAEGMM(OODGMMBase):
    """
    Outlier detector based on a variational autoencoder combined with a Gaussian Mixture Model.

    Parameters
    ----------
    model : VAEGMM
        The VAEGMM model instance handed to the base detector.
    samples : int, default 10
        Number of samples drawn to evaluate each instance.
    """

    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
        super().__init__(model)
        self.samples = samples

    def fit(
        self,
        x_ref: ArrayLike,
        threshold_perc: float = 100.0,
        loss_fn: Callable[..., tf.Tensor] | None = None,
        optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
        epochs: int = 20,
        batch_size: int = 64,
        verbose: bool = True,
    ) -> None:
        """
        Fit the detector by delegating to the base class ``fit``.

        If ``loss_fn`` is None, ``LossGMM(elbo=Elbo(0.05))`` is substituted.
        Every argument is then forwarded positionally, in signature order, to
        ``OODGMMBase.fit``.
        """
        selected_loss = LossGMM(elbo=Elbo(0.05)) if loss_fn is None else loss_fn
        super().fit(x_ref, threshold_perc, selected_loss, optimizer, epochs, batch_size, verbose)

    @set_metadata("dataeval.detectors")
    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
        """
        Compute the :term:`out of distribution<Out-of-distribution (OOD)>` score for a given dataset.

        Parameters
        ----------
        X : ArrayLike
            Input data to score.
        batch_size : int, default 1e10
            Number of instances processed per batch. Lower this value for large
            datasets or when running into memory issues.

        Returns
        -------
        OODScoreOutput
            An object containing the instance-level OOD score.

        Note
        ----
        This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
        """
        # Convert first, then validate the converted array.
        X = to_numpy(X)
        self._validate(X)

        # Repeat every instance `samples` times along axis 0 so the model is
        # evaluated several times per instance.
        repeated = np.repeat(X, self.samples, axis=0)
        _, latent, _ = predict_batch(repeated, self.model, batch_size=batch_size)

        # One energy per repeated row; regroup into (instances, samples) and average.
        sample_energy, _ = gmm_energy(latent, self.gmm_params, return_mean=False)
        per_instance = sample_energy.numpy().reshape((-1, self.samples))  # type: ignore
        return OODScoreOutput(np.mean(per_instance, axis=-1))
|
dataeval/_internal/interop.py
DELETED
@@ -1,49 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from importlib import import_module
|
4
|
-
from typing import Any, Iterable, Iterator
|
5
|
-
|
6
|
-
import numpy as np
|
7
|
-
from numpy.typing import ArrayLike, NDArray
|
8
|
-
|
9
|
-
# Memo of import attempts keyed by module name; a failed import is stored as None.
module_cache = {}


def try_import(module_name):
    """
    Import a module by name, returning None instead of raising when it is unavailable.

    The outcome (module object or None) is remembered in ``module_cache`` so each
    name is only attempted once.
    """
    if module_name not in module_cache:
        try:
            module_cache[module_name] = import_module(module_name)
        except ImportError:  # pragma: no cover - covered by test_mindeps.py
            module_cache[module_name] = None

    return module_cache[module_name]
|
23
|
-
|
24
|
-
|
25
|
-
def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
    """Convert ``array`` through ``to_numpy`` with copying disabled."""
    converted = to_numpy(array, copy=False)
    return converted
|
27
|
-
|
28
|
-
|
29
|
-
def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
    """
    Convert an array-like object to a NumPy array.

    Parameters
    ----------
    array : ArrayLike or None
        Object to convert. NumPy arrays, TensorFlow tensors and PyTorch tensors
        are handled explicitly; anything else is passed to NumPy for conversion.
    copy : bool, default True
        When True the result is a copy. When False the data is returned without
        copying where possible; a copy is still made if the conversion needs one.

    Returns
    -------
    NDArray[Any]
        The converted array, or an empty array (``size == 0``) when ``array`` is None.
    """
    if array is None:
        # np.ndarray([]) would allocate a 0-d array holding an uninitialised
        # value; np.array([]) is the deterministic empty array.
        return np.array([])

    if isinstance(array, np.ndarray):
        return array.copy() if copy else array

    # Frameworks are optional dependencies, so they are only probed lazily.
    tf = try_import("tensorflow")
    if tf and tf.is_tensor(array):
        return array.numpy().copy() if copy else array.numpy()  # type: ignore

    torch = try_import("torch")
    if torch and isinstance(array, torch.Tensor):
        return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy()  # type: ignore

    # np.array(..., copy=False) raises under NumPy 2 when a copy cannot be
    # avoided (e.g. for lists); np.asarray copies only when it has to.
    return np.array(array) if copy else np.asarray(array)
|
45
|
-
|
46
|
-
|
47
|
-
def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
    """Lazily yield each element of ``iterable`` converted with ``to_numpy``."""
    yield from (to_numpy(item) for item in iterable)
|
File without changes
|
@@ -1,75 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
from dataclasses import dataclass
|
4
|
-
from typing import Iterable
|
5
|
-
|
6
|
-
from numpy.typing import ArrayLike
|
7
|
-
|
8
|
-
from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
|
9
|
-
from dataeval._internal.metrics.utils import pchash, xxhash
|
10
|
-
from dataeval._internal.output import set_metadata
|
11
|
-
|
12
|
-
|
13
|
-
@dataclass(frozen=True)
class HashStatsOutput(BaseStatsOutput):
    """
    Result container returned by the :func:`hashstats` stats metric.

    Attributes
    ----------
    xxhash : list[str]
        xxHash hash of each image, as a hex string
    pchash : list[str]
        :term:`Perception-based Hash` of each image, as a hex string
    """

    xxhash: list[str]
    pchash: list[str]
|
28
|
-
|
29
|
-
|
30
|
-
class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
    """Stats processor that produces the ``xxhash`` and ``pchash`` fields of ``HashStatsOutput``."""

    output_class = HashStatsOutput
    # Keys match the HashStatsOutput field names; each callable hashes the
    # `.image` attribute of the object it is given.
    image_function_map = {
        "xxhash": lambda processor: xxhash(processor.image),
        "pchash": lambda processor: pchash(processor.image),
    }
|
36
|
-
|
37
|
-
|
38
|
-
@set_metadata("dataeval.metrics")
def hashstats(
    images: Iterable[ArrayLike],
    bboxes: Iterable[ArrayLike] | None = None,
) -> HashStatsOutput:
    """
    Calculates hashes for each image

    Both an exact hash and a perception-based hash are computed per image. The
    resulting values can be compared to find images that are exact or near matches.

    Parameters
    ----------
    images : Iterable[ArrayLike]
        Images to hash
    bboxes : Iterable[ArrayLike] or None
        Bounding boxes in `xyxy` format for each image

    Returns
    -------
    HashStatsOutput
        A dictionary-like object containing the computed hashes for each image.

    See Also
    --------
    :term:`Duplicates`

    Examples
    --------
    Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)

    >>> results = hashstats(images)
    >>> print(results.xxhash)
    ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
    >>> print(results.pchash)
    ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
    """
    # A single processor is requested, so the first result is the one wanted.
    outputs = run_stats(images, bboxes, False, [HashStatsProcessor])
    return outputs[0]
|