dataeval 0.72.1__py3-none-any.whl → 0.73.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. dataeval/__init__.py +4 -4
  2. dataeval/detectors/__init__.py +4 -3
  3. dataeval/detectors/drift/__init__.py +9 -10
  4. dataeval/{_internal/detectors → detectors}/drift/base.py +39 -91
  5. dataeval/{_internal/detectors → detectors}/drift/cvm.py +4 -3
  6. dataeval/{_internal/detectors → detectors}/drift/ks.py +4 -3
  7. dataeval/{_internal/detectors → detectors}/drift/mmd.py +23 -25
  8. dataeval/{_internal/detectors → detectors}/drift/torch.py +13 -11
  9. dataeval/{_internal/detectors → detectors}/drift/uncertainty.py +7 -5
  10. dataeval/detectors/drift/updates.py +61 -0
  11. dataeval/detectors/linters/__init__.py +3 -3
  12. dataeval/{_internal/detectors → detectors/linters}/clusterer.py +41 -39
  13. dataeval/{_internal/detectors → detectors/linters}/duplicates.py +19 -9
  14. dataeval/{_internal/detectors → detectors/linters}/merged_stats.py +3 -1
  15. dataeval/{_internal/detectors → detectors/linters}/outliers.py +14 -21
  16. dataeval/detectors/ood/__init__.py +6 -6
  17. dataeval/{_internal/detectors → detectors}/ood/ae.py +20 -12
  18. dataeval/detectors/ood/aegmm.py +66 -0
  19. dataeval/{_internal/detectors → detectors}/ood/base.py +33 -21
  20. dataeval/{_internal/detectors → detectors}/ood/llr.py +43 -33
  21. dataeval/detectors/ood/metadata_ks_compare.py +99 -0
  22. dataeval/detectors/ood/metadata_least_likely.py +119 -0
  23. dataeval/detectors/ood/metadata_ood_mi.py +92 -0
  24. dataeval/{_internal/detectors → detectors}/ood/vae.py +23 -17
  25. dataeval/detectors/ood/vaegmm.py +75 -0
  26. dataeval/interop.py +56 -0
  27. dataeval/metrics/__init__.py +1 -1
  28. dataeval/metrics/bias/__init__.py +4 -4
  29. dataeval/{_internal/metrics → metrics/bias}/balance.py +75 -13
  30. dataeval/{_internal/metrics → metrics/bias}/coverage.py +41 -7
  31. dataeval/{_internal/metrics → metrics/bias}/diversity.py +75 -18
  32. dataeval/metrics/bias/metadata.py +358 -0
  33. dataeval/{_internal/metrics → metrics/bias}/parity.py +54 -44
  34. dataeval/metrics/estimators/__init__.py +3 -3
  35. dataeval/{_internal/metrics → metrics/estimators}/ber.py +25 -22
  36. dataeval/{_internal/metrics → metrics/estimators}/divergence.py +11 -12
  37. dataeval/{_internal/metrics → metrics/estimators}/uap.py +5 -3
  38. dataeval/metrics/stats/__init__.py +7 -7
  39. dataeval/{_internal/metrics → metrics}/stats/base.py +59 -35
  40. dataeval/{_internal/metrics → metrics}/stats/boxratiostats.py +18 -14
  41. dataeval/{_internal/metrics → metrics}/stats/datasetstats.py +18 -16
  42. dataeval/{_internal/metrics → metrics}/stats/dimensionstats.py +9 -7
  43. dataeval/metrics/stats/hashstats.py +156 -0
  44. dataeval/{_internal/metrics → metrics}/stats/labelstats.py +5 -3
  45. dataeval/{_internal/metrics → metrics}/stats/pixelstats.py +9 -8
  46. dataeval/{_internal/metrics → metrics}/stats/visualstats.py +10 -9
  47. dataeval/{_internal/output.py → output.py} +26 -6
  48. dataeval/utils/__init__.py +8 -3
  49. dataeval/utils/image.py +71 -0
  50. dataeval/utils/lazy.py +26 -0
  51. dataeval/utils/metadata.py +258 -0
  52. dataeval/utils/shared.py +151 -0
  53. dataeval/{_internal → utils}/split_dataset.py +98 -33
  54. dataeval/utils/tensorflow/__init__.py +7 -6
  55. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/gmm.py +8 -2
  56. dataeval/{_internal/models/tensorflow/losses.py → utils/tensorflow/_internal/loss.py} +28 -18
  57. dataeval/{_internal/models/tensorflow/pixelcnn.py → utils/tensorflow/_internal/models.py} +387 -97
  58. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/trainer.py +15 -6
  59. dataeval/{_internal/models/tensorflow → utils/tensorflow/_internal}/utils.py +84 -85
  60. dataeval/utils/tensorflow/loss/__init__.py +6 -2
  61. dataeval/utils/torch/__init__.py +7 -3
  62. dataeval/{_internal/models/pytorch → utils/torch}/blocks.py +19 -14
  63. dataeval/{_internal → utils/torch}/datasets.py +48 -42
  64. dataeval/utils/torch/models.py +138 -0
  65. dataeval/{_internal/models/pytorch/autoencoder.py → utils/torch/trainer.py} +7 -136
  66. dataeval/{_internal → utils/torch}/utils.py +3 -1
  67. dataeval/workflows/__init__.py +1 -1
  68. dataeval/{_internal/workflows → workflows}/sufficiency.py +39 -34
  69. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/METADATA +4 -3
  70. dataeval-0.73.0.dist-info/RECORD +73 -0
  71. dataeval/_internal/detectors/__init__.py +0 -0
  72. dataeval/_internal/detectors/drift/__init__.py +0 -0
  73. dataeval/_internal/detectors/ood/__init__.py +0 -0
  74. dataeval/_internal/detectors/ood/aegmm.py +0 -78
  75. dataeval/_internal/detectors/ood/vaegmm.py +0 -89
  76. dataeval/_internal/interop.py +0 -49
  77. dataeval/_internal/metrics/__init__.py +0 -0
  78. dataeval/_internal/metrics/stats/hashstats.py +0 -75
  79. dataeval/_internal/metrics/utils.py +0 -447
  80. dataeval/_internal/models/__init__.py +0 -0
  81. dataeval/_internal/models/pytorch/__init__.py +0 -0
  82. dataeval/_internal/models/pytorch/utils.py +0 -67
  83. dataeval/_internal/models/tensorflow/__init__.py +0 -0
  84. dataeval/_internal/models/tensorflow/autoencoder.py +0 -320
  85. dataeval/_internal/workflows/__init__.py +0 -0
  86. dataeval/detectors/drift/kernels/__init__.py +0 -10
  87. dataeval/detectors/drift/updates/__init__.py +0 -8
  88. dataeval/utils/tensorflow/models/__init__.py +0 -9
  89. dataeval/utils/tensorflow/recon/__init__.py +0 -3
  90. dataeval/utils/torch/datasets/__init__.py +0 -12
  91. dataeval/utils/torch/models/__init__.py +0 -11
  92. dataeval/utils/torch/trainer/__init__.py +0 -7
  93. dataeval-0.72.1.dist-info/RECORD +0 -81
  94. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/LICENSE.txt +0 -0
  95. {dataeval-0.72.1.dist-info → dataeval-0.73.0.dist-info}/WHEEL +0 -0
@@ -1,78 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- from typing import Callable
12
-
13
- import tensorflow as tf
14
- import tf_keras as keras
15
- from numpy.typing import ArrayLike
16
-
17
- from dataeval._internal.detectors.ood.base import OODGMMBase, OODScoreOutput
18
- from dataeval._internal.interop import to_numpy
19
- from dataeval._internal.models.tensorflow.autoencoder import AEGMM
20
- from dataeval._internal.models.tensorflow.gmm import gmm_energy
21
- from dataeval._internal.models.tensorflow.losses import LossGMM
22
- from dataeval._internal.models.tensorflow.utils import predict_batch
23
- from dataeval._internal.output import set_metadata
24
-
25
-
26
- class OOD_AEGMM(OODGMMBase):
27
- """
28
- AE with Gaussian Mixture Model based outlier detector.
29
-
30
- Parameters
31
- ----------
32
- model : AEGMM
33
- An AEGMM model.
34
- """
35
-
36
- def __init__(self, model: AEGMM) -> None:
37
- super().__init__(model)
38
-
39
- def fit(
40
- self,
41
- x_ref: ArrayLike,
42
- threshold_perc: float = 100.0,
43
- loss_fn: Callable[..., tf.Tensor] | None = None,
44
- optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
45
- epochs: int = 20,
46
- batch_size: int = 64,
47
- verbose: bool = True,
48
- ) -> None:
49
- if loss_fn is None:
50
- loss_fn = LossGMM()
51
- super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
52
-
53
- @set_metadata("dataeval.detectors")
54
- def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
55
- """
56
- Compute the :term:`out of distribution<Out-of-distribution (OOD)>` score for a given dataset.
57
-
58
- Parameters
59
- ----------
60
- X : ArrayLike
61
- Input data to score.
62
- batch_size : int, default 1e10
63
- Number of instances to process in each batch.
64
- Use a smaller batch size if your dataset is large or if you encounter memory issues.
65
-
66
- Returns
67
- -------
68
- OODScoreOutput
69
- An object containing the instance-level OOD score.
70
-
71
- Note
72
- ----
73
- This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
74
- """
75
- self._validate(X := to_numpy(X))
76
- _, z, _ = predict_batch(X, self.model, batch_size=batch_size)
77
- energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
78
- return OODScoreOutput(energy.numpy()) # type: ignore
@@ -1,89 +0,0 @@
1
- """
2
- Source code derived from Alibi-Detect 0.11.4
3
- https://github.com/SeldonIO/alibi-detect/tree/v0.11.4
4
-
5
- Original code Copyright (c) 2023 Seldon Technologies Ltd
6
- Licensed under Apache Software License (Apache 2.0)
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- from typing import Callable
12
-
13
- import numpy as np
14
- import tensorflow as tf
15
- import tf_keras as keras
16
- from numpy.typing import ArrayLike
17
-
18
- from dataeval._internal.detectors.ood.base import OODGMMBase, OODScoreOutput
19
- from dataeval._internal.interop import to_numpy
20
- from dataeval._internal.models.tensorflow.autoencoder import VAEGMM
21
- from dataeval._internal.models.tensorflow.gmm import gmm_energy
22
- from dataeval._internal.models.tensorflow.losses import Elbo, LossGMM
23
- from dataeval._internal.models.tensorflow.utils import predict_batch
24
- from dataeval._internal.output import set_metadata
25
-
26
-
27
- class OOD_VAEGMM(OODGMMBase):
28
- """
29
- VAE with Gaussian Mixture Model based outlier detector.
30
-
31
- Parameters
32
- ----------
33
- model : VAEGMM
34
- A VAEGMM model.
35
- samples
36
- Number of samples sampled to evaluate each instance.
37
- """
38
-
39
- def __init__(self, model: VAEGMM, samples: int = 10) -> None:
40
- super().__init__(model)
41
- self.samples = samples
42
-
43
- def fit(
44
- self,
45
- x_ref: ArrayLike,
46
- threshold_perc: float = 100.0,
47
- loss_fn: Callable[..., tf.Tensor] | None = None,
48
- optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
49
- epochs: int = 20,
50
- batch_size: int = 64,
51
- verbose: bool = True,
52
- ) -> None:
53
- if loss_fn is None:
54
- loss_fn = LossGMM(elbo=Elbo(0.05))
55
- super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
56
-
57
- @set_metadata("dataeval.detectors")
58
- def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScoreOutput:
59
- """
60
- Compute the :term:`out of distribution<Out-of-distribution (OOD)>` score for a given dataset.
61
-
62
- Parameters
63
- ----------
64
- X : ArrayLike
65
- Input data to score.
66
- batch_size : int, default 1e10
67
- Number of instances to process in each batch.
68
- Use a smaller batch size if your dataset is large or if you encounter memory issues.
69
-
70
- Returns
71
- -------
72
- OODScoreOutput
73
- An object containing the instance-level OOD score.
74
-
75
- Note
76
- ----
77
- This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
78
- """
79
- self._validate(X := to_numpy(X))
80
-
81
- # draw samples from latent space
82
- X_samples = np.repeat(X, self.samples, axis=0)
83
- _, z, _ = predict_batch(X_samples, self.model, batch_size=batch_size)
84
-
85
- # compute average energy for samples
86
- energy, _ = gmm_energy(z, self.gmm_params, return_mean=False)
87
- energy_samples = energy.numpy().reshape((-1, self.samples)) # type: ignore
88
- iscore = np.mean(energy_samples, axis=-1)
89
- return OODScoreOutput(iscore)
@@ -1,49 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from importlib import import_module
4
- from typing import Any, Iterable, Iterator
5
-
6
- import numpy as np
7
- from numpy.typing import ArrayLike, NDArray
8
-
9
- module_cache = {}
10
-
11
-
12
- def try_import(module_name):
13
- if module_name in module_cache:
14
- return module_cache[module_name]
15
-
16
- try:
17
- module = import_module(module_name)
18
- except ImportError: # pragma: no cover - covered by test_mindeps.py
19
- module = None
20
-
21
- module_cache[module_name] = module
22
- return module
23
-
24
-
25
- def as_numpy(array: ArrayLike | None) -> NDArray[Any]:
26
- return to_numpy(array, copy=False)
27
-
28
-
29
- def to_numpy(array: ArrayLike | None, copy: bool = True) -> NDArray[Any]:
30
- if array is None:
31
- return np.ndarray([])
32
-
33
- if isinstance(array, np.ndarray):
34
- return array.copy() if copy else array
35
-
36
- tf = try_import("tensorflow")
37
- if tf and tf.is_tensor(array):
38
- return array.numpy().copy() if copy else array.numpy() # type: ignore
39
-
40
- torch = try_import("torch")
41
- if torch and isinstance(array, torch.Tensor):
42
- return array.detach().cpu().numpy().copy() if copy else array.detach().cpu().numpy() # type: ignore
43
-
44
- return np.array(array, copy=copy)
45
-
46
-
47
- def to_numpy_iter(iterable: Iterable[ArrayLike]) -> Iterator[NDArray[Any]]:
48
- for array in iterable:
49
- yield to_numpy(array)
File without changes
@@ -1,75 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from typing import Iterable
5
-
6
- from numpy.typing import ArrayLike
7
-
8
- from dataeval._internal.metrics.stats.base import BaseStatsOutput, StatsProcessor, run_stats
9
- from dataeval._internal.metrics.utils import pchash, xxhash
10
- from dataeval._internal.output import set_metadata
11
-
12
-
13
- @dataclass(frozen=True)
14
- class HashStatsOutput(BaseStatsOutput):
15
- """
16
- Output class for :func:`hashstats` stats metric
17
-
18
- Attributes
19
- ----------
20
- xxhash : List[str]
21
- xxHash hash of the images as a hex string
22
- pchash : List[str]
23
- :term:`Perception-based Hash` of the images as a hex string
24
- """
25
-
26
- xxhash: list[str]
27
- pchash: list[str]
28
-
29
-
30
- class HashStatsProcessor(StatsProcessor[HashStatsOutput]):
31
- output_class = HashStatsOutput
32
- image_function_map = {
33
- "xxhash": lambda x: xxhash(x.image),
34
- "pchash": lambda x: pchash(x.image),
35
- }
36
-
37
-
38
- @set_metadata("dataeval.metrics")
39
- def hashstats(
40
- images: Iterable[ArrayLike],
41
- bboxes: Iterable[ArrayLike] | None = None,
42
- ) -> HashStatsOutput:
43
- """
44
- Calculates hashes for each image
45
-
46
- This function computes hashes from the images including exact hashes and perception-based
47
- hashes. These hash values can be used to determine if images are exact or near matches.
48
-
49
- Parameters
50
- ----------
51
- images : Iterable[ArrayLike]
52
- Images to hash
53
- bboxes : Iterable[ArrayLike] or None
54
- Bounding boxes in `xyxy` format for each image
55
-
56
- Returns
57
- -------
58
- HashStatsOutput
59
- A dictionary-like object containing the computed hashes for each image.
60
-
61
- See Also
62
- --------
63
- :term:`Duplicates`
64
-
65
- Examples
66
- --------
67
- Calculating the :term:`statistics<Statistics>` on the images, whose shape is (C, H, W)
68
-
69
- >>> results = hashstats(images)
70
- >>> print(results.xxhash)
71
- ['a72434443d6e7336', 'efc12c2f14581d79', '4a1e03483a27d674', '3a3ecedbcf814226']
72
- >>> print(results.pchash)
73
- ['8f25506af46a7c6a', '8000808000008080', '8e71f18e0ef18e0e', 'a956d6a956d6a928']
74
- """
75
- return run_stats(images, bboxes, False, [HashStatsProcessor])[0]