dataeval 0.81.0__tar.gz → 0.82.0__tar.gz
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- {dataeval-0.81.0 → dataeval-0.82.0}/PKG-INFO +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/README.md +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/pyproject.toml +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/__init__.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/__init__.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_base.py +8 -64
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_mmd.py +3 -29
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_uncertainty.py +2 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/updates.py +20 -3
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/__init__.py +3 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/duplicates.py +11 -43
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/outliers.py +22 -156
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/__init__.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/ae.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/mixin.py +2 -3
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/vae.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metadata/__init__.py +2 -1
- dataeval-0.82.0/src/dataeval/metadata/_distance.py +167 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metadata/_ood.py +25 -46
- dataeval-0.82.0/src/dataeval/metadata/_utils.py +44 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/__init__.py +5 -4
- dataeval-0.82.0/src/dataeval/metrics/bias/_balance.py +168 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_coverage.py +4 -106
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_diversity.py +9 -107
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_parity.py +5 -71
- dataeval-0.82.0/src/dataeval/metrics/estimators/__init__.py +20 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_ber.py +2 -20
- dataeval-0.82.0/src/dataeval/metrics/estimators/_clusterer.py +44 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_divergence.py +2 -19
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_uap.py +2 -16
- dataeval-0.82.0/src/dataeval/metrics/stats/__init__.py +38 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_base.py +38 -125
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_boxratiostats.py +12 -12
- dataeval-0.82.0/src/dataeval/metrics/stats/_dimensionstats.py +75 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_hashstats.py +19 -35
- dataeval-0.82.0/src/dataeval/metrics/stats/_imagestats.py +94 -0
- dataeval-0.82.0/src/dataeval/metrics/stats/_labelstats.py +131 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_pixelstats.py +19 -51
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_visualstats.py +19 -51
- dataeval-0.82.0/src/dataeval/outputs/__init__.py +53 -0
- dataeval-0.81.0/src/dataeval/_output.py → dataeval-0.82.0/src/dataeval/outputs/_base.py +53 -37
- dataeval-0.82.0/src/dataeval/outputs/_bias.py +381 -0
- dataeval-0.82.0/src/dataeval/outputs/_drift.py +83 -0
- dataeval-0.81.0/src/dataeval/metrics/estimators/_clusterer.py → dataeval-0.82.0/src/dataeval/outputs/_estimators.py +42 -32
- dataeval-0.82.0/src/dataeval/outputs/_linters.py +184 -0
- dataeval-0.81.0/src/dataeval/detectors/ood/output.py → dataeval-0.82.0/src/dataeval/outputs/_ood.py +22 -22
- dataeval-0.82.0/src/dataeval/outputs/_stats.py +387 -0
- dataeval-0.82.0/src/dataeval/outputs/_utils.py +44 -0
- dataeval-0.81.0/src/dataeval/workflows/sufficiency.py → dataeval-0.82.0/src/dataeval/outputs/_workflows.py +206 -415
- dataeval-0.82.0/src/dataeval/typing.py +234 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_method.py +1 -5
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_plot.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/__init__.py +5 -1
- dataeval-0.82.0/src/dataeval/utils/data/_dataset.py +217 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_embeddings.py +4 -5
- dataeval-0.82.0/src/dataeval/utils/data/_images.py +68 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_metadata.py +15 -7
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_selection.py +22 -15
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_split.py +2 -27
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_targets.py +14 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_base.py +5 -5
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_cifar10.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_milco.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_mnist.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_ships.py +1 -1
- {dataeval-0.81.0/src/dataeval/utils/data → dataeval-0.82.0/src/dataeval/utils/data/datasets}/_types.py +10 -16
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_voc.py +1 -1
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_classfilter.py +4 -7
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_indices.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_limit.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_reverse.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_shuffle.py +2 -2
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/workflows/__init__.py +2 -1
- dataeval-0.82.0/src/dataeval/workflows/sufficiency.py +237 -0
- dataeval-0.81.0/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -129
- dataeval-0.81.0/src/dataeval/metrics/bias/_balance.py +0 -304
- dataeval-0.81.0/src/dataeval/metrics/estimators/__init__.py +0 -19
- dataeval-0.81.0/src/dataeval/metrics/stats/__init__.py +0 -35
- dataeval-0.81.0/src/dataeval/metrics/stats/_datasetstats.py +0 -198
- dataeval-0.81.0/src/dataeval/metrics/stats/_dimensionstats.py +0 -116
- dataeval-0.81.0/src/dataeval/metrics/stats/_labelstats.py +0 -210
- dataeval-0.81.0/src/dataeval/typing.py +0 -54
- dataeval-0.81.0/src/dataeval/utils/data/_images.py +0 -65
- {dataeval-0.81.0 → dataeval-0.82.0}/LICENSE.txt +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/_log.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/config.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_cvm.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_ks.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_torch.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/base.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/py.typed +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_array.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_bin.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_clusterer.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_fast_mst.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_image.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_mst.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/collate.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_fileio.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_mixin.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/metadata.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/__init__.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_blocks.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_gmm.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_internal.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/models.py +0 -0
- {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/trainer.py +0 -0
```diff
--- dataeval-0.81.0/PKG-INFO
+++ dataeval-0.82.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.81.0
+Version: 0.82.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -74,7 +74,7 @@ DataEval is easy to install, supports a wide range of Python versions, and is
 compatible with many of the most popular packages in the scientific and T&E
 communities.
 
-DataEval also has native
+DataEval also has native interoperability between JATIC's suite of tools when
 using MAITE-compliant datasets and models.
 <!-- end JATIC interop -->
 
```
```diff
--- dataeval-0.81.0/README.md
+++ dataeval-0.82.0/README.md
@@ -32,7 +32,7 @@ DataEval is easy to install, supports a wide range of Python versions, and is
 compatible with many of the most popular packages in the scientific and T&E
 communities.
 
-DataEval also has native
+DataEval also has native interoperability between JATIC's suite of tools when
 using MAITE-compliant datasets and models.
 <!-- end JATIC interop -->
 
```
```diff
--- dataeval-0.81.0/pyproject.toml
+++ dataeval-0.82.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.81.0" # dynamic
+version = "0.82.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/drift/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/__init__.py
@@ -14,9 +14,9 @@ __all__ = [
 ]
 
 from dataeval.detectors.drift import updates
-from dataeval.detectors.drift._base import DriftOutput
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
-from dataeval.detectors.drift._mmd import DriftMMD, DriftMMDOutput
+from dataeval.detectors.drift._mmd import DriftMMD
 from dataeval.detectors.drift._torch import preprocess_drift
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
+from dataeval.outputs._drift import DriftMMDOutput, DriftOutput
```
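The drift output dataclasses now live in the new `dataeval.outputs` package and are only re-exported here. A hedged sketch of what the re-export implies (assuming `__all__` continues to expose `DriftOutput` from the detectors package):

```python
# Both import paths should resolve to the same relocated class in 0.82.0,
# since detectors.drift now re-exports from dataeval.outputs._drift.
from dataeval.detectors.drift import DriftOutput as reexported
from dataeval.outputs import DriftOutput as canonical

assert reexported is canonical
```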
```diff
--- dataeval-0.81.0/src/dataeval/detectors/drift/_base.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_base.py
@@ -11,84 +11,28 @@ from __future__ import annotations
 __all__ = []
 
 import math
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from abc import abstractmethod
 from functools import wraps
-from typing import Any, Callable, Literal, TypeVar
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable
 
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval._output import Output, set_metadata
+from dataeval.outputs import DriftOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import Array, ArrayLike
 from dataeval.utils._array import as_numpy, to_numpy
 
 R = TypeVar("R")
 
 
-class UpdateStrategy(ABC):
+@runtime_checkable
+class UpdateStrategy(Protocol):
     """
-    Updates reference dataset for drift detector
-
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-
-    def __init__(self, n: int) -> None:
-        self.n = n
-
-    @abstractmethod
-    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
-        """Abstract implementation of update strategy"""
-
-
-@dataclass(frozen=True)
-class DriftBaseOutput(Output):
-    """
-    Base output class for Drift Detector classes
-    """
-
-    drifted: bool
-    threshold: float
-    p_val: float
-    distance: float
-
-
-@dataclass(frozen=True)
-class DriftOutput(DriftBaseOutput):
-    """
-    Output class for :class:`.DriftCVM`, :class:`.DriftKS`, and :class:`.DriftUncertainty` drift detectors.
-
-    Attributes
-    ----------
-    drifted : bool
-        :term:`Drift` prediction for the images
-    threshold : float
-        Threshold after multivariate correction if needed
-    p_val : float
-        Instance-level p-value
-    distance : float
-        Instance-level distance
-    feature_drift : NDArray
-        Feature-level array of images detected to have drifted
-    feature_threshold : float
-        Feature-level threshold to determine drift
-    p_vals : NDArray
-        Feature-level p-values
-    distances : NDArray
-        Feature-level distances
+    Protocol for reference dataset update strategy for drift detectors
     """
 
-
-    # threshold: float
-    # p_val: float
-    # distance: float
-    feature_drift: NDArray[np.bool_]
-    feature_threshold: float
-    p_vals: NDArray[np.float32]
-    distances: NDArray[np.float32]
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]: ...
 
 
 def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
```
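Replacing the `UpdateStrategy` ABC with a `runtime_checkable` `Protocol` means strategies are now matched structurally: any object with a conforming `__call__` passes an `isinstance` check, no subclassing required. A minimal sketch (`KeepAllUpdate` is hypothetical, not part of dataeval):

```python
from typing import Any

import numpy as np
from numpy.typing import NDArray

from dataeval.detectors.drift._base import UpdateStrategy


class KeepAllUpdate:
    """Hypothetical strategy: append every observed batch to the reference set."""

    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
        return np.concatenate([x_ref, x], axis=0)


# Structural match against the runtime_checkable Protocol; no inheritance needed.
assert isinstance(KeepAllUpdate(), UpdateStrategy)
```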
```diff
--- dataeval-0.81.0/src/dataeval/detectors/drift/_mmd.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_mmd.py
@@ -10,44 +10,18 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
 from typing import Callable
 
 import torch
 
-from dataeval._output import set_metadata
 from dataeval.config import get_device
-from dataeval.detectors.drift._base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
+from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
 from dataeval.detectors.drift._torch import GaussianRBF, mmd2_from_kernel_matrix
+from dataeval.outputs import DriftMMDOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
 
 
-@dataclass(frozen=True)
-class DriftMMDOutput(DriftBaseOutput):
-    """
-    Output class for :class:`.DriftMMD` :term:`drift<Drift>` detector.
-
-    Attributes
-    ----------
-    drifted : bool
-        Drift prediction for the images
-    threshold : float
-        :term:`P-Value` used for significance of the permutation test
-    p_val : float
-        P-value obtained from the permutation test
-    distance : float
-        MMD^2 between the reference and test set
-    distance_threshold : float
-        MMD^2 threshold above which drift is flagged
-    """
-
-    # drifted: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
-    distance_threshold: float
-
-
 class DriftMMD(BaseDrift):
     """
     :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \
```
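`DriftMMDOutput` is unchanged apart from its new home. A hedged usage sketch (random 2-D embeddings for brevity; the `predict` call and output attributes are assumed from the detector's public API and the docstring above):

```python
import numpy as np

from dataeval.detectors.drift import DriftMMD

rng = np.random.default_rng(0)
x_ref = rng.normal(size=(64, 16)).astype(np.float32)            # reference set
x_test = rng.normal(loc=1.0, size=(64, 16)).astype(np.float32)  # shifted test set

result = DriftMMD(x_ref).predict(x_test)  # now returns dataeval.outputs.DriftMMDOutput
print(result.drifted, result.distance, result.distance_threshold)
```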
```diff
--- dataeval-0.81.0/src/dataeval/detectors/drift/_uncertainty.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_uncertainty.py
@@ -19,9 +19,10 @@ from scipy.special import softmax
 from scipy.stats import entropy
 
 from dataeval.config import get_device
-from dataeval.detectors.drift._base import DriftOutput, UpdateStrategy
+from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._torch import preprocess_drift
+from dataeval.outputs import DriftOutput
 from dataeval.typing import ArrayLike
 
 
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/drift/updates.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/updates.py
@@ -7,15 +7,32 @@ from __future__ import annotations
 
 __all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]
 
+from abc import ABC, abstractmethod
 from typing import Any
 
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval.detectors.drift._base import UpdateStrategy
+
+class BaseUpdateStrategy(ABC):
+    """
+    Updates reference dataset for drift detector
+
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+
+    def __init__(self, n: int) -> None:
+        self.n = n
+
+    @abstractmethod
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        """Abstract implementation of update strategy"""
 
 
-class LastSeenUpdate(UpdateStrategy):
+class LastSeenUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using last seen method.
 
@@ -30,7 +47,7 @@ class LastSeenUpdate(UpdateStrategy):
         return x_updated[-self.n :]
 
 
-class ReservoirSamplingUpdate(UpdateStrategy):
+class ReservoirSamplingUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.
 
```
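The built-in strategies now derive from the package-local `BaseUpdateStrategy` ABC while still satisfying the `UpdateStrategy` protocol the detectors accept. A hedged wiring sketch (the `update_x_ref` keyword is assumed from the detector constructors; shapes are illustrative):

```python
import numpy as np

from dataeval.detectors.drift import DriftKS
from dataeval.detectors.drift.updates import LastSeenUpdate

x_ref = np.random.default_rng(0).normal(size=(256, 32)).astype(np.float32)

# Keep only the 256 most recently seen instances as the reference set.
detector = DriftKS(x_ref, update_x_ref=LastSeenUpdate(n=256))
```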
|
```diff
--- dataeval-0.81.0/src/dataeval/detectors/linters/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/__init__.py
@@ -9,5 +9,6 @@ __all__ = [
     "OutliersOutput",
 ]
 
-from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
-from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
+from dataeval.detectors.linters.duplicates import Duplicates
+from dataeval.detectors.linters.outliers import Outliers
+from dataeval.outputs._linters import DuplicatesOutput, OutliersOutput
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/linters/duplicates.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/duplicates.py
@@ -2,40 +2,15 @@ from __future__ import annotations
 
 __all__ = []
 
-from dataclasses import dataclass
-from typing import Any, Generic, Iterable, Sequence, TypeVar, overload
+from typing import Any, Sequence, overload
 
-from torch.utils.data import Dataset
-
-from dataeval._output import Output, set_metadata
+from dataeval.metrics.stats import hashstats
 from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
-from dataeval.typing import ArrayLike
-
-DuplicateGroup = list[int]
-DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
-TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
-
-
-@dataclass(frozen=True)
-class DuplicatesOutput(Generic[TIndexCollection], Output):
-    """
-    Output class for :class:`.Duplicates` lint detector.
-
-    Attributes
-    ----------
-    exact : list[list[int] | dict[int, list[int]]]
-        Indices of images that are exact matches
-    near: list[list[int] | dict[int, list[int]]]
-        Indices of images that are near matches
-
-        - For a single dataset, indices are returned as a list of index groups.
-        - For multiple datasets, indices are returned as dictionaries where the key is the
-          index of the dataset, and the value is the list index groups from that dataset.
-    """
-
-    exact: list[TIndexCollection]
-    near: list[TIndexCollection]
+from dataeval.outputs import DuplicatesOutput, HashStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import DatasetDuplicateGroupMap, DuplicateGroup
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images
 
 
 class Duplicates:
@@ -134,22 +109,15 @@ class Duplicates:
 
         return DuplicatesOutput(**duplicates)
 
-    @overload
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]: ...
-    @overload
-    def evaluate(self, data: Dataset[tuple[ArrayLike, Any, dict[str, Any]]]) -> DuplicatesOutput[DuplicateGroup]: ...
-
     @set_metadata(state=["only_exact"])
-    def evaluate(
-        self, data: Iterable[ArrayLike] | Dataset[tuple[ArrayLike, Any, dict[str, Any]]]
-    ) -> DuplicatesOutput[DuplicateGroup]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches
 
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (N, C, H, W)
-            A dataset of images in an ArrayLike format
+        data : Iterable[Array], shape - (N, C, H, W) | Dataset[tuple[Array, Any, Any]]
+            A dataset of images in an Array format or the output(s) from a hashstats analysis
 
         Returns
         -------
@@ -166,7 +134,7 @@ class Duplicates:
         >>> all_dupes.evaluate(duplicate_images)
         DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
         """  # noqa: E501
-        images = (
+        images = Images(data) if isinstance(data, Dataset) else data
        self.stats = hashstats(images)
        duplicates = self._get_duplicates(self.stats.dict())
        return DuplicatesOutput(**duplicates)
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/linters/outliers.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/outliers.py
@@ -2,142 +2,19 @@ from __future__ import annotations
 
 __all__ = []
 
-import contextlib
-from dataclasses import dataclass
-from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
+from typing import Any, Literal, Sequence, overload
 
 import numpy as np
 from numpy.typing import NDArray
-from torch.utils.data import Dataset
-
-from dataeval._output import Output, set_metadata
-from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats._datasetstats import DatasetStatsOutput, datasetstats
-from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
-from dataeval.metrics.stats._labelstats import LabelStatsOutput
-from dataeval.metrics.stats._pixelstats import PixelStatsOutput
-from dataeval.metrics.stats._visualstats import VisualStatsOutput
-from dataeval.typing import ArrayLike
-
-with contextlib.suppress(ImportError):
-    import pandas as pd
-
-
-IndexIssueMap = dict[int, dict[str, float]]
-OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
-TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
-
-
-def _reorganize_by_class_and_metric(result, lstats):
-    """Flip result from grouping by image to grouping by class and metric"""
-    metrics = {}
-    class_wise = {label: {} for label in lstats.image_indices_per_label}
-
-    # Group metrics and calculate class-wise counts
-    for img, group in result.items():
-        for extreme in group:
-            metrics.setdefault(extreme, []).append(img)
-            for label, images in lstats.image_indices_per_label.items():
-                if img in images:
-                    class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
-
-    return metrics, class_wise
-
-
-def _create_table(metrics, class_wise):
-    """Create table for displaying the results"""
-    max_class_length = max(len(str(label)) for label in class_wise) + 2
-    max_total = max(len(metrics[group]) for group in metrics) + 2
-
-    table_header = " | ".join(
-        [f"{'Class':>{max_class_length}}"]
-        + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
-        + [f"{'Total':<{max_total}}"]
-    )
-    table_rows = []
-
-    for class_cat, results in class_wise.items():
-        table_value = [f"{class_cat:>{max_class_length}}"]
-        total = 0
-        for group in sorted(metrics.keys()):
-            count = results.get(group, 0)
-            table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
-            total += count
-        table_value.append(f"{total:^{max_total}}")
-        table_rows.append(" | ".join(table_value))
-
-    table = [table_header] + table_rows
-    return table
-
-
-def _create_pandas_dataframe(class_wise):
-    """Create data for pandas dataframe"""
-    data = []
-    for label, metrics_dict in class_wise.items():
-        row = {"Class": label}
-        total = sum(metrics_dict.values())
-        row.update(metrics_dict)  # Add metric counts
-        row["Total"] = total
-        data.append(row)
-    return data
-
-
-@dataclass(frozen=True)
-class OutliersOutput(Generic[TIndexIssueMap], Output):
-    """
-    Output class for :class:`.Outliers` lint detector.
-
-    Attributes
-    ----------
-    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
-        Indices of image Outliers with their associated issue type and calculated values.
-
-        - For a single dataset, a dictionary containing the indices of outliers and
-          a dictionary showing the issues and calculated values for the given index.
-        - For multiple stats outputs, a list of dictionaries containing the indices of
-          outliers and their associated issues and calculated values.
-    """
 
-
-
-
-
-
-
-
-
-    def to_table(self, labelstats: LabelStatsOutput) -> str:
-        if isinstance(self.issues, dict):
-            metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            listed_table = _create_table(metrics, classwise)
-            table = "\n".join(listed_table)
-        else:
-            outertable = []
-            for d in self.issues:
-                metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                listed_table = _create_table(metrics, classwise)
-                str_table = "\n".join(listed_table)
-                outertable.append(str_table)
-            table = "\n\n".join(outertable)
-        return table
-
-    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
-        import pandas as pd
-
-        if isinstance(self.issues, dict):
-            _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            data = _create_pandas_dataframe(classwise)
-            df = pd.DataFrame(data)
-        else:
-            df_list = []
-            for i, d in enumerate(self.issues):
-                _, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                data = _create_pandas_dataframe(classwise)
-                single_df = pd.DataFrame(data)
-                single_df["Dataset"] = i
-                df_list.append(single_df)
-            df = pd.concat(df_list)
-        return df
+from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats._imagestats import imagestats
+from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images
 
 
 def _get_outlier_mask(
@@ -227,7 +104,7 @@ class Outliers:
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
     ):
-        self.stats: DatasetStatsOutput
+        self.stats: ImageStatsOutput
         self.use_dimension = use_dimension
         self.use_pixel = use_pixel
         self.use_visual = use_visual
@@ -248,23 +125,23 @@ class Outliers:
         return dict(sorted(flagged_images.items()))
 
     @overload
-    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+    def from_stats(self, stats: OutlierStatsOutput | ImageStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
 
     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
 
     @set_metadata(state=["outlier_method", "outlier_threshold"])
     def from_stats(
-        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        self, stats: OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
         Returns indices of Outliers with the issues identified for each.
 
         Parameters
         ----------
-        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        stats : OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
             The output(s) from a dimensionstats, pixelstats, or visualstats metric
-            analysis or an aggregate DatasetStatsOutput
+            analysis or an aggregate ImageStatsOutput
 
         Returns
         -------
@@ -291,11 +168,7 @@ class Outliers:
         >>> results.issues[1]
         {}
         """  # noqa: E501
-        if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
-            return OutliersOutput(outliers)
-
-        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+        if isinstance(stats, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
             return OutliersOutput(self._get_outliers(stats.dict()))
 
         if not isinstance(stats, Sequence):
@@ -306,7 +179,7 @@ class Outliers:
         stats_map: dict[type, list[int]] = {}
         for i, stats_output in enumerate(stats):
             if not isinstance(
-                stats_output, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+                stats_output, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
             ):
                 raise TypeError(
                     "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
@@ -323,22 +196,15 @@ class Outliers:
 
         return OutliersOutput(output_list)
 
-    @overload
-    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]: ...
-    @overload
-    def evaluate(self, data: Dataset[tuple[ArrayLike, Any, dict[str, Any]]]) -> OutliersOutput[IndexIssueMap]: ...
-
     @set_metadata(state=["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
-    def evaluate(
-        self, data: Iterable[ArrayLike] | Dataset[tuple[ArrayLike, Any, dict[str, Any]]]
-    ) -> OutliersOutput[IndexIssueMap]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each
 
         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (C, H, W)
-            A dataset of images in an ArrayLike format
+        data : Iterable[Array], shape - (C, H, W)
+            A dataset of images in an Array format
 
         Returns
         -------
@@ -355,9 +221,9 @@ class Outliers:
         >>> list(results.issues)
         [10, 12]
         >>> results.issues[10]
-        {'
+        {'contrast': 1.25, 'zeros': 0.05493, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}
         """
-        images = (
-        self.stats = 
+        images = Images(data) if isinstance(data, Dataset) else data
+        self.stats = imagestats(images)
         outliers = self._get_outliers(self.stats.dict())
         return OutliersOutput(outliers)
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/ood/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/__init__.py
@@ -5,4 +5,4 @@ Out-of-distribution (OOD) detectors identify data that is different from the dat
 __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]
 
 from dataeval.detectors.ood.ae import OOD_AE
-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
+from dataeval.outputs._ood import OODOutput, OODScoreOutput
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/ood/ae.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/ae.py
@@ -19,7 +19,7 @@ import torch
 from numpy.typing import NDArray
 
 from dataeval.detectors.ood.base import OODBase
-from dataeval.detectors.ood.output import OODScoreOutput
+from dataeval.outputs import OODScoreOutput
 from dataeval.typing import ArrayLike
 from dataeval.utils.torch._internal import predict_batch
 
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/ood/mixin.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/mixin.py
@@ -1,7 +1,5 @@
 from __future__ import annotations
 
-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
-
 __all__ = []
 
 from abc import ABC, abstractmethod
@@ -10,7 +8,8 @@ from typing import Callable, Generic, Literal, TypeVar
 import numpy as np
 from numpy.typing import NDArray
 
-from dataeval._output import set_metadata
+from dataeval.outputs import OODOutput, OODScoreOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
 from dataeval.utils._array import as_numpy, to_numpy
 
```
```diff
--- dataeval-0.81.0/src/dataeval/detectors/ood/vae.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/vae.py
@@ -18,7 +18,7 @@ import numpy as np
 import torch
 
 from dataeval.detectors.ood.base import OODBase
-from dataeval.detectors.ood.output import OODScoreOutput
+from dataeval.outputs import OODScoreOutput
 from dataeval.typing import ArrayLike
 from dataeval.utils._array import as_numpy
 from dataeval.utils.torch._internal import predict_batch
```
```diff
--- dataeval-0.81.0/src/dataeval/metadata/__init__.py
+++ dataeval-0.82.0/src/dataeval/metadata/__init__.py
@@ -1,5 +1,6 @@
 """Explanatory functions using metadata and additional features such as ood or drift"""
 
-__all__ = ["most_deviated_factors"]
+__all__ = ["most_deviated_factors", "metadata_distance"]
 
+from dataeval.metadata._distance import metadata_distance
 from dataeval.metadata._ood import most_deviated_factors
```