dataeval 0.81.0.tar.gz → 0.82.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113)
  1. {dataeval-0.81.0 → dataeval-0.82.0}/PKG-INFO +2 -2
  2. {dataeval-0.81.0 → dataeval-0.82.0}/README.md +1 -1
  3. {dataeval-0.81.0 → dataeval-0.82.0}/pyproject.toml +1 -1
  4. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/__init__.py +1 -1
  5. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/__init__.py +2 -2
  6. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_base.py +8 -64
  7. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_mmd.py +3 -29
  8. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_uncertainty.py +2 -1
  9. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/updates.py +20 -3
  10. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/__init__.py +3 -2
  11. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/duplicates.py +11 -43
  12. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/linters/outliers.py +22 -156
  13. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/__init__.py +1 -1
  14. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/ae.py +1 -1
  15. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/mixin.py +2 -3
  16. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/vae.py +1 -1
  17. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metadata/__init__.py +2 -1
  18. dataeval-0.82.0/src/dataeval/metadata/_distance.py +167 -0
  19. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metadata/_ood.py +25 -46
  20. dataeval-0.82.0/src/dataeval/metadata/_utils.py +44 -0
  21. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/__init__.py +5 -4
  22. dataeval-0.82.0/src/dataeval/metrics/bias/_balance.py +168 -0
  23. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_coverage.py +4 -106
  24. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_diversity.py +9 -107
  25. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/bias/_parity.py +5 -71
  26. dataeval-0.82.0/src/dataeval/metrics/estimators/__init__.py +20 -0
  27. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_ber.py +2 -20
  28. dataeval-0.82.0/src/dataeval/metrics/estimators/_clusterer.py +44 -0
  29. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_divergence.py +2 -19
  30. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/estimators/_uap.py +2 -16
  31. dataeval-0.82.0/src/dataeval/metrics/stats/__init__.py +38 -0
  32. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_base.py +38 -125
  33. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_boxratiostats.py +12 -12
  34. dataeval-0.82.0/src/dataeval/metrics/stats/_dimensionstats.py +75 -0
  35. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_hashstats.py +19 -35
  36. dataeval-0.82.0/src/dataeval/metrics/stats/_imagestats.py +94 -0
  37. dataeval-0.82.0/src/dataeval/metrics/stats/_labelstats.py +131 -0
  38. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_pixelstats.py +19 -51
  39. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/stats/_visualstats.py +19 -51
  40. dataeval-0.82.0/src/dataeval/outputs/__init__.py +53 -0
  41. dataeval-0.81.0/src/dataeval/_output.py → dataeval-0.82.0/src/dataeval/outputs/_base.py +53 -37
  42. dataeval-0.82.0/src/dataeval/outputs/_bias.py +381 -0
  43. dataeval-0.82.0/src/dataeval/outputs/_drift.py +83 -0
  44. dataeval-0.81.0/src/dataeval/metrics/estimators/_clusterer.py → dataeval-0.82.0/src/dataeval/outputs/_estimators.py +42 -32
  45. dataeval-0.82.0/src/dataeval/outputs/_linters.py +184 -0
  46. dataeval-0.81.0/src/dataeval/detectors/ood/output.py → dataeval-0.82.0/src/dataeval/outputs/_ood.py +22 -22
  47. dataeval-0.82.0/src/dataeval/outputs/_stats.py +387 -0
  48. dataeval-0.82.0/src/dataeval/outputs/_utils.py +44 -0
  49. dataeval-0.81.0/src/dataeval/workflows/sufficiency.py → dataeval-0.82.0/src/dataeval/outputs/_workflows.py +206 -415
  50. dataeval-0.82.0/src/dataeval/typing.py +234 -0
  51. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_method.py +1 -5
  52. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_plot.py +2 -2
  53. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/__init__.py +5 -1
  54. dataeval-0.82.0/src/dataeval/utils/data/_dataset.py +217 -0
  55. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_embeddings.py +4 -5
  56. dataeval-0.82.0/src/dataeval/utils/data/_images.py +68 -0
  57. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_metadata.py +15 -7
  58. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_selection.py +22 -15
  59. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_split.py +2 -27
  60. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/_targets.py +14 -2
  61. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_base.py +5 -5
  62. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_cifar10.py +1 -1
  63. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_milco.py +1 -1
  64. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_mnist.py +1 -1
  65. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_ships.py +1 -1
  66. {dataeval-0.81.0/src/dataeval/utils/data → dataeval-0.82.0/src/dataeval/utils/data/datasets}/_types.py +10 -16
  67. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_voc.py +1 -1
  68. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_classfilter.py +4 -7
  69. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_indices.py +2 -2
  70. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_limit.py +2 -2
  71. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_reverse.py +2 -2
  72. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/_shuffle.py +2 -2
  73. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/workflows/__init__.py +2 -1
  74. dataeval-0.82.0/src/dataeval/workflows/sufficiency.py +237 -0
  75. dataeval-0.81.0/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -129
  76. dataeval-0.81.0/src/dataeval/metrics/bias/_balance.py +0 -304
  77. dataeval-0.81.0/src/dataeval/metrics/estimators/__init__.py +0 -19
  78. dataeval-0.81.0/src/dataeval/metrics/stats/__init__.py +0 -35
  79. dataeval-0.81.0/src/dataeval/metrics/stats/_datasetstats.py +0 -198
  80. dataeval-0.81.0/src/dataeval/metrics/stats/_dimensionstats.py +0 -116
  81. dataeval-0.81.0/src/dataeval/metrics/stats/_labelstats.py +0 -210
  82. dataeval-0.81.0/src/dataeval/typing.py +0 -54
  83. dataeval-0.81.0/src/dataeval/utils/data/_images.py +0 -65
  84. {dataeval-0.81.0 → dataeval-0.82.0}/LICENSE.txt +0 -0
  85. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/_log.py +0 -0
  86. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/config.py +0 -0
  87. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/__init__.py +0 -0
  88. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_cvm.py +0 -0
  89. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_ks.py +0 -0
  90. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/drift/_torch.py +0 -0
  91. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/base.py +0 -0
  92. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
  93. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/metrics/__init__.py +0 -0
  94. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/py.typed +0 -0
  95. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/__init__.py +0 -0
  96. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_array.py +0 -0
  97. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_bin.py +0 -0
  98. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_clusterer.py +0 -0
  99. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_fast_mst.py +0 -0
  100. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_image.py +0 -0
  101. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/_mst.py +0 -0
  102. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/collate.py +0 -0
  103. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/__init__.py +0 -0
  104. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_fileio.py +0 -0
  105. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/datasets/_mixin.py +0 -0
  106. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/data/selections/__init__.py +0 -0
  107. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/metadata.py +0 -0
  108. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/__init__.py +0 -0
  109. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_blocks.py +0 -0
  110. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_gmm.py +0 -0
  111. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/_internal.py +0 -0
  112. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/models.py +0 -0
  113. {dataeval-0.81.0 → dataeval-0.82.0}/src/dataeval/utils/torch/trainer.py +0 -0
--- dataeval-0.81.0/PKG-INFO
+++ dataeval-0.82.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.81.0
+Version: 0.82.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -74,7 +74,7 @@ DataEval is easy to install, supports a wide range of Python versions, and is
 compatible with many of the most popular packages in the scientific and T&E
 communities.

-DataEval also has native interopability between JATIC's suite of tools when
+DataEval also has native interoperability between JATIC's suite of tools when
 using MAITE-compliant datasets and models.
 <!-- end JATIC interop -->


--- dataeval-0.81.0/README.md
+++ dataeval-0.82.0/README.md
@@ -32,7 +32,7 @@ DataEval is easy to install, supports a wide range of Python versions, and is
 compatible with many of the most popular packages in the scientific and T&E
 communities.

-DataEval also has native interopability between JATIC's suite of tools when
+DataEval also has native interoperability between JATIC's suite of tools when
 using MAITE-compliant datasets and models.
 <!-- end JATIC interop -->


--- dataeval-0.81.0/pyproject.toml
+++ dataeval-0.82.0/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.81.0" # dynamic
+version = "0.82.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"

--- dataeval-0.81.0/src/dataeval/__init__.py
+++ dataeval-0.82.0/src/dataeval/__init__.py
@@ -8,7 +8,7 @@ shifts that impact performance of deployed models.
 from __future__ import annotations

 __all__ = ["config", "detectors", "log", "metrics", "typing", "utils", "workflows"]
-__version__ = "0.81.0"
+__version__ = "0.82.0"

 import logging


--- dataeval-0.81.0/src/dataeval/detectors/drift/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/__init__.py
@@ -14,9 +14,9 @@ __all__ = [
 ]

 from dataeval.detectors.drift import updates
-from dataeval.detectors.drift._base import DriftOutput
 from dataeval.detectors.drift._cvm import DriftCVM
 from dataeval.detectors.drift._ks import DriftKS
-from dataeval.detectors.drift._mmd import DriftMMD, DriftMMDOutput
+from dataeval.detectors.drift._mmd import DriftMMD
 from dataeval.detectors.drift._torch import preprocess_drift
 from dataeval.detectors.drift._uncertainty import DriftUncertainty
+from dataeval.outputs._drift import DriftMMDOutput, DriftOutput
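
Note: the drift output classes now live in the new dataeval.outputs package, while dataeval.detectors.drift keeps re-exporting them, so 0.81.0-style imports continue to resolve. A minimal sketch, assuming dataeval 0.82.0 is installed:

    # Both import paths point at the same class after the move to dataeval.outputs.
    from dataeval.detectors.drift import DriftOutput as reexported
    from dataeval.outputs import DriftOutput

    assert reexported is DriftOutput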

--- dataeval-0.81.0/src/dataeval/detectors/drift/_base.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_base.py
@@ -11,84 +11,28 @@ from __future__ import annotations
 __all__ = []

 import math
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
+from abc import abstractmethod
 from functools import wraps
-from typing import Any, Callable, Literal, TypeVar
+from typing import Any, Callable, Literal, Protocol, TypeVar, runtime_checkable

 import numpy as np
 from numpy.typing import NDArray

-from dataeval._output import Output, set_metadata
+from dataeval.outputs import DriftOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import Array, ArrayLike
 from dataeval.utils._array import as_numpy, to_numpy

 R = TypeVar("R")


-class UpdateStrategy(ABC):
+@runtime_checkable
+class UpdateStrategy(Protocol):
     """
-    Updates reference dataset for drift detector
-
-    Parameters
-    ----------
-    n : int
-        Update with last n instances seen by the detector.
-    """
-
-    def __init__(self, n: int) -> None:
-        self.n = n
-
-    @abstractmethod
-    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
-        """Abstract implementation of update strategy"""
-
-
-@dataclass(frozen=True)
-class DriftBaseOutput(Output):
-    """
-    Base output class for Drift Detector classes
-    """
-
-    drifted: bool
-    threshold: float
-    p_val: float
-    distance: float
-
-
-@dataclass(frozen=True)
-class DriftOutput(DriftBaseOutput):
-    """
-    Output class for :class:`.DriftCVM`, :class:`.DriftKS`, and :class:`.DriftUncertainty` drift detectors.
-
-    Attributes
-    ----------
-    drifted : bool
-        :term:`Drift` prediction for the images
-    threshold : float
-        Threshold after multivariate correction if needed
-    p_val : float
-        Instance-level p-value
-    distance : float
-        Instance-level distance
-    feature_drift : NDArray
-        Feature-level array of images detected to have drifted
-    feature_threshold : float
-        Feature-level threshold to determine drift
-    p_vals : NDArray
-        Feature-level p-values
-    distances : NDArray
-        Feature-level distances
+    Protocol for reference dataset update strategy for drift detectors
     """

-    # drifted: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
-    feature_drift: NDArray[np.bool_]
-    feature_threshold: float
-    p_vals: NDArray[np.float32]
-    distances: NDArray[np.float32]
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]: ...


 def update_x_ref(fn: Callable[..., R]) -> Callable[..., R]:
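
Because UpdateStrategy is now a runtime-checkable Protocol rather than an ABC, any object with a matching __call__ satisfies it structurally; no inheritance from the drift package is required. A minimal sketch against dataeval 0.82.0 (the KeepAllUpdate class below is hypothetical, not part of dataeval):

    from typing import Any

    import numpy as np
    from numpy.typing import NDArray

    from dataeval.detectors.drift._base import UpdateStrategy  # private module, as used internally


    class KeepAllUpdate:
        """Hypothetical strategy: append every observed batch to the reference set."""

        def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
            return np.concatenate([x_ref, x], axis=0)


    # isinstance works via @runtime_checkable and only checks for a matching __call__.
    assert isinstance(KeepAllUpdate(), UpdateStrategy)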

--- dataeval-0.81.0/src/dataeval/detectors/drift/_mmd.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_mmd.py
@@ -10,44 +10,18 @@ from __future__ import annotations

 __all__ = []

-from dataclasses import dataclass
 from typing import Callable

 import torch

-from dataeval._output import set_metadata
 from dataeval.config import get_device
-from dataeval.detectors.drift._base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
+from dataeval.detectors.drift._base import BaseDrift, UpdateStrategy, preprocess_x, update_x_ref
 from dataeval.detectors.drift._torch import GaussianRBF, mmd2_from_kernel_matrix
+from dataeval.outputs import DriftMMDOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike


-@dataclass(frozen=True)
-class DriftMMDOutput(DriftBaseOutput):
-    """
-    Output class for :class:`.DriftMMD` :term:`drift<Drift>` detector.
-
-    Attributes
-    ----------
-    drifted : bool
-        Drift prediction for the images
-    threshold : float
-        :term:`P-Value` used for significance of the permutation test
-    p_val : float
-        P-value obtained from the permutation test
-    distance : float
-        MMD^2 between the reference and test set
-    distance_threshold : float
-        MMD^2 threshold above which drift is flagged
-    """
-
-    # drifted: bool
-    # threshold: float
-    # p_val: float
-    # distance: float
-    distance_threshold: float
-
-
 class DriftMMD(BaseDrift):
     """
     :term:`Maximum Mean Discrepancy (MMD) Drift Detection` algorithm \

--- dataeval-0.81.0/src/dataeval/detectors/drift/_uncertainty.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/_uncertainty.py
@@ -19,9 +19,10 @@ from scipy.special import softmax
 from scipy.stats import entropy

 from dataeval.config import get_device
-from dataeval.detectors.drift._base import DriftOutput, UpdateStrategy
+from dataeval.detectors.drift._base import UpdateStrategy
 from dataeval.detectors.drift._ks import DriftKS
 from dataeval.detectors.drift._torch import preprocess_drift
+from dataeval.outputs import DriftOutput
 from dataeval.typing import ArrayLike



--- dataeval-0.81.0/src/dataeval/detectors/drift/updates.py
+++ dataeval-0.82.0/src/dataeval/detectors/drift/updates.py
@@ -7,15 +7,32 @@ from __future__ import annotations

 __all__ = ["LastSeenUpdate", "ReservoirSamplingUpdate"]

+from abc import ABC, abstractmethod
 from typing import Any

 import numpy as np
 from numpy.typing import NDArray

-from dataeval.detectors.drift._base import UpdateStrategy
+
+class BaseUpdateStrategy(ABC):
+    """
+    Updates reference dataset for drift detector
+
+    Parameters
+    ----------
+    n : int
+        Update with last n instances seen by the detector.
+    """
+
+    def __init__(self, n: int) -> None:
+        self.n = n
+
+    @abstractmethod
+    def __call__(self, x_ref: NDArray[Any], x: NDArray[Any], count: int) -> NDArray[Any]:
+        """Abstract implementation of update strategy"""


-class LastSeenUpdate(UpdateStrategy):
+class LastSeenUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using last seen method.

@@ -30,7 +47,7 @@ class LastSeenUpdate(UpdateStrategy):
         return x_updated[-self.n :]


-class ReservoirSamplingUpdate(UpdateStrategy):
+class ReservoirSamplingUpdate(BaseUpdateStrategy):
     """
     Updates reference dataset for :term:`drift<Drift>` detector using reservoir sampling method.

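
The concrete strategies now derive from the module-local BaseUpdateStrategy instead of the protocol in _base. A usage sketch, assuming LastSeenUpdate appends the incoming batch and keeps the trailing n rows, as the `x_updated[-self.n :]` line above suggests:

    import numpy as np

    from dataeval.detectors.drift.updates import LastSeenUpdate

    x_ref = np.zeros((100, 3))  # current reference set
    x_new = np.ones((30, 3))    # newly observed batch

    keep_last = LastSeenUpdate(n=64)
    # count tracks total instances seen; it matters most for ReservoirSamplingUpdate.
    x_updated = keep_last(x_ref, x_new, count=130)
    assert x_updated.shape == (64, 3)  # only the most recent 64 instances are kept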

--- dataeval-0.81.0/src/dataeval/detectors/linters/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/__init__.py
@@ -9,5 +9,6 @@ __all__ = [
     "OutliersOutput",
 ]

-from dataeval.detectors.linters.duplicates import Duplicates, DuplicatesOutput
-from dataeval.detectors.linters.outliers import Outliers, OutliersOutput
+from dataeval.detectors.linters.duplicates import Duplicates
+from dataeval.detectors.linters.outliers import Outliers
+from dataeval.outputs._linters import DuplicatesOutput, OutliersOutput

--- dataeval-0.81.0/src/dataeval/detectors/linters/duplicates.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/duplicates.py
@@ -2,40 +2,15 @@ from __future__ import annotations

 __all__ = []

-from dataclasses import dataclass
-from typing import Any, Generic, Iterable, Sequence, TypeVar, overload
+from typing import Any, Sequence, overload

-from torch.utils.data import Dataset
-
-from dataeval._output import Output, set_metadata
+from dataeval.metrics.stats import hashstats
 from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
-from dataeval.typing import ArrayLike
-
-DuplicateGroup = list[int]
-DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
-TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
-
-
-@dataclass(frozen=True)
-class DuplicatesOutput(Generic[TIndexCollection], Output):
-    """
-    Output class for :class:`.Duplicates` lint detector.
-
-    Attributes
-    ----------
-    exact : list[list[int] | dict[int, list[int]]]
-        Indices of images that are exact matches
-    near: list[list[int] | dict[int, list[int]]]
-        Indices of images that are near matches
-
-        - For a single dataset, indices are returned as a list of index groups.
-        - For multiple datasets, indices are returned as dictionaries where the key is the
-          index of the dataset, and the value is the list index groups from that dataset.
-    """
-
-    exact: list[TIndexCollection]
-    near: list[TIndexCollection]
+from dataeval.outputs import DuplicatesOutput, HashStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import DatasetDuplicateGroupMap, DuplicateGroup
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images


 class Duplicates:
@@ -134,22 +109,15 @@ class Duplicates:

         return DuplicatesOutput(**duplicates)

-    @overload
-    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput[DuplicateGroup]: ...
-    @overload
-    def evaluate(self, data: Dataset[tuple[ArrayLike, Any, dict[str, Any]]]) -> DuplicatesOutput[DuplicateGroup]: ...
-
     @set_metadata(state=["only_exact"])
-    def evaluate(
-        self, data: Iterable[ArrayLike] | Dataset[tuple[ArrayLike, Any, dict[str, Any]]]
-    ) -> DuplicatesOutput[DuplicateGroup]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> DuplicatesOutput[DuplicateGroup]:
         """
         Returns duplicate image indices for both exact matches and near matches

         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
-            A dataset of images in an ArrayLike format or the output(s) from a hashstats analysis
+        data : Iterable[Array], shape - (N, C, H, W) | Dataset[tuple[Array, Any, Any]]
+            A dataset of images in an Array format or the output(s) from a hashstats analysis

         Returns
         -------
@@ -166,7 +134,7 @@ class Duplicates:
         >>> all_dupes.evaluate(duplicate_images)
         DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
         """  # noqa: E501
-        images = (d[0] for d in data) if isinstance(data, Dataset) else data
+        images = Images(data) if isinstance(data, Dataset) else data
         self.stats = hashstats(images)
         duplicates = self._get_duplicates(self.stats.dict())
         return DuplicatesOutput(**duplicates)
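
Duplicates.evaluate now accepts a Dataset of images (or image/target/metadata triples) and wraps it in the new Images view before hashing. A minimal usage sketch, assuming an (N, C, H, W) array is accepted directly, as in the docstring example above:

    import numpy as np

    from dataeval.detectors.linters import Duplicates

    rng = np.random.default_rng(0)
    images = rng.integers(0, 256, size=(8, 3, 16, 16), dtype=np.uint8)
    images[5] = images[2]  # plant an exact duplicate

    result = Duplicates().evaluate(images)
    print(result.exact)  # groups of indices with identical hashes, e.g. [[2, 5]]
    print(result.near)   # groups of near matches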

--- dataeval-0.81.0/src/dataeval/detectors/linters/outliers.py
+++ dataeval-0.82.0/src/dataeval/detectors/linters/outliers.py
@@ -2,142 +2,19 @@ from __future__ import annotations

 __all__ = []

-import contextlib
-from dataclasses import dataclass
-from typing import Any, Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
+from typing import Any, Literal, Sequence, overload

 import numpy as np
 from numpy.typing import NDArray
-from torch.utils.data import Dataset
-
-from dataeval._output import Output, set_metadata
-from dataeval.metrics.stats._base import BOX_COUNT, SOURCE_INDEX, combine_stats, get_dataset_step_from_idx
-from dataeval.metrics.stats._datasetstats import DatasetStatsOutput, datasetstats
-from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput
-from dataeval.metrics.stats._labelstats import LabelStatsOutput
-from dataeval.metrics.stats._pixelstats import PixelStatsOutput
-from dataeval.metrics.stats._visualstats import VisualStatsOutput
-from dataeval.typing import ArrayLike
-
-with contextlib.suppress(ImportError):
-    import pandas as pd
-
-
-IndexIssueMap = dict[int, dict[str, float]]
-OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
-TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
-
-
-def _reorganize_by_class_and_metric(result, lstats):
-    """Flip result from grouping by image to grouping by class and metric"""
-    metrics = {}
-    class_wise = {label: {} for label in lstats.image_indices_per_label}
-
-    # Group metrics and calculate class-wise counts
-    for img, group in result.items():
-        for extreme in group:
-            metrics.setdefault(extreme, []).append(img)
-            for label, images in lstats.image_indices_per_label.items():
-                if img in images:
-                    class_wise[label][extreme] = class_wise[label].get(extreme, 0) + 1
-
-    return metrics, class_wise
-
-
-def _create_table(metrics, class_wise):
-    """Create table for displaying the results"""
-    max_class_length = max(len(str(label)) for label in class_wise) + 2
-    max_total = max(len(metrics[group]) for group in metrics) + 2
-
-    table_header = " | ".join(
-        [f"{'Class':>{max_class_length}}"]
-        + [f"{group:^{max(5, len(str(group))) + 2}}" for group in sorted(metrics.keys())]
-        + [f"{'Total':<{max_total}}"]
-    )
-    table_rows = []
-
-    for class_cat, results in class_wise.items():
-        table_value = [f"{class_cat:>{max_class_length}}"]
-        total = 0
-        for group in sorted(metrics.keys()):
-            count = results.get(group, 0)
-            table_value.append(f"{count:^{max(5, len(str(group))) + 2}}")
-            total += count
-        table_value.append(f"{total:^{max_total}}")
-        table_rows.append(" | ".join(table_value))
-
-    table = [table_header] + table_rows
-    return table
-
-
-def _create_pandas_dataframe(class_wise):
-    """Create data for pandas dataframe"""
-    data = []
-    for label, metrics_dict in class_wise.items():
-        row = {"Class": label}
-        total = sum(metrics_dict.values())
-        row.update(metrics_dict)  # Add metric counts
-        row["Total"] = total
-        data.append(row)
-    return data
-
-
-@dataclass(frozen=True)
-class OutliersOutput(Generic[TIndexIssueMap], Output):
-    """
-    Output class for :class:`.Outliers` lint detector.
-
-    Attributes
-    ----------
-    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
-        Indices of image Outliers with their associated issue type and calculated values.
-
-        - For a single dataset, a dictionary containing the indices of outliers and
-          a dictionary showing the issues and calculated values for the given index.
-        - For multiple stats outputs, a list of dictionaries containing the indices of
-          outliers and their associated issues and calculated values.
-    """

-    issues: TIndexIssueMap
-
-    def __len__(self) -> int:
-        if isinstance(self.issues, dict):
-            return len(self.issues)
-        else:
-            return sum(len(d) for d in self.issues)
-
-    def to_table(self, labelstats: LabelStatsOutput) -> str:
-        if isinstance(self.issues, dict):
-            metrics, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            listed_table = _create_table(metrics, classwise)
-            table = "\n".join(listed_table)
-        else:
-            outertable = []
-            for d in self.issues:
-                metrics, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                listed_table = _create_table(metrics, classwise)
-                str_table = "\n".join(listed_table)
-                outertable.append(str_table)
-            table = "\n\n".join(outertable)
-        return table
-
-    def to_dataframe(self, labelstats: LabelStatsOutput) -> pd.DataFrame:
-        import pandas as pd
-
-        if isinstance(self.issues, dict):
-            _, classwise = _reorganize_by_class_and_metric(self.issues, labelstats)
-            data = _create_pandas_dataframe(classwise)
-            df = pd.DataFrame(data)
-        else:
-            df_list = []
-            for i, d in enumerate(self.issues):
-                _, classwise = _reorganize_by_class_and_metric(d, labelstats)
-                data = _create_pandas_dataframe(classwise)
-                single_df = pd.DataFrame(data)
-                single_df["Dataset"] = i
-                df_list.append(single_df)
-            df = pd.concat(df_list)
-        return df
+from dataeval.metrics.stats._base import combine_stats, get_dataset_step_from_idx
+from dataeval.metrics.stats._imagestats import imagestats
+from dataeval.outputs import DimensionStatsOutput, ImageStatsOutput, OutliersOutput, PixelStatsOutput, VisualStatsOutput
+from dataeval.outputs._base import set_metadata
+from dataeval.outputs._linters import IndexIssueMap, OutlierStatsOutput
+from dataeval.outputs._stats import BOX_COUNT, SOURCE_INDEX
+from dataeval.typing import Array, Dataset
+from dataeval.utils.data._images import Images


 def _get_outlier_mask(
@@ -227,7 +104,7 @@ class Outliers:
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
         outlier_threshold: float | None = None,
     ):
-        self.stats: DatasetStatsOutput
+        self.stats: ImageStatsOutput
         self.use_dimension = use_dimension
         self.use_pixel = use_pixel
         self.use_visual = use_visual
@@ -248,23 +125,23 @@ class Outliers:
         return dict(sorted(flagged_images.items()))

     @overload
-    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+    def from_stats(self, stats: OutlierStatsOutput | ImageStatsOutput) -> OutliersOutput[IndexIssueMap]: ...

     @overload
     def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...

     @set_metadata(state=["outlier_method", "outlier_threshold"])
     def from_stats(
-        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        self, stats: OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
     ) -> OutliersOutput[IndexIssueMap] | OutliersOutput[list[IndexIssueMap]]:
         """
         Returns indices of Outliers with the issues identified for each.

         Parameters
         ----------
-        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+        stats : OutlierStatsOutput | ImageStatsOutput | Sequence[OutlierStatsOutput]
             The output(s) from a dimensionstats, pixelstats, or visualstats metric
-            analysis or an aggregate DatasetStatsOutput
+            analysis or an aggregate ImageStatsOutput

         Returns
         -------
@@ -291,11 +168,7 @@ class Outliers:
         >>> results.issues[1]
         {}
         """  # noqa: E501
-        if isinstance(stats, DatasetStatsOutput):
-            outliers = self._get_outliers({k: v for o in stats._outputs() for k, v in o.dict().items()})
-            return OutliersOutput(outliers)
-
-        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+        if isinstance(stats, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
             return OutliersOutput(self._get_outliers(stats.dict()))

         if not isinstance(stats, Sequence):
@@ -306,7 +179,7 @@ class Outliers:
         stats_map: dict[type, list[int]] = {}
         for i, stats_output in enumerate(stats):
             if not isinstance(
-                stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+                stats_output, (ImageStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
             ):
                 raise TypeError(
                     "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
@@ -323,22 +196,15 @@ class Outliers:

         return OutliersOutput(output_list)

-    @overload
-    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]: ...
-    @overload
-    def evaluate(self, data: Dataset[tuple[ArrayLike, Any, dict[str, Any]]]) -> OutliersOutput[IndexIssueMap]: ...
-
     @set_metadata(state=["use_dimension", "use_pixel", "use_visual", "outlier_method", "outlier_threshold"])
-    def evaluate(
-        self, data: Iterable[ArrayLike] | Dataset[tuple[ArrayLike, Any, dict[str, Any]]]
-    ) -> OutliersOutput[IndexIssueMap]:
+    def evaluate(self, data: Dataset[Array] | Dataset[tuple[Array, Any, Any]]) -> OutliersOutput[IndexIssueMap]:
         """
         Returns indices of Outliers with the issues identified for each

         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (C, H, W)
-            A dataset of images in an ArrayLike format
+        data : Iterable[Array], shape - (C, H, W)
+            A dataset of images in an Array format

         Returns
         -------
@@ -355,9 +221,9 @@ class Outliers:
         >>> list(results.issues)
         [10, 12]
         >>> results.issues[10]
-        {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
+        {'contrast': 1.25, 'zeros': 0.05493, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}
         """
-        images = (d[0] for d in data) if isinstance(data, Dataset) else data
-        self.stats = datasetstats(images=images)
+        images = Images(data) if isinstance(data, Dataset) else data
+        self.stats = imagestats(images)
         outliers = self._get_outliers(self.stats.dict())
         return OutliersOutput(outliers)
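
Outliers now computes the consolidated imagestats metric internally (datasetstats was removed in this release), which also explains the reordered stat keys in the doctest above. A minimal usage sketch, assuming dataeval 0.82.0 and that an (N, C, H, W) array is accepted as the dataset:

    import numpy as np

    from dataeval.detectors.linters import Outliers

    rng = np.random.default_rng(0)
    images = rng.integers(0, 256, size=(16, 3, 32, 32), dtype=np.uint8)
    images[10] = 0  # an all-black image should be flagged on pixel/visual stats

    detector = Outliers(outlier_method="modzscore")  # "zscore" and "iqr" also supported
    result = detector.evaluate(images)
    print(result.issues)  # index -> {stat name: flagged value}, e.g. {10: {'zeros': ...}}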

--- dataeval-0.81.0/src/dataeval/detectors/ood/__init__.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/__init__.py
@@ -5,4 +5,4 @@ Out-of-distribution (OOD) detectors identify data that is different from the dat
 __all__ = ["OODOutput", "OODScoreOutput", "OOD_AE"]

 from dataeval.detectors.ood.ae import OOD_AE
-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
+from dataeval.outputs._ood import OODOutput, OODScoreOutput
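
As with the drift and linter outputs, OODOutput and OODScoreOutput moved to dataeval.outputs but remain re-exported here, so both import paths below refer to the same classes. A quick check, assuming dataeval 0.82.0:

    from dataeval.detectors.ood import OODOutput as reexported
    from dataeval.outputs import OODOutput

    assert reexported is OODOutput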

--- dataeval-0.81.0/src/dataeval/detectors/ood/ae.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/ae.py
@@ -19,7 +19,7 @@ import torch
 from numpy.typing import NDArray

 from dataeval.detectors.ood.base import OODBase
-from dataeval.detectors.ood.output import OODScoreOutput
+from dataeval.outputs import OODScoreOutput
 from dataeval.typing import ArrayLike
 from dataeval.utils.torch._internal import predict_batch


--- dataeval-0.81.0/src/dataeval/detectors/ood/mixin.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/mixin.py
@@ -1,7 +1,5 @@
 from __future__ import annotations

-from dataeval.detectors.ood.output import OODOutput, OODScoreOutput
-
 __all__ = []

 from abc import ABC, abstractmethod
@@ -10,7 +8,8 @@ from typing import Callable, Generic, Literal, TypeVar
 import numpy as np
 from numpy.typing import NDArray

-from dataeval._output import set_metadata
+from dataeval.outputs import OODOutput, OODScoreOutput
+from dataeval.outputs._base import set_metadata
 from dataeval.typing import ArrayLike
 from dataeval.utils._array import as_numpy, to_numpy


--- dataeval-0.81.0/src/dataeval/detectors/ood/vae.py
+++ dataeval-0.82.0/src/dataeval/detectors/ood/vae.py
@@ -18,7 +18,7 @@ import numpy as np
 import torch

 from dataeval.detectors.ood.base import OODBase
-from dataeval.detectors.ood.output import OODScoreOutput
+from dataeval.outputs import OODScoreOutput
 from dataeval.typing import ArrayLike
 from dataeval.utils._array import as_numpy
 from dataeval.utils.torch._internal import predict_batch

--- dataeval-0.81.0/src/dataeval/metadata/__init__.py
+++ dataeval-0.82.0/src/dataeval/metadata/__init__.py
@@ -1,5 +1,6 @@
 """Explanatory functions using metadata and additional features such as ood or drift"""

-__all__ = ["most_deviated_factors"]
+__all__ = ["most_deviated_factors", "metadata_distance"]

+from dataeval.metadata._distance import metadata_distance
 from dataeval.metadata._ood import most_deviated_factors
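
The metadata subpackage gains metadata_distance, implemented in the new _distance.py (167 lines per the file list above). Its call signature is not shown in this diff, so only the public import surface is illustrated here:

    # Both names are exported via __all__ in dataeval.metadata as of 0.82.0.
    from dataeval.metadata import metadata_distance, most_deviated_factors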