dataeval 0.69.4__tar.gz → 0.70.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {dataeval-0.69.4 → dataeval-0.70.0}/PKG-INFO +3 -3
  2. {dataeval-0.69.4 → dataeval-0.70.0}/pyproject.toml +5 -3
  3. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/__init__.py +3 -3
  4. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/base.py +5 -6
  5. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/mmd.py +3 -3
  6. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/duplicates.py +62 -45
  7. dataeval-0.70.0/src/dataeval/_internal/detectors/merged_stats.py +47 -0
  8. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/ae.py +3 -3
  9. dataeval-0.70.0/src/dataeval/_internal/detectors/outliers.py +269 -0
  10. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/interop.py +11 -7
  11. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/balance.py +9 -9
  12. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/ber.py +3 -3
  13. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/divergence.py +3 -3
  14. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/diversity.py +6 -6
  15. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/parity.py +24 -16
  16. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/base.py +231 -0
  17. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/boxratiostats.py +159 -0
  18. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/datasetstats.py +97 -0
  19. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/dimensionstats.py +111 -0
  20. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/hashstats.py +73 -0
  21. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/labelstats.py +125 -0
  22. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/pixelstats.py +117 -0
  23. dataeval-0.70.0/src/dataeval/_internal/metrics/stats/visualstats.py +122 -0
  24. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/uap.py +2 -2
  25. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/utils.py +28 -13
  26. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/output.py +3 -18
  27. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/workflows/sufficiency.py +123 -133
  28. dataeval-0.70.0/src/dataeval/metrics/stats/__init__.py +17 -0
  29. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/workflows/__init__.py +2 -2
  30. dataeval-0.69.4/src/dataeval/_internal/detectors/merged_stats.py +0 -78
  31. dataeval-0.69.4/src/dataeval/_internal/detectors/outliers.py +0 -197
  32. dataeval-0.69.4/src/dataeval/_internal/flags.py +0 -77
  33. dataeval-0.69.4/src/dataeval/_internal/metrics/stats.py +0 -397
  34. dataeval-0.69.4/src/dataeval/flags/__init__.py +0 -3
  35. dataeval-0.69.4/src/dataeval/metrics/stats/__init__.py +0 -6
  36. {dataeval-0.69.4 → dataeval-0.70.0}/LICENSE.txt +0 -0
  37. {dataeval-0.69.4 → dataeval-0.70.0}/README.md +0 -0
  38. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/datasets.py +0 -0
  39. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/__init__.py +0 -0
  40. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/clusterer.py +0 -0
  41. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/__init__.py +0 -0
  42. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/cvm.py +0 -0
  43. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/ks.py +0 -0
  44. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/torch.py +0 -0
  45. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/uncertainty.py +0 -0
  46. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/__init__.py +0 -0
  47. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/aegmm.py +0 -0
  48. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/base.py +0 -0
  49. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/llr.py +0 -0
  50. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/vae.py +0 -0
  51. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/vaegmm.py +0 -0
  52. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/__init__.py +0 -0
  53. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/metrics/coverage.py +0 -0
  54. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/__init__.py +0 -0
  55. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/pytorch/__init__.py +0 -0
  56. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/pytorch/autoencoder.py +0 -0
  57. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/pytorch/blocks.py +0 -0
  58. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/pytorch/utils.py +0 -0
  59. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/__init__.py +0 -0
  60. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/autoencoder.py +0 -0
  61. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/gmm.py +0 -0
  62. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/losses.py +0 -0
  63. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/pixelcnn.py +0 -0
  64. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/trainer.py +0 -0
  65. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/models/tensorflow/utils.py +0 -0
  66. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/utils.py +0 -0
  67. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/workflows/__init__.py +0 -0
  68. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/__init__.py +0 -0
  69. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/drift/__init__.py +0 -0
  70. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/drift/kernels/__init__.py +0 -0
  71. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/drift/updates/__init__.py +0 -0
  72. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/linters/__init__.py +0 -0
  73. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/detectors/ood/__init__.py +0 -0
  74. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/metrics/__init__.py +0 -0
  75. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/metrics/bias/__init__.py +0 -0
  76. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/metrics/estimators/__init__.py +0 -0
  77. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/py.typed +0 -0
  78. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/tensorflow/__init__.py +0 -0
  79. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/tensorflow/loss/__init__.py +0 -0
  80. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/tensorflow/models/__init__.py +0 -0
  81. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/tensorflow/recon/__init__.py +0 -0
  82. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/torch/__init__.py +0 -0
  83. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/torch/models/__init__.py +0 -0
  84. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/torch/trainer/__init__.py +0 -0
  85. {dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/utils/__init__.py +0 -0
{dataeval-0.69.4 → dataeval-0.70.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: dataeval
-Version: 0.69.4
+Version: 0.70.0
 Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
 Home-page: https://dataeval.ai/
 License: MIT
@@ -32,8 +32,8 @@ Requires-Dist: scipy (>=1.10)
 Requires-Dist: tensorflow (>=2.14.1,<2.16) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: tensorflow-io-gcs-filesystem (>=0.35.0,<0.37) ; extra == "tensorflow" or extra == "all"
 Requires-Dist: tensorflow_probability (>=0.22.1,<0.24) ; extra == "tensorflow" or extra == "all"
-Requires-Dist: torch (>=2.0.1,!=2.2.0) ; extra == "torch" or extra == "all"
-Requires-Dist: torchvision (>=0.16.0) ; extra == "torch" or extra == "all"
+Requires-Dist: torch (>=2.2.0) ; extra == "torch" or extra == "all"
+Requires-Dist: torchvision (>=0.17.0) ; extra == "torch" or extra == "all"
 Requires-Dist: xxhash (>=3.3)
 Project-URL: Documentation, https://dataeval.readthedocs.io/
 Project-URL: Repository, https://github.com/aria-ml/dataeval/
{dataeval-0.69.4 → dataeval-0.70.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dataeval"
-version = "0.69.4" # dynamic
+version = "0.70.0" # dynamic
 description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
 license = "MIT"
 readme = "README.md"
@@ -54,8 +54,8 @@ nvidia-cudnn-cu11 = {version = ">=8.6.0.163", optional = true}
 tensorflow = {version = ">=2.14.1, <2.16", optional = true}
 tensorflow-io-gcs-filesystem = {version = ">=0.35.0, <0.37", optional = true}
 tensorflow_probability = {version = ">=0.22.1, <0.24", optional = true}
-torch = {version = ">=2.0.1, !=2.2.0", source = "pytorch", optional = true}
-torchvision = {version = ">=0.16.0", source = "pytorch", optional = true}
+torch = {version = ">=2.2.0", source = "pytorch", optional = true}
+torchvision = {version = ">=0.17.0", source = "pytorch", optional = true}

 [tool.poetry.extras]
 tensorflow = ["tensorflow", "tensorflow-io-gcs-filesystem", "tensorflow_probability", "nvidia-cudnn-cu11"]
@@ -131,6 +131,7 @@ omit = [
   "*/_internal/models/pytorch/blocks.py",
   "*/_internal/models/pytorch/utils.py",
   "*/_internal/models/tensorflow/pixelcnn.py",
+  "*/_internal/datasets.py",
 ]
 fail_under = 90
@@ -173,6 +174,7 @@ docstring-code-line-length = "dynamic"

 [tool.codespell]
 skip = './*env*,./prototype,./output,./docs/_build,./docs/.jupyter_cache,CHANGELOG.md,poetry.lock,*.html'
+ignore-words-list = ["Hart"]

 [build-system]
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"]
{dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.69.4"
+__version__ = "0.70.0"

 from importlib.util import find_spec
@@ -7,9 +7,9 @@ _IS_TENSORFLOW_AVAILABLE = find_spec("tensorflow") is not None and find_spec("te

 del find_spec

-from . import detectors, flags, metrics  # noqa: E402
+from . import detectors, metrics  # noqa: E402

-__all__ = ["detectors", "flags", "metrics"]
+__all__ = ["detectors", "metrics"]

 if _IS_TORCH_AVAILABLE:  # pragma: no cover
     from . import torch, utils, workflows
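
The top-level `flags` module is gone in 0.70.0, and the monolithic `imagestats` metric it parameterized is split into dedicated stats functions (see the new `src/dataeval/metrics/stats/__init__.py` in the file list). A minimal migration sketch, assuming the public `dataeval.metrics.stats` package re-exports `hashstats` from the internal module shown later in this diff:

    # 0.69.4 (removed): select hash stats via ImageStat flags
    # from dataeval.flags import ImageStat
    # stats = imagestats(images, ImageStat.XXHASH | ImageStat.PCHASH)

    # 0.70.0: call the dedicated stats function directly
    from dataeval.metrics.stats import hashstats  # assumed re-export of dataeval._internal.metrics.stats.hashstats

    hashes = hashstats(images)  # computes the xxhash and pchash values per image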
{dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/base.py
@@ -16,7 +16,7 @@ from typing import Callable, Literal
 import numpy as np
 from numpy.typing import ArrayLike, NDArray

-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy, to_numpy
 from dataeval._internal.output import OutputMetadata, set_metadata
@@ -234,7 +234,7 @@ class BaseDrift:
         if correction not in ["bonferroni", "fdr"]:
             raise ValueError("`correction` must be `bonferroni` or `fdr`.")

-        self._x_ref = x_ref
+        self._x_ref = to_numpy(x_ref)
         self.x_ref_preprocessed = x_ref_preprocessed

         # Other attributes
@@ -242,7 +242,7 @@ class BaseDrift:
         self.update_x_ref = update_x_ref
         self.preprocess_fn = preprocess_fn
         self.correction = correction
-        self.n = len(self._x_ref)  # type: ignore
+        self.n = len(self._x_ref)

         # Ref counter for preprocessed x
         self._x_refcount = 0
@@ -260,9 +260,8 @@ class BaseDrift:
         if not self.x_ref_preprocessed:
             self.x_ref_preprocessed = True
             if self.preprocess_fn is not None:
-                self._x_ref = self.preprocess_fn(self._x_ref)
+                self._x_ref = as_numpy(self.preprocess_fn(self._x_ref))

-        self._x_ref = to_numpy(self._x_ref)
         return self._x_ref

     def _preprocess(self, x: ArrayLike) -> ArrayLike:
@@ -380,7 +379,7 @@ class BaseDriftUnivariate(BaseDrift):
             self._n_features = self.x_ref.reshape(self.x_ref.shape[0], -1).shape[-1]
         else:
             # infer number of features after applying preprocessing step
-            x = to_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
+            x = as_numpy(self.preprocess_fn(self._x_ref[0:1]))  # type: ignore
             self._n_features = x.reshape(x.shape[0], -1).shape[-1]

         return self._n_features
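
Net effect of the base.py changes above: `_x_ref` is converted to a NumPy array once at construction (so `len(self._x_ref)` no longer needs a `# type: ignore`), and `preprocess_fn` is applied lazily, exactly once, on first access to `x_ref`. A minimal sketch of that pattern, using `np.asarray` as a stand-in for the library's `to_numpy`/`as_numpy` helpers:

    import numpy as np

    class LazyRef:
        def __init__(self, x_ref, preprocess_fn=None):
            self._x_ref = np.asarray(x_ref)  # eager conversion in __init__, like to_numpy
            self._fn = preprocess_fn
            self._preprocessed = False

        @property
        def x_ref(self):
            if not self._preprocessed:  # preprocess only on first access
                self._preprocessed = True
                if self._fn is not None:
                    self._x_ref = np.asarray(self._fn(self._x_ref))  # like as_numpy
            return self._x_ref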
{dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/drift/mmd.py
@@ -14,7 +14,7 @@ from typing import Callable
 import torch
 from numpy.typing import ArrayLike

-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.output import set_metadata

 from .base import BaseDrift, DriftBaseOutput, UpdateStrategy, preprocess_x, update_x_ref
@@ -110,7 +110,7 @@ class DriftMMD(BaseDrift):
         self.device = get_device(device)

         # initialize kernel
-        sigma_tensor = torch.from_numpy(to_numpy(sigma)).to(self.device) if sigma is not None else None
+        sigma_tensor = torch.from_numpy(as_numpy(sigma)).to(self.device) if sigma is not None else None
         self.kernel = kernel(sigma_tensor).to(self.device) if kernel == GaussianRBF else kernel

         # compute kernel matrix for the reference data
@@ -147,7 +147,7 @@ class DriftMMD(BaseDrift):
         p-value obtained from the permutation test, MMD^2 between the reference and test set,
         and MMD^2 threshold above which drift is flagged
         """
-        x = to_numpy(x)
+        x = as_numpy(x)
         x_ref = torch.from_numpy(self.x_ref).to(self.device)
         n = x.shape[0]
         kernel_mat = self._kernel_matrix(x_ref, torch.from_numpy(x).to(self.device))
{dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/duplicates.py
@@ -1,13 +1,12 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import Generic, Iterable, Sequence, TypeVar, cast
+from typing import Generic, Iterable, Sequence, TypeVar

 from numpy.typing import ArrayLike

 from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
-from dataeval._internal.flags import ImageStat
-from dataeval._internal.metrics.stats import StatsOutput, imagestats
+from dataeval._internal.metrics.stats.hashstats import HashStatsOutput, hashstats
 from dataeval._internal.output import OutputMetadata, set_metadata

 DuplicateGroup = list[int]
@@ -53,26 +52,23 @@ class Duplicates:
     -------
     Initialize the Duplicates class:

-    >>> dups = Duplicates()
+    >>> all_dupes = Duplicates()
+    >>> exact_dupes = Duplicates(only_exact=True)
     """

     def __init__(self, only_exact: bool = False):
-        self.stats: StatsOutput
+        self.stats: HashStatsOutput
         self.only_exact = only_exact

-    def _get_duplicates(self) -> dict[str, list[list[int]]]:
-        stats_dict = self.stats.dict()
-        if "xxhash" in stats_dict:
-            exact_dict: dict[int, list] = {}
-            for i, value in enumerate(stats_dict["xxhash"]):
-                exact_dict.setdefault(value, []).append(i)
-            exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
-        else:
-            exact = []
+    def _get_duplicates(self, stats: dict) -> dict[str, list[list[int]]]:
+        exact_dict: dict[int, list] = {}
+        for i, value in enumerate(stats["xxhash"]):
+            exact_dict.setdefault(value, []).append(i)
+        exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]

-        if "pchash" in stats_dict and not self.only_exact:
+        if not self.only_exact:
             near_dict: dict[int, list] = {}
-            for i, value in enumerate(stats_dict["pchash"]):
+            for i, value in enumerate(stats["pchash"]):
                 near_dict.setdefault(value, []).append(i)
             near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
         else:
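
The grouping logic in `_get_duplicates` buckets image indices by hash value and keeps only buckets with more than one member. A worked example with made-up hash values:

    hashes = ["a1", "b2", "a1", "c3", "b2", "b2"]
    groups: dict[str, list[int]] = {}
    for i, value in enumerate(hashes):
        groups.setdefault(value, []).append(i)
    dupes = [sorted(v) for v in groups.values() if len(v) > 1]
    assert dupes == [[0, 2], [1, 4, 5]]  # indices 0/2 share "a1"; 1/4/5 share "b2"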
@@ -84,14 +80,14 @@ class Duplicates:
         }

     @set_metadata("dataeval.detectors", ["only_exact"])
-    def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> DuplicatesOutput:
+    def from_stats(self, hashes: HashStatsOutput | Sequence[HashStatsOutput]) -> DuplicatesOutput:
         """
         Returns duplicate image indices for both exact matches and near matches

         Parameters
         ----------
-        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
-            A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
+        data : HashStatsOutput | Sequence[HashStatsOutput]
+            The output(s) from a hashstats analysis

         Returns
         -------
@@ -100,39 +96,60 @@ class Duplicates:

         See Also
         --------
-        imagestats
+        hashstats

         Example
         -------
-        >>> dups.evaluate(images)
-        DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
-        """  # noqa: E501
+        >>> exact_dupes.from_stats([hashes1, hashes2])
+        DuplicatesOutput(exact=[{0: [3, 20]}, {0: [16], 1: [12]}], near=[])
+        """

-        stats, dataset_steps = combine_stats(data)
+        if isinstance(hashes, HashStatsOutput):
+            return DuplicatesOutput(**self._get_duplicates(hashes.dict()))

-        if isinstance(stats, StatsOutput):
-            if not stats.xxhash:
-                raise ValueError("StatsOutput must include xxhash information of the images.")
-            if not self.only_exact and not stats.pchash:
-                raise ValueError("StatsOutput must include pchash information of the images for near matches.")
-            self.stats = stats
-        else:
-            flags = ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH)
-            self.stats = imagestats(cast(Iterable[ArrayLike], data), flags)
+        if not isinstance(hashes, Sequence):
+            raise TypeError("Invalid stats output type; only use output from hashstats.")

-        duplicates = self._get_duplicates()
+        combined, dataset_steps = combine_stats(hashes)
+        duplicates = self._get_duplicates(combined.dict())

         # split up results from combined dataset into individual dataset buckets
-        if dataset_steps:
-            dup_list: list[list[int]]
-            for dup_type, dup_list in duplicates.items():
-                dup_list_dict = []
-                for idxs in dup_list:
-                    dup_dict = {}
-                    for idx in idxs:
-                        k, v = get_dataset_step_from_idx(idx, dataset_steps)
-                        dup_dict.setdefault(k, []).append(v)
-                    dup_list_dict.append(dup_dict)
-                duplicates[dup_type] = dup_list_dict
+        for dup_type, dup_list in duplicates.items():
+            dup_list_dict = []
+            for idxs in dup_list:
+                dup_dict = {}
+                for idx in idxs:
+                    k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                    dup_dict.setdefault(k, []).append(v)
+                dup_list_dict.append(dup_dict)
+            duplicates[dup_type] = dup_list_dict
+
+        return DuplicatesOutput(**duplicates)
+
+    @set_metadata("dataeval.detectors", ["only_exact"])
+    def evaluate(self, data: Iterable[ArrayLike]) -> DuplicatesOutput:
+        """
+        Returns duplicate image indices for both exact matches and near matches
+
+        Parameters
+        ----------
+        data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
+            A dataset of images in an ArrayLike format or the output(s) from a hashstats analysis
+
+        Returns
+        -------
+        DuplicatesOutput
+            List of groups of indices that are exact and near matches

+        See Also
+        --------
+        hashstats
+
+        Example
+        -------
+        >>> all_dupes.evaluate(images)
+        DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
+        """  # noqa: E501
+        self.stats = hashstats(data)
+        duplicates = self._get_duplicates(self.stats.dict())
         return DuplicatesOutput(**duplicates)
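
A usage sketch of the resulting split API, following the docstring examples above; `images`, `images_a`, and `images_b` stand in for iterables of image arrays:

    from dataeval._internal.detectors.duplicates import Duplicates
    from dataeval._internal.metrics.stats.hashstats import hashstats

    all_dupes = Duplicates()
    print(all_dupes.evaluate(images))  # hashes are computed internally

    exact_dupes = Duplicates(only_exact=True)
    hashes1, hashes2 = hashstats(images_a), hashstats(images_b)
    print(exact_dupes.from_stats([hashes1, hashes2]))  # results keyed per source dataset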
dataeval-0.70.0/src/dataeval/_internal/detectors/merged_stats.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import Sequence, TypeVar
+
+import numpy as np
+
+from dataeval._internal.metrics.stats.base import BaseStatsOutput
+
+TStatsOutput = TypeVar("TStatsOutput", bound=BaseStatsOutput)
+
+
+def add_stats(a: TStatsOutput, b: TStatsOutput) -> TStatsOutput:
+    if type(a) is not type(b):
+        raise TypeError(f"Types {type(a)} and {type(b)} cannot be added.")
+
+    sum_dict = deepcopy(a.dict())
+
+    for k in sum_dict:
+        if isinstance(sum_dict[k], list):
+            sum_dict[k].extend(b.dict()[k])
+        else:
+            sum_dict[k] = np.concatenate((sum_dict[k], b.dict()[k]))
+
+    return type(a)(**sum_dict)
+
+
+def combine_stats(stats: Sequence[TStatsOutput]) -> tuple[TStatsOutput, list[int]]:
+    output = None
+    dataset_steps = []
+    cur_len = 0
+    for s in stats:
+        output = s if output is None else add_stats(output, s)
+        cur_len += len(s)
+        dataset_steps.append(cur_len)
+    if output is None:
+        raise TypeError("Cannot combine empty sequence of stats.")
+    return output, dataset_steps
+
+
+def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+    last_step = 0
+    for i, step in enumerate(dataset_steps):
+        if idx < step:
+            return i, idx - last_step
+        last_step = step
+    return -1, idx
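
How the two helpers compose: `combine_stats` concatenates same-typed stats outputs and records cumulative lengths, and `get_dataset_step_from_idx` maps an index in the combined output back to a (dataset number, local index) pair. A quick trace, assuming `s1` and `s2` are same-typed stats outputs of lengths 10 and 5:

    combined, steps = combine_stats([s1, s2])  # steps == [10, 15]
    get_dataset_step_from_idx(3, steps)   # -> (0, 3): dataset 0, local index 3
    get_dataset_step_from_idx(12, steps)  # -> (1, 2): dataset 1, local index 2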
{dataeval-0.69.4 → dataeval-0.70.0}/src/dataeval/_internal/detectors/ood/ae.py
@@ -16,7 +16,7 @@ import tensorflow as tf
 from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
-from dataeval._internal.interop import to_numpy
+from dataeval._internal.interop import as_numpy
 from dataeval._internal.models.tensorflow.autoencoder import AE
 from dataeval._internal.models.tensorflow.utils import predict_batch
@@ -46,10 +46,10 @@
     ) -> None:
         if loss_fn is None:
             loss_fn = keras.losses.MeanSquaredError()
-        super().fit(to_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+        super().fit(as_numpy(x_ref), threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
-        self._validate(X := to_numpy(X))
+        self._validate(X := as_numpy(X))

         # reconstruct instances
         X_recon = predict_batch(X, self.model, batch_size=batch_size)
dataeval-0.70.0/src/dataeval/_internal/detectors/outliers.py
@@ -0,0 +1,269 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Generic, Iterable, Literal, Sequence, TypeVar, Union, overload
+
+import numpy as np
+from numpy.typing import ArrayLike, NDArray
+
+from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
+from dataeval._internal.metrics.stats.base import BOX_COUNT, SOURCE_INDEX
+from dataeval._internal.metrics.stats.datasetstats import DatasetStatsOutput, datasetstats
+from dataeval._internal.metrics.stats.dimensionstats import DimensionStatsOutput
+from dataeval._internal.metrics.stats.pixelstats import PixelStatsOutput
+from dataeval._internal.metrics.stats.visualstats import VisualStatsOutput
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+IndexIssueMap = dict[int, dict[str, float]]
+OutlierStatsOutput = Union[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+TIndexIssueMap = TypeVar("TIndexIssueMap", IndexIssueMap, list[IndexIssueMap])
+
+
+@dataclass(frozen=True)
+class OutliersOutput(Generic[TIndexIssueMap], OutputMetadata):
+    """
+    Attributes
+    ----------
+    issues : dict[int, dict[str, float]] | list[dict[int, dict[str, float]]]
+        Indices of image outliers with their associated issue type and calculated values.
+
+        - For a single dataset, a dictionary containing the indices of outliers and
+          a dictionary showing the issues and calculated values for the given index.
+        - For multiple stats outputs, a list of dictionaries containing the indices of
+          outliers and their associated issues and calculated values.
+    """
+
+    issues: TIndexIssueMap
+
+    def __len__(self):
+        if isinstance(self.issues, dict):
+            return len(self.issues)
+        else:
+            return sum(len(d) for d in self.issues)
+
+
+def _get_outlier_mask(
+    values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
+) -> NDArray:
+    if method == "zscore":
+        threshold = threshold if threshold else 3.0
+        std = np.std(values)
+        abs_diff = np.abs(values - np.mean(values))
+        return std != 0 and (abs_diff / std) > threshold
+    elif method == "modzscore":
+        threshold = threshold if threshold else 3.5
+        abs_diff = np.abs(values - np.median(values))
+        med_abs_diff = np.median(abs_diff) if np.median(abs_diff) != 0 else np.mean(abs_diff)
+        mod_z_score = 0.6745 * abs_diff / med_abs_diff
+        return mod_z_score > threshold
+    elif method == "iqr":
+        threshold = threshold if threshold else 1.5
+        qrt = np.percentile(values, q=(25, 75), method="midpoint")
+        iqr = (qrt[1] - qrt[0]) * threshold
+        return (values < (qrt[0] - iqr)) | (values > (qrt[1] + iqr))
+    else:
+        raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")
+
+
+class Outliers:
+    r"""
+    Calculates statistical outliers of a dataset using various statistical tests applied to each image
+
+    Parameters
+    ----------
+    outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
+        Statistical method used to identify outliers
+    outlier_threshold : float, optional - default None
+        Threshold value for the given ``outlier_method``, above which data is considered an outlier.
+        Uses method specific default if `None`
+
+    Attributes
+    ----------
+    stats : tuple[DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput]
+        Various stats output classes that hold the value of each metric for each image
+
+    See Also
+    --------
+    Duplicates
+
+    Notes
+    ------
+    There are 3 different statistical methods:
+
+    - zscore
+    - modzscore
+    - iqr
+
+    | The z score method is based on the difference between the data point and the mean of the data.
+      The default threshold value for `zscore` is 3.
+    | Z score = :math:`|x_i - \mu| / \sigma`
+
+    | The modified z score method is based on the difference between the data point and the median of the data.
+      The default threshold value for `modzscore` is 3.5.
+    | Modified z score = :math:`0.6745 * |x_i - x̃| / MAD`, where :math:`MAD` is the median absolute deviation
+
+    | The interquartile range method is based on the difference between the data point and
+      the difference between the 75th and 25th quartile. The default threshold value for `iqr` is 1.5.
+    | Interquartile range = :math:`threshold * (Q_3 - Q_1)`
+
+    Examples
+    --------
+    Initialize the Outliers class:
+
+    >>> outliers = Outliers()
+
+    Specifying an outlier method:
+
+    >>> outliers = Outliers(outlier_method="iqr")
+
+    Specifying an outlier method and threshold:
+
+    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=3.5)
+    """
+
+    def __init__(
+        self,
+        use_dimension: bool = True,
+        use_pixel: bool = True,
+        use_visual: bool = True,
+        outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
+        outlier_threshold: float | None = None,
+    ):
+        self.stats: DatasetStatsOutput
+        self.use_dimension = use_dimension
+        self.use_pixel = use_pixel
+        self.use_visual = use_visual
+        self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
+        self.outlier_threshold = outlier_threshold
+
+    def _get_outliers(self, stats: dict) -> dict[int, dict[str, float]]:
+        flagged_images: dict[int, dict[str, float]] = {}
+        for stat, values in stats.items():
+            if stat in (SOURCE_INDEX, BOX_COUNT):
+                continue
+            if values.ndim == 1:
+                mask = _get_outlier_mask(values.astype(np.float64), self.outlier_method, self.outlier_threshold)
+                indices = np.flatnonzero(mask)
+                for i, value in zip(indices, values[mask]):
+                    flagged_images.setdefault(i, {}).update({stat: value})
+
+        return dict(sorted(flagged_images.items()))
+
+    @overload
+    def from_stats(self, stats: OutlierStatsOutput | DatasetStatsOutput) -> OutliersOutput[IndexIssueMap]: ...
+
+    @overload
+    def from_stats(self, stats: Sequence[OutlierStatsOutput]) -> OutliersOutput[list[IndexIssueMap]]: ...
+
+    @set_metadata("dataeval.detectors", ["outlier_method", "outlier_threshold"])
+    def from_stats(
+        self, stats: OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+    ) -> OutliersOutput:
+        """
+        Returns indices of outliers with the issues identified for each
+
+        Parameters
+        ----------
+        stats : OutlierStatsOutput | DatasetStatsOutput | Sequence[OutlierStatsOutput]
+            The output(s) from a dimensionstats, pixelstats, or visualstats metric
+            analysis or an aggregate DatasetStatsOutput
+
+        Returns
+        -------
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
+            the issues and calculated values for the given index.
+
+        See Also
+        --------
+        dimensionstats
+        pixelstats
+        visualstats
+
+        Example
+        -------
+        Evaluate the dataset:
+
+        >>> results = outliers.from_stats([stats1, stats2])
+        >>> len(results)
+        2
+        >>> results.issues[0]
+        {10: {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}, 12: {'std': 0.00536, 'var': 2.87e-05, 'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128}}
+        >>> results.issues[1]
+        {}
+        """  # noqa: E501
+        if isinstance(stats, DatasetStatsOutput):
+            outliers = self._get_outliers({k: v for o in stats.outputs() for k, v in o.dict().items()})
+            return OutliersOutput(outliers)
+
+        if isinstance(stats, (DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)):
+            return OutliersOutput(self._get_outliers(stats.dict()))
+
+        if not isinstance(stats, Sequence):
+            raise TypeError(
+                "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
+            )
+
+        stats_map: dict[type, list[int]] = {}
+        for i, stats_output in enumerate(stats):
+            if not isinstance(
+                stats_output, (DatasetStatsOutput, DimensionStatsOutput, PixelStatsOutput, VisualStatsOutput)
+            ):
+                raise TypeError(
+                    "Invalid stats output type; only use output from dimensionstats, pixelstats or visualstats."
+                )
+            stats_map.setdefault(type(stats_output), []).append(i)
+
+        output_list: list[dict[int, dict[str, float]]] = [{} for _ in stats]
+        for _, indices in stats_map.items():
+            substats, dataset_steps = combine_stats([stats[i] for i in indices])
+            outliers = self._get_outliers(substats.dict())
+            for idx, issue in outliers.items():
+                k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                output_list[indices[k]][v] = issue
+
+        return OutliersOutput(output_list)
+
+    @set_metadata(
+        "dataeval.detectors",
+        [
+            "use_dimension",
+            "use_pixel",
+            "use_visual",
+            "outlier_method",
+            "outlier_threshold",
+        ],
+    )
+    def evaluate(self, data: Iterable[ArrayLike]) -> OutliersOutput[IndexIssueMap]:
+        """
+        Returns indices of outliers with the issues identified for each
+
+        Parameters
+        ----------
+        data : Iterable[ArrayLike], shape - (C, H, W)
+            A dataset of images in an ArrayLike format
+
+        Returns
+        -------
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
+            the issues and calculated values for the given index.
+
+        Example
+        -------
+        Evaluate the dataset:
+
+        >>> results = outliers.evaluate(images)
+        >>> list(results.issues)
+        [10, 12]
+        >>> results.issues[10]
+        {'skew': -3.906, 'kurtosis': 13.266, 'entropy': 0.2128, 'contrast': 1.25, 'zeros': 0.05493}
+        """
+        self.stats = datasetstats(
+            images=data,
+            use_dimension=self.use_dimension,
+            use_pixel=self.use_pixel,
+            use_visual=self.use_visual,
+        )
+        outliers = self._get_outliers({k: v for o in self.stats.outputs() for k, v in o.dict().items()})
+        return OutliersOutput(outliers)
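
A usage sketch consistent with the docstrings above; `images` stands in for an iterable of (C, H, W) image arrays, and `stats1`/`stats2` for outputs of the same stats metric:

    from dataeval._internal.detectors.outliers import Outliers

    outliers = Outliers(outlier_method="modzscore")  # default threshold for modzscore is 3.5
    results = outliers.evaluate(images)  # runs datasetstats internally
    print(len(results), sorted(results.issues))

    # Reuse precomputed stats instead of recomputing them per detector:
    per_dataset = outliers.from_stats([stats1, stats2])  # list of per-dataset issue maps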