dataeval 0.64.0__py3-none-any.whl → 0.66.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. dataeval/__init__.py +13 -9
  2. dataeval/_internal/detectors/clusterer.py +63 -49
  3. dataeval/_internal/detectors/drift/base.py +248 -51
  4. dataeval/_internal/detectors/drift/cvm.py +28 -26
  5. dataeval/_internal/detectors/drift/ks.py +31 -28
  6. dataeval/_internal/detectors/drift/mmd.py +62 -42
  7. dataeval/_internal/detectors/drift/torch.py +69 -60
  8. dataeval/_internal/detectors/drift/uncertainty.py +32 -32
  9. dataeval/_internal/detectors/duplicates.py +67 -31
  10. dataeval/_internal/detectors/ood/ae.py +15 -29
  11. dataeval/_internal/detectors/ood/aegmm.py +33 -27
  12. dataeval/_internal/detectors/ood/base.py +86 -47
  13. dataeval/_internal/detectors/ood/llr.py +34 -31
  14. dataeval/_internal/detectors/ood/vae.py +32 -31
  15. dataeval/_internal/detectors/ood/vaegmm.py +34 -28
  16. dataeval/_internal/detectors/{linter.py → outliers.py} +60 -38
  17. dataeval/_internal/flags.py +44 -21
  18. dataeval/_internal/interop.py +5 -3
  19. dataeval/_internal/metrics/balance.py +42 -5
  20. dataeval/_internal/metrics/ber.py +11 -8
  21. dataeval/_internal/metrics/coverage.py +15 -8
  22. dataeval/_internal/metrics/divergence.py +41 -7
  23. dataeval/_internal/metrics/diversity.py +57 -19
  24. dataeval/_internal/metrics/parity.py +141 -66
  25. dataeval/_internal/metrics/stats.py +330 -313
  26. dataeval/_internal/metrics/uap.py +33 -4
  27. dataeval/_internal/metrics/utils.py +79 -40
  28. dataeval/_internal/models/pytorch/autoencoder.py +127 -22
  29. dataeval/_internal/models/tensorflow/autoencoder.py +33 -30
  30. dataeval/_internal/models/tensorflow/gmm.py +4 -2
  31. dataeval/_internal/models/tensorflow/losses.py +17 -13
  32. dataeval/_internal/models/tensorflow/pixelcnn.py +19 -18
  33. dataeval/_internal/models/tensorflow/trainer.py +10 -7
  34. dataeval/_internal/models/tensorflow/utils.py +23 -20
  35. dataeval/_internal/output.py +85 -0
  36. dataeval/_internal/utils.py +5 -3
  37. dataeval/_internal/workflows/sufficiency.py +122 -121
  38. dataeval/detectors/__init__.py +6 -25
  39. dataeval/detectors/drift/__init__.py +16 -0
  40. dataeval/detectors/drift/kernels/__init__.py +6 -0
  41. dataeval/detectors/drift/updates/__init__.py +3 -0
  42. dataeval/detectors/linters/__init__.py +5 -0
  43. dataeval/detectors/ood/__init__.py +11 -0
  44. dataeval/flags/__init__.py +2 -2
  45. dataeval/metrics/__init__.py +2 -26
  46. dataeval/metrics/bias/__init__.py +14 -0
  47. dataeval/metrics/estimators/__init__.py +9 -0
  48. dataeval/metrics/stats/__init__.py +6 -0
  49. dataeval/tensorflow/__init__.py +3 -0
  50. dataeval/tensorflow/loss/__init__.py +3 -0
  51. dataeval/tensorflow/models/__init__.py +5 -0
  52. dataeval/tensorflow/recon/__init__.py +3 -0
  53. dataeval/torch/__init__.py +3 -0
  54. dataeval/{models/torch → torch/models}/__init__.py +1 -2
  55. dataeval/torch/trainer/__init__.py +3 -0
  56. dataeval/utils/__init__.py +3 -6
  57. dataeval/workflows/__init__.py +2 -4
  58. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/METADATA +1 -1
  59. dataeval-0.66.0.dist-info/RECORD +72 -0
  60. dataeval/_internal/metrics/base.py +0 -10
  61. dataeval/models/__init__.py +0 -15
  62. dataeval/models/tensorflow/__init__.py +0 -6
  63. dataeval-0.64.0.dist-info/RECORD +0 -60
  64. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/LICENSE.txt +0 -0
  65. {dataeval-0.64.0.dist-info → dataeval-0.66.0.dist-info}/WHEEL +0 -0
dataeval/_internal/detectors/ood/llr.py
@@ -6,15 +6,17 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """

+from __future__ import annotations
+
 from functools import partial
-from typing import Callable, Optional, Tuple
+from typing import Callable

 import keras
 import numpy as np
 import tensorflow as tf
 from keras.layers import Input
 from keras.models import Model
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
 from dataeval._internal.interop import to_numpy
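
Note: the recurring change across these hunks is the new `from __future__ import annotations` import, which defers evaluation of all annotations so that PEP 604 unions (`str | None`) and builtin generics (`tuple[...]`) are legal in signatures even on Python versions before 3.10. A minimal sketch of the pattern (the function is illustrative, not from this package):

    from __future__ import annotations  # annotations are stored as strings, never evaluated at runtime


    def scale(x: float | None = None) -> tuple[float, float]:  # parses fine even on Python 3.8/3.9
        x = 1.0 if x is None else x
        return (x, x * 2)
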
@@ -24,8 +26,8 @@ from dataeval._internal.models.tensorflow.utils import predict_batch


 def build_model(
-    dist: PixelCNN, input_shape: Optional[tuple] = None, filepath: Optional[str] = None
-) -> Tuple[keras.Model, PixelCNN]:
+    dist: PixelCNN, input_shape: tuple | None = None, filepath: str | None = None
+) -> tuple[keras.Model, PixelCNN]:
     """
     Create keras.Model from TF distribution.

@@ -52,7 +54,7 @@ def build_model(


 def mutate_categorical(
-    X: np.ndarray,
+    X: NDArray,
     rate: float,
     seed: int = 0,
     feature_range: tuple = (0, 255),
@@ -87,28 +89,29 @@ def mutate_categorical(


 class OOD_LLR(OODBase):
+    """
+    Likelihood Ratios based outlier detector.
+
+    Parameters
+    ----------
+    model : PixelCNN
+        Generative distribution model.
+    model_background : Optional[PixelCNN], default None
+        Optional model for the background. Only needed if it is different from `model`.
+    log_prob : Optional[Callable], default None
+        Function used to evaluate log probabilities under the model
+        if the model does not have a `log_prob` function.
+    sequential : bool, default False
+        Whether the data is sequential. Used to create targets during training.
+    """
+
     def __init__(
         self,
         model: PixelCNN,
-        model_background: Optional[PixelCNN] = None,
-        log_prob: Optional[Callable] = None,
+        model_background: PixelCNN | None = None,
+        log_prob: Callable | None = None,
         sequential: bool = False,
     ) -> None:
-        """
-        Likelihood Ratios based outlier detector.
-
-        Parameters
-        ----------
-        model : PixelCNN
-            Generative distribution model.
-        model_background : Optional[PixelCNN], default None
-            Optional model for the background. Only needed if it is different from `model`.
-        log_prob : Optional[Callable], default None
-            Function used to evaluate log probabilities under the model
-            if the model does not have a `log_prob` function.
-        sequential : bool, default False
-            Whether the data is sequential. Used to create targets during training.
-        """
         self.dist_s = model
         self.dist_b = (
             model.copy()
@@ -123,13 +126,13 @@ class OOD_LLR(OODBase):

         self._ref_score: OODScore
         self._threshold_perc: float
-        self._data_info: Optional[Tuple[tuple, type]] = None
+        self._data_info: tuple[tuple, type] | None = None

     def fit(
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn: Optional[Callable] = None,
+        loss_fn: Callable | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
@@ -144,10 +147,10 @@ class OOD_LLR(OODBase):
         Parameters
         ----------
         x_ref : ArrayLike
-            Training batch.
+            Training data.
         threshold_perc : float, default 100.0
             Percentage of reference data that is normal.
-        loss_fn : Optional[Callable], default None
+        loss_fn : Callable | None, default None
             Loss function used for training.
         optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
             Optimizer used for training.
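
Based on the constructor and `fit` signatures above, usage looks roughly like this (a sketch; the `x_train` array and the PixelCNN constructor arguments are illustrative assumptions, not taken from this diff):

    from dataeval._internal.detectors.ood.llr import OOD_LLR
    from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN

    detector = OOD_LLR(model=PixelCNN(image_shape=(28, 28, 1)))  # hypothetical model config
    detector.fit(x_train, threshold_perc=95.0, epochs=10, batch_size=32)  # x_train: training images
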
@@ -221,10 +224,10 @@ class OOD_LLR(OODBase):
     def _logp(
         self,
         dist,
-        X: np.ndarray,
+        X: NDArray,
         return_per_feature: bool = False,
         batch_size: int = int(1e10),
-    ) -> np.ndarray:
+    ) -> NDArray:
         """
         Compute log probability of a batch of instances under the generative model.
         """
@@ -235,10 +238,10 @@ class OOD_LLR(OODBase):
     def _logp_alt(
         self,
         model: keras.Model,
-        X: np.ndarray,
+        X: NDArray,
         return_per_feature: bool = False,
         batch_size: int = int(1e10),
-    ) -> np.ndarray:
+    ) -> NDArray:
         """
         Compute log probability of a batch of instances with the user defined log_prob function.
         """
@@ -254,7 +257,7 @@ class OOD_LLR(OODBase):
         axis = tuple(np.arange(len(logp.shape))[1:])
         return np.mean(logp, axis=axis)

-    def _llr(self, X: np.ndarray, return_per_feature: bool, batch_size: int = int(1e10)) -> np.ndarray:
+    def _llr(self, X: NDArray, return_per_feature: bool, batch_size: int = int(1e10)) -> NDArray:
         """
         Compute likelihood ratios.

dataeval/_internal/detectors/ood/vae.py
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """

+from __future__ import annotations
+
 from typing import Callable

 import keras
 import numpy as np
+import tensorflow as tf
 from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODBase, OODScore
@@ -20,17 +23,33 @@ from dataeval._internal.models.tensorflow.utils import predict_batch


 class OOD_VAE(OODBase):
+    """
+    VAE based outlier detector.
+
+    Parameters
+    ----------
+    model : VAE
+        A VAE model.
+    samples : int, default 10
+        Number of samples sampled to evaluate each instance.
+
+    Examples
+    --------
+    Instantiate an OOD detector metric with a generic dataset - batch of images with shape (3,25,25)
+
+    >>> metric = OOD_VAE(create_model(VAE, dataset[0].shape))
+
+    Adjusting fit parameters,
+    including setting the fit threshold at 85% for a training set with about 15% out-of-distribution
+
+    >>> metric.fit(dataset, threshold_perc=85, batch_size=128, verbose=False)
+
+    Detect out of distribution samples at the 'feature' level
+
+    >>> result = metric.predict(dataset, ood_type="feature")
+    """
+
     def __init__(self, model: VAE, samples: int = 10) -> None:
-        """
-        VAE based outlier detector.
-
-        Parameters
-        ----------
-        model : VAE
-            A VAE model.
-        samples : int, default 10
-            Number of samples sampled to evaluate each instance.
-        """
         super().__init__(model)
         self.samples = samples

@@ -38,32 +57,14 @@ class OOD_VAE(OODBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn: Callable = Elbo(0.05),
+        loss_fn: Callable[..., tf.Tensor] | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
     ) -> None:
-        """
-        Train the VAE model.
-
-        Parameters
-        ----------
-        x_ref : ArrayLike
-            Training batch.
-        threshold_perc : float, default 100.0
-            Percentage of reference data that is normal.
-        loss_fn : Callable, default Elbo(0.05)
-            Loss function used for training.
-        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
+        if loss_fn is None:
+            loss_fn = Elbo(0.05)
         super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)

     def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
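
The `loss_fn` change here (and in `OOD_VAEGMM` below) swaps a default argument that was constructed once at import time, `Elbo(0.05)`, for a `None` sentinel resolved inside the call, so every `fit` gets a fresh loss object. The pattern in isolation (generic sketch, not dataeval code):

    from __future__ import annotations

    from typing import Callable


    def quadratic(x: float) -> float:  # stand-in for a default such as Elbo(0.05)
        return x * x


    def fit(loss_fn: Callable[[float], float] | None = None) -> float:
        if loss_fn is None:  # resolve the sentinel per call instead of once at definition time
            loss_fn = quadratic
        return loss_fn(3.0)
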
dataeval/_internal/detectors/ood/vaegmm.py
@@ -6,10 +6,13 @@ Original code Copyright (c) 2023 Seldon Technologies Ltd
 Licensed under Apache Software License (Apache 2.0)
 """

+from __future__ import annotations
+
 from typing import Callable

 import keras
 import numpy as np
+import tensorflow as tf
 from numpy.typing import ArrayLike

 from dataeval._internal.detectors.ood.base import OODGMMBase, OODScore
@@ -21,17 +24,18 @@ from dataeval._internal.models.tensorflow.utils import predict_batch


 class OOD_VAEGMM(OODGMMBase):
-    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
-        """
-        VAE with Gaussian Mixture Model based outlier detector.
+    """
+    VAE with Gaussian Mixture Model based outlier detector.

-        Parameters
-        ----------
-        model : VAEGMM
-            A VAEGMM model.
-        samples
-            Number of samples sampled to evaluate each instance.
-        """
+    Parameters
+    ----------
+    model : VAEGMM
+        A VAEGMM model.
+    samples
+        Number of samples sampled to evaluate each instance.
+    """
+
+    def __init__(self, model: VAEGMM, samples: int = 10) -> None:
         super().__init__(model)
         self.samples = samples

@@ -39,35 +43,37 @@ class OOD_VAEGMM(OODGMMBase):
         self,
         x_ref: ArrayLike,
         threshold_perc: float = 100.0,
-        loss_fn: Callable = LossGMM(elbo=Elbo(0.05)),
+        loss_fn: Callable[..., tf.Tensor] | None = None,
         optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
         epochs: int = 20,
         batch_size: int = 64,
         verbose: bool = True,
     ) -> None:
+        if loss_fn is None:
+            loss_fn = LossGMM(elbo=Elbo(0.05))
+        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+
+    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
         """
-        Train the AE model with recommended loss function and optimizer.
+        Compute the out-of-distribution (OOD) score for a given dataset.

         Parameters
         ----------
         X : ArrayLike
-            Training batch.
-        threshold_perc : float, default 100.0
-            Percentage of reference data that is normal.
-        loss_fn : Callable, default LossGMM(elbo=Elbo(0.05))
-            Loss function used for training.
-        optimizer : keras.optimizers.Optimizer, default keras.optimizers.Adam
-            Optimizer used for training.
-        epochs : int, default 20
-            Number of training epochs.
-        batch_size : int, default 64
-            Batch size used for training.
-        verbose : bool, default True
-            Whether to print training progress.
-        """
-        super().fit(x_ref, threshold_perc, loss_fn, optimizer, epochs, batch_size, verbose)
+            Input data to score.
+        batch_size : int, default 1e10
+            Number of instances to process in each batch.
+            Use a smaller batch size if your dataset is large or if you encounter memory issues.

-    def score(self, X: ArrayLike, batch_size: int = int(1e10)) -> OODScore:
+        Returns
+        -------
+        OODScore
+            An object containing the instance-level OOD score.
+
+        Note
+        ----
+        This model does not produce a feature level score like the OOD_AE or OOD_VAE models.
+        """
         self._validate(X := to_numpy(X))

         # draw samples from latent space
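
Per the relocated docstring, `OOD_VAEGMM.score` is instance-level only. A usage sketch (the model and image arrays are illustrative, and the `instance_score` attribute on `OODScore` is an assumption based on the docstring):

    detector = OOD_VAEGMM(model=my_vaegmm)  # my_vaegmm: a VAEGMM instance, assumed defined elsewhere
    detector.fit(train_images, threshold_perc=95.0, verbose=False)
    score = detector.score(test_images, batch_size=256)
    print(score.instance_score)             # one score per instance, no feature-level scores
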
dataeval/_internal/detectors/{linter.py → outliers.py}
@@ -1,15 +1,32 @@
-from typing import Iterable, Literal, Optional, Sequence, Union
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterable, Literal

 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray
+
+from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
+from dataeval._internal.metrics.stats import StatsOutput, imagestats
+from dataeval._internal.output import OutputMetadata, set_metadata
+
+
+@dataclass(frozen=True)
+class OutliersOutput(OutputMetadata):
+    """
+    Attributes
+    ----------
+    issues : Dict[int, Dict[str, float]]
+        Dictionary containing the indices of outliers and a dictionary showing
+        the issues and calculated values for the given index.
+    """

-from dataeval._internal.flags import ImageProperty, ImageVisuals, LinterFlags
-from dataeval._internal.metrics.stats import ImageStats
+    issues: dict[int, dict[str, float]]


 def _get_outlier_mask(
-    values: np.ndarray, method: Literal["zscore", "modzscore", "iqr"], threshold: Optional[float]
-) -> np.ndarray:
+    values: NDArray, method: Literal["zscore", "modzscore", "iqr"], threshold: float | None
+) -> NDArray:
     if method == "zscore":
         threshold = threshold if threshold else 3.0
         std = np.std(values)
@@ -18,7 +35,7 @@ def _get_outlier_mask(
     elif method == "modzscore":
         threshold = threshold if threshold else 3.5
         abs_diff = np.abs(values - np.median(values))
-        med_abs_diff = np.median(abs_diff)
+        med_abs_diff = np.median(abs_diff) if np.median(abs_diff) != 0 else np.mean(abs_diff)
         mod_z_score = 0.6745 * abs_diff / med_abs_diff
         return mod_z_score > threshold
     elif method == "iqr":
@@ -30,14 +47,15 @@ def _get_outlier_mask(
         raise ValueError("Outlier method must be 'zscore' 'modzscore' or 'iqr'.")


-class Linter:
+class Outliers:
     r"""
     Calculates statistical outliers of a dataset using various statistical tests applied to each image

     Parameters
     ----------
-    flags : [ImageProperty | ImageStatistics | ImageVisuals], default None
+    flags : ImageStat, default ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS
         Metric(s) to calculate for each image - calculates all metrics if None
+        Only supports ImageStat.ALL_STATS
     outlier_method : ["modzscore" | "zscore" | "iqr"], optional - default "modzscore"
         Statistical method used to identify outliers
     outlier_threshold : float, optional - default None
@@ -46,8 +64,8 @@ class Linter:

     Attributes
     ----------
-    stats : ImageStats
-        Class to hold the value of each metric for each image
+    stats : Dict[str, Any]
+        Dictionary to hold the value of each metric for each image

     See Also
     --------
@@ -75,42 +93,40 @@ class Linter:

     Examples
     --------
-    Initialize the Linter class:
+    Initialize the Outliers class:

-    >>> lint = Linter()
+    >>> outliers = Outliers()

     Specifying specific metrics to analyze:

-    >>> lint = Linter(flags=[ImageProperty.SIZE, ImageVisuals.ALL])
+    >>> outliers = Outliers(flags=ImageStat.SIZE | ImageStat.ALL_VISUALS)

     Specifying an outlier method:

-    >>> lint = Linter(outlier_method="iqr")
+    >>> outliers = Outliers(outlier_method="iqr")

     Specifying an outlier method and threshold:

-    >>> lint = Linter(outlier_method="zscore", outlier_threshold=2.5)
+    >>> outliers = Outliers(outlier_method="zscore", outlier_threshold=2.5)
     """

     def __init__(
         self,
-        flags: Optional[Union[LinterFlags, Sequence[LinterFlags]]] = None,
+        flags: ImageStat = ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS,
         outlier_method: Literal["zscore", "modzscore", "iqr"] = "modzscore",
-        outlier_threshold: Optional[float] = None,
+        outlier_threshold: float | None = None,
     ):
-        flags = flags if flags is not None else (ImageProperty.ALL, ImageVisuals.ALL)
-        self.stats = ImageStats(flags)
+        verify_supported(flags, ImageStat.ALL_STATS)
+        self.flags = flags
         self.outlier_method: Literal["zscore", "modzscore", "iqr"] = outlier_method
         self.outlier_threshold = outlier_threshold

     def _get_outliers(self) -> dict:
         flagged_images = {}
-
-        for stat, values in self.results.items():
-            if not isinstance(values, np.ndarray):
-                continue
-
-            if values.ndim == 1 and np.std(values) != 0:
+        stats_dict = self.stats.dict()
+        supported = to_distinct(ImageStat.ALL_STATS)
+        for stat, values in stats_dict.items():
+            if stat in supported.values() and values.ndim == 1 and np.std(values) != 0:
                 mask = _get_outlier_mask(values, self.outlier_method, self.outlier_threshold)
                 indices = np.flatnonzero(mask)
                 for i, value in zip(indices, values[mask]):
@@ -118,30 +134,36 @@ class Linter:

         return dict(sorted(flagged_images.items()))

-    def evaluate(self, images: Iterable[ArrayLike]) -> dict:
+    @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
+    def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> OutliersOutput:
         """
         Returns indices of outliers with the issues identified for each

         Parameters
         ----------
-        images : Iterable[ArrayLike], shape - (N, C, H, W)
-            A dataset in an ArrayLike format.
-            Function expects the data to have 3 dimensions, CxHxW.
+        data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput
+            A dataset of images in an ArrayLike format or the output from an imagestats metric analysis

         Returns
         -------
-        Dict[int, Dict[str, float]]
-            Dictionary containing the indices of outliers and a dictionary showing
+        OutliersOutput
+            Output class containing the indices of outliers and a dictionary showing
             the issues and calculated values for the given index.

         Example
         -------
         Evaluate the dataset:

-        >>> lint.evaluate(images)
-        {18: {'brightness': 0.78}, 25: {'brightness': 0.98}}
+        >>> outliers.evaluate(images)
+        OutliersOutput(issues={18: {'brightness': 0.78}, 25: {'brightness': 0.98}})
         """
-        self.stats.reset()
-        self.stats.update(images)
-        self.results = self.stats.compute()
-        return self._get_outliers()
+        if isinstance(data, StatsOutput):
+            flags = set(to_distinct(self.flags).values())
+            stats = set(data.dict())
+            missing = flags - stats
+            if missing:
+                raise ValueError(f"StatsOutput is missing {missing} from the required stats: {flags}.")
+            self.stats = data
+        else:
+            self.stats = imagestats(data, self.flags)
+        return OutliersOutput(self._get_outliers())
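
`evaluate` now accepts either raw images or a precomputed `StatsOutput`, so a single `imagestats` pass can be shared across detectors. A sketch of both call paths (the `images` dataset is illustrative):

    outliers = Outliers(flags=ImageStat.ALL_PROPERTIES | ImageStat.ALL_VISUALS)

    # Path 1: pass images directly; stats are computed internally
    result = outliers.evaluate(images)

    # Path 2: reuse a precomputed stats pass (it must cover the requested flags)
    stats = imagestats(images, ImageStat.ALL_STATS)
    result = outliers.evaluate(stats)
    print(result.issues)  # e.g. {18: {'brightness': 0.78}, ...}
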
dataeval/_internal/flags.py
@@ -1,37 +1,33 @@
-from enum import Flag, auto
-from typing import Union
+from __future__ import annotations

+from enum import IntFlag, auto
+from functools import reduce
+from typing import Iterable, TypeVar, cast

-class auto_all:
-    def __get__(self, _, cls):
-        return ~cls(0)
+TFlag = TypeVar("TFlag", bound=IntFlag)


-class ImageHash(Flag):
+class ImageStat(IntFlag):
+    """
+    Flags for calculating image and channel statistics
+    """
+
+    # HASHES
     XXHASH = auto()
     PCHASH = auto()
-    ALL = auto_all()
-
-
-class ImageProperty(Flag):
+    # PROPERTIES
     WIDTH = auto()
     HEIGHT = auto()
     SIZE = auto()
     ASPECT_RATIO = auto()
     CHANNELS = auto()
     DEPTH = auto()
-    ALL = auto_all()
-
-
-class ImageVisuals(Flag):
+    # VISUALS
     BRIGHTNESS = auto()
     BLURRINESS = auto()
     MISSING = auto()
     ZERO = auto()
-    ALL = auto_all()
-
-
-class ImageStatistics(Flag):
+    # PIXEL STATS
     MEAN = auto()
     STD = auto()
     VAR = auto()
@@ -40,8 +36,35 @@ class ImageStatistics(Flag):
     ENTROPY = auto()
     PERCENTILES = auto()
     HISTOGRAM = auto()
-    ALL = auto_all()
+    # JOINT FLAGS
+    ALL_HASHES = XXHASH | PCHASH
+    ALL_PROPERTIES = WIDTH | HEIGHT | SIZE | ASPECT_RATIO | CHANNELS | DEPTH
+    ALL_VISUALS = BRIGHTNESS | BLURRINESS | MISSING | ZERO
+    ALL_PIXELSTATS = MEAN | STD | VAR | SKEW | KURTOSIS | ENTROPY | PERCENTILES | HISTOGRAM
+    ALL_STATS = ALL_PROPERTIES | ALL_VISUALS | ALL_PIXELSTATS
+    ALL = ALL_HASHES | ALL_STATS
+
+
+def is_distinct(flag: IntFlag) -> bool:
+    return (flag & (flag - 1) == 0) and flag != 0
+
+
+def to_distinct(flag: TFlag) -> dict[TFlag, str]:
+    """
+    Returns a distinct set of all flags set on the input flag and their names
+
+    NOTE: this is supported natively in Python 3.11, but for earlier versions we need
+    to use a combination of list comprehension and bit fiddling to determine distinct
+    flag values from joint aliases.
+    """
+    if isinstance(flag, Iterable):  # >= py311
+        return {f: f.name.lower() for f in flag if f.name}
+    else:  # < py311
+        return {f: f.name.lower() for f in list(flag.__class__) if f & flag and is_distinct(f) and f.name}


-ImageStatsFlags = Union[ImageHash, ImageProperty, ImageVisuals, ImageStatistics]
-LinterFlags = Union[ImageProperty, ImageVisuals, ImageStatistics]
+def verify_supported(flag: TFlag, flags: TFlag | Iterable[TFlag]):
+    supported = flags if isinstance(flags, flag.__class__) else cast(TFlag, reduce(lambda a, b: a | b, flags))  # type: ignore
+    unsupported = flag & ~supported
+    if unsupported:
+        raise ValueError(f"Unsupported flags {unsupported} called. Only {supported} flags are supported.")
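
From the definitions above: `to_distinct` expands a joint alias into its single-bit members, and `verify_supported` raises when any bit of a flag falls outside the allowed set. A small sketch of the expected behavior (the printed reprs are approximate):

    print(to_distinct(ImageStat.ALL_HASHES))
    # {<ImageStat.XXHASH: 1>: 'xxhash', <ImageStat.PCHASH: 2>: 'pchash'}

    verify_supported(ImageStat.MEAN, ImageStat.ALL_STATS)    # fine: MEAN is within ALL_STATS
    verify_supported(ImageStat.XXHASH, ImageStat.ALL_STATS)  # raises ValueError: hashes are not stats
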
dataeval/_internal/interop.py
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 from importlib import import_module
-from typing import Iterable, Optional
+from typing import Iterable

 import numpy as np
-from numpy.typing import ArrayLike
+from numpy.typing import ArrayLike, NDArray

 module_cache = {}

@@ -20,7 +22,7 @@ def try_import(module_name):
     return module


-def to_numpy(array: Optional[ArrayLike]) -> np.ndarray:
+def to_numpy(array: ArrayLike | None) -> NDArray:
     if array is None:
         return np.ndarray([])