dataeval 0.64.0__py3-none-any.whl → 0.65.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. dataeval/__init__.py +2 -2
  2. dataeval/_internal/detectors/clusterer.py +46 -34
  3. dataeval/_internal/detectors/drift/base.py +52 -35
  4. dataeval/_internal/detectors/drift/cvm.py +4 -4
  5. dataeval/_internal/detectors/drift/ks.py +6 -6
  6. dataeval/_internal/detectors/drift/mmd.py +35 -16
  7. dataeval/_internal/detectors/drift/torch.py +6 -5
  8. dataeval/_internal/detectors/drift/uncertainty.py +7 -7
  9. dataeval/_internal/detectors/duplicates.py +55 -29
  10. dataeval/_internal/detectors/linter.py +40 -24
  11. dataeval/_internal/detectors/ood/base.py +36 -15
  12. dataeval/_internal/detectors/ood/llr.py +7 -7
  13. dataeval/_internal/flags.py +42 -21
  14. dataeval/_internal/interop.py +2 -2
  15. dataeval/_internal/metrics/balance.py +10 -2
  16. dataeval/_internal/metrics/ber.py +6 -5
  17. dataeval/_internal/metrics/coverage.py +15 -8
  18. dataeval/_internal/metrics/divergence.py +41 -7
  19. dataeval/_internal/metrics/diversity.py +17 -12
  20. dataeval/_internal/metrics/parity.py +30 -43
  21. dataeval/_internal/metrics/stats.py +196 -317
  22. dataeval/_internal/metrics/uap.py +5 -2
  23. dataeval/_internal/metrics/utils.py +70 -33
  24. dataeval/_internal/models/tensorflow/losses.py +3 -3
  25. dataeval/_internal/models/tensorflow/trainer.py +3 -2
  26. dataeval/_internal/models/tensorflow/utils.py +4 -3
  27. dataeval/_internal/output.py +82 -0
  28. dataeval/_internal/workflows/sufficiency.py +96 -107
  29. dataeval/flags/__init__.py +2 -2
  30. dataeval/metrics/__init__.py +3 -3
  31. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/METADATA +1 -1
  32. dataeval-0.65.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/base.py +0 -10
  34. dataeval-0.64.0.dist-info/RECORD +0 -60
  35. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.64.0.dist-info → dataeval-0.65.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/utils.py
@@ -2,6 +2,7 @@ from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional, Seq
 
 import numpy as np
 import xxhash as xxh
+from numpy.typing import NDArray
 from PIL import Image
 from scipy.fftpack import dct
 from scipy.signal import convolve2d
@@ -25,7 +26,7 @@ def get_method(method_map: Dict[str, Callable], method: str) -> Callable:
 
 
 def get_counts(
-    data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
+    data: NDArray, names: List[str], is_categorical: List[bool], subset_mask: Optional[NDArray[np.bool_]] = None
 ) -> tuple[Dict, Dict]:
     """
     Initialize dictionary of histogram counts --- treat categorical values
@@ -33,7 +34,7 @@ def get_counts(
 
     Parameters
     ----------
-    subset_mask: Optional[np.ndarray[bool]]
+    subset_mask: Optional[NDArray[np.bool_]]
         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Returns
@@ -66,24 +67,24 @@ def get_counts(
 
 
 def entropy(
-    data: np.ndarray,
+    data: NDArray,
     names: List[str],
     is_categorical: List[bool],
     normalized: bool = False,
-    subset_mask: Optional[np.ndarray] = None,
-) -> np.ndarray:
+    subset_mask: Optional[NDArray[np.bool_]] = None,
+) -> NDArray[np.float64]:
     """
     Meant for use with Bias metrics, Balance, Diversity, ClasswiseBalance,
     and Classwise Diversity.
 
-    Compute entropy for discrete/categorical variables and, through standard
-    histogram binning, for continuous variables.
+    Compute entropy for discrete/categorical variables and for continuous variables through standard
+    histogram binning.
 
     Parameters
     ----------
     normalized: bool
        Flag that determines whether or not to normalize entropy by log(num_bins)
-    subset_mask: Optional[np.ndarray[bool]]
+    subset_mask: Optional[NDArray[np.bool_]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
 
     Notes
@@ -93,7 +94,7 @@ def entropy(
 
     Returns
     -------
-    ent: np.ndarray[float]
+    ent: NDArray[np.float64]
        Entropy estimate per column of X
 
     See Also
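
For orientation on the `entropy` changes above: the function estimates entropy per metadata factor from histogram counts and, when `normalized` is set, divides by log(num_bins) as the docstring says. A minimal, self-contained sketch of that idea (not dataeval's exact implementation; the counts below are invented):

import numpy as np
from scipy.stats import entropy as scipy_entropy

# Hypothetical per-factor histogram counts, e.g. as returned by get_counts()
hist_counts = {"class_label": np.array([40, 35, 25]), "altitude": np.array([10, 50, 30, 10])}

ent = np.empty(len(hist_counts), dtype=np.float64)
for i, counts in enumerate(hist_counts.values()):
    num_bins = np.count_nonzero(counts)
    ev = scipy_entropy(counts)  # scipy normalizes the counts to probabilities
    ent[i] = ev / np.log(num_bins) if num_bins > 1 else 0.0  # the "normalized=True" variant
print(ent)  # one estimate per factor, matching the NDArray[np.float64] return annotation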
@@ -119,16 +120,20 @@ def entropy(
 
 
 def get_num_bins(
-    data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
-) -> np.ndarray:
+    data: NDArray, names: List[str], is_categorical: List[bool], subset_mask: Optional[NDArray[np.bool_]] = None
+) -> NDArray[np.float64]:
     """
     Number of bins or unique values for each metadata factor, used to
     normalize entropy/diversity.
 
     Parameters
     ----------
-    subset_mask: Optional[np.ndarray[bool]]
+    subset_mask: Optional[NDArray[np.bool_]]
        Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+    Returns
+    -------
+    NDArray[np.float64]
     """
     # likely cached
     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
@@ -139,7 +144,7 @@ def get_num_bins(
     return num_bins
 
 
-def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
+def infer_categorical(X: NDArray, threshold: float = 0.5) -> NDArray:
     """
     Compute fraction of feature values that are unique --- intended to be used
     for inferring whether variables are categorical.
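
`infer_categorical` only changes its annotations here; the heuristic it documents (flag a factor as categorical when the fraction of unique values is small) can be sketched like this, purely for illustration:

import numpy as np

def infer_categorical_sketch(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
    # Fraction of unique values per column; a low fraction suggests a categorical factor
    X = X.reshape((X.shape[0], -1)) if X.ndim > 1 else X.reshape((-1, 1))
    pct_unique = np.array([np.unique(X[:, c]).size / X.shape[0] for c in range(X.shape[1])])
    return pct_unique < threshold

print(infer_categorical_sketch(np.array([0, 1, 1, 0, 2, 1, 0, 2])))  # [ True]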
@@ -154,9 +159,11 @@ def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
     return pct_unique < threshold
 
 
-def preprocess_metadata(class_labels: Sequence[int], metadata: List[Dict]) -> Tuple[np.ndarray, List[str], List[bool]]:
+def preprocess_metadata(
+    class_labels: Sequence[int], metadata: List[Dict], cat_thresh: float = 0.2
+) -> Tuple[NDArray, List[str], List[bool]]:
     # convert class_labels and list of metadata dicts to dict of ndarrays
-    metadata_dict: Dict[str, np.ndarray] = {
+    metadata_dict: Dict[str, NDArray] = {
         "class_label": np.asarray(class_labels, dtype=int),
         **{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
     }
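
The dict construction above is what turns a list of per-image metadata dicts into per-factor arrays; on a toy input (values invented for illustration) it behaves like this:

import numpy as np

class_labels = [0, 1, 1]
metadata = [
    {"altitude": 1200, "sensor": "A"},
    {"altitude": 800, "sensor": "B"},
    {"altitude": 950, "sensor": "A"},
]
metadata_dict = {
    "class_label": np.asarray(class_labels, dtype=int),
    **{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
}
data = np.stack(list(metadata_dict.values()), axis=-1)
print(data.shape)  # (3, 3): one row per sample, one column per factor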
@@ -172,18 +179,35 @@ def preprocess_metadata(class_labels: Sequence[int], metadata: List[Dict]) -> Tu
 
     data = np.stack(list(metadata_dict.values()), axis=-1)
     names = list(metadata_dict.keys())
-    is_categorical = [infer_categorical(metadata_dict[var], 0.25)[0] for var in names]
+    is_categorical = [infer_categorical(metadata_dict[var], cat_thresh)[0] for var in names]
 
     return data, names, is_categorical
 
 
-def minimum_spanning_tree(X: np.ndarray) -> Any:
+def flatten(X: NDArray):
+    """
+    Flattens input array from (N, ... ) to (N, -1) where all samples N have all data in their last dimension
+
+    Parameters
+    ----------
+    X : NDArray, shape - (N, ... )
+        Input array
+
+    Returns
+    -------
+    NDArray, shape - (N, -1)
+    """
+
+    return X.reshape((X.shape[0], -1))
+
+
+def minimum_spanning_tree(X: NDArray) -> Any:
     """
     Returns the minimum spanning tree from a NumPy image array.
 
     Parameters
     ----------
-    X: np.ndarray
+    X : NDArray
        Numpy image array
 
     Returns
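
The new `flatten` helper centralizes the (N, ... ) to (N, -1) reshape that `minimum_spanning_tree` previously did inline and that `compute_neighbors` now also uses; a quick illustration:

import numpy as np

images = np.zeros((16, 3, 28, 28))            # N samples with arbitrary trailing dimensions
flat = images.reshape((images.shape[0], -1))  # exactly what flatten(images) returns
print(flat.shape)                             # (16, 2352)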
@@ -191,7 +215,7 @@ def minimum_spanning_tree(X: np.ndarray) -> Any:
        Data representing the minimum spanning tree
     """
     # All features belong on second dimension
-    X = X.reshape((X.shape[0], -1))
+    X = flatten(X)
     # We add a small constant to the distance matrix to ensure scipy interprets
     # the input graph as fully-connected.
     dense_eudist = squareform(pdist(X)) + EPSILON
@@ -199,13 +223,13 @@ def minimum_spanning_tree(X: np.ndarray) -> Any:
     return mst(eudist_csr)
 
 
-def get_classes_counts(labels: np.ndarray) -> Tuple[int, int]:
+def get_classes_counts(labels: NDArray) -> Tuple[int, int]:
     """
     Returns the classes and counts of from an array of labels
 
     Parameters
     ----------
-    label: np.ndarray
+    label : NDArray
        Numpy labels array
 
     Returns
@@ -226,17 +250,17 @@ def get_classes_counts(labels: np.ndarray) -> Tuple[int, int]:
 
 
 def compute_neighbors(
-    A: np.ndarray,
-    B: np.ndarray,
+    A: NDArray,
+    B: NDArray,
     k: int = 1,
     algorithm: Literal["auto", "ball_tree", "kd_tree"] = "auto",
-) -> np.ndarray:
+) -> NDArray:
     """
     For each sample in A, compute the nearest neighbor in B
 
     Parameters
     ----------
-    A, B : np.ndarray
+    A, B : NDArray
        The n_samples and n_features respectively
     k : int
        The number of neighbors to find
@@ -252,11 +276,24 @@ def compute_neighbors(
     List:
        Closest points to each point in A and B
 
+    Raises
+    ------
+    ValueError
+        If algorithm is not "auto", "ball_tree", or "kd_tree"
+
     See Also
     --------
     sklearn.neighbors.NearestNeighbors
     """
 
+    if k < 1:
+        raise ValueError("k must be >= 1")
+    if algorithm not in ["auto", "ball_tree", "kd_tree"]:
+        raise ValueError("Algorithm must be 'auto', 'ball_tree', or 'kd_tree'")
+
+    A = flatten(A)
+    B = flatten(B)
+
     nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
     nns = nbrs.kneighbors(A)[1]
     nns = nns[:, 1:].squeeze()
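
With the added guards, bad `k` or `algorithm` values now fail fast, and both inputs are flattened before fitting the neighbor index. The unguarded core of the function is equivalent to this sklearn snippet (a sketch on random data):

import numpy as np
from sklearn.neighbors import NearestNeighbors

A = np.random.rand(5, 3, 4)   # queries, flattened internally to (5, 12)
B = np.random.rand(20, 3, 4)  # reference set, flattened to (20, 12)
A_flat, B_flat = A.reshape((5, -1)), B.reshape((20, -1))

nbrs = NearestNeighbors(n_neighbors=2, algorithm="auto").fit(B_flat)  # k + 1 with k=1
nns = nbrs.kneighbors(A_flat)[1][:, 1:].squeeze()
print(nns.shape)  # (5,): neighbor indices into B; the first column is dropped, mirroring the function body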
@@ -270,7 +307,7 @@ class BitDepth(NamedTuple):
     pmax: Union[float, int]
 
 
-def get_bitdepth(image: np.ndarray) -> BitDepth:
+def get_bitdepth(image: NDArray) -> BitDepth:
     """
     Approximates the bit depth of the image using the
     min and max pixel values.
@@ -283,7 +320,7 @@ def get_bitdepth(image: np.ndarray) -> BitDepth:
     return BitDepth(depth, 0, 2**depth - 1)
 
 
-def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
+def rescale(image: NDArray, depth: int = 1) -> NDArray:
     """
     Rescales the image using the bit depth provided.
     """
@@ -295,7 +332,7 @@ def rescale(image: np.ndarray, depth: int = 1) -> np.ndarray:
     return normalized * (2**depth - 1)
 
 
-def normalize_image_shape(image: np.ndarray) -> np.ndarray:
+def normalize_image_shape(image: NDArray) -> NDArray:
     """
     Normalizes the image shape into (C,H,W).
     """
@@ -311,7 +348,7 @@ def normalize_image_shape(image: np.ndarray) -> np.ndarray:
     raise ValueError("Images must have 2 or more dimensions.")
 
 
-def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
+def edge_filter(image: NDArray, offset: float = 0.5) -> NDArray:
     """
     Returns the image filtered using a 3x3 edge detection kernel:
     [[ -1, -1, -1 ],
@@ -323,7 +360,7 @@ def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
     return edges
 
 
-def pchash(image: np.ndarray) -> str:
+def pchash(image: NDArray) -> str:
     """
     Performs a perceptual hash on an image by resizing to a square NxN image
     using the Lanczos algorithm where N is 32x32 or the largest multiple of
@@ -334,7 +371,7 @@ def pchash(image: np.ndarray) -> str:
 
     Parameters
     ----------
-    image : np.ndarray
+    image : NDArray
        An image as a numpy array in CxHxW format
 
     Returns
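
The pchash docstring describes the classic DCT-based perceptual hash. For orientation, here is a generic sketch of that scheme, not dataeval's exact pchash; the resize target, block size, and bit packing are assumptions:

import numpy as np
from PIL import Image
from scipy.fftpack import dct

def phash_sketch(image: np.ndarray) -> str:
    # Collapse channels of a CxHxW image and resize with Lanczos resampling
    gray = Image.fromarray(image.mean(axis=0).astype(np.uint8)).resize((32, 32), Image.LANCZOS)
    # 2D DCT, keep the low-frequency top-left 8x8 block
    coeffs = dct(dct(np.asarray(gray, dtype=np.float64), axis=0), axis=1)[:8, :8]
    bits = (coeffs > np.median(coeffs)).flatten()
    return np.packbits(bits).tobytes().hex()

print(phash_sketch((np.random.rand(3, 64, 64) * 255).astype(np.uint8)))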
@@ -374,7 +411,7 @@ def pchash(image: np.ndarray) -> str:
     return hash_hex if hash_hex else "0"
 
 
-def xxhash(image: np.ndarray) -> str:
+def xxhash(image: NDArray) -> str:
     """
     Performs a fast non-cryptographic hash using the xxhash algorithm
     (xxhash.com) against the image as a flattened bytearray. The hash
@@ -382,7 +419,7 @@ def xxhash(image: np.ndarray) -> str:
 
     Parameters
     ----------
-    image : np.ndarray
+    image : NDArray
        An image as a numpy array
 
     Returns
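
The xxhash wrapper hashes the image's flattened bytes with the xxhash package imported at the top of this module; roughly like the snippet below (the exact digest variant is not shown in this hunk, so xxh64 here is an assumption):

import numpy as np
import xxhash as xxh

image = (np.random.rand(3, 32, 32) * 255).astype(np.uint8)
digest = xxh.xxh64(image.ravel().tobytes()).hexdigest()  # hash of the flattened bytearray
print(digest)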
dataeval/_internal/models/tensorflow/losses.py
@@ -8,9 +8,9 @@ Licensed under Apache Software License (Apache 2.0)
 
 from typing import Literal, Optional, Union, cast
 
-import numpy as np
 import tensorflow as tf
 from keras.layers import Flatten
+from numpy.typing import NDArray
 from tensorflow_probability.python.distributions.mvn_diag import MultivariateNormalDiag
 from tensorflow_probability.python.distributions.mvn_tril import MultivariateNormalTriL
 from tensorflow_probability.python.stats import covariance
@@ -35,12 +35,12 @@ class Elbo:
     def __init__(
         self,
         cov_type: Union[Literal["cov_full", "cov_diag"], float] = 1.0,
-        x: Optional[Union[tf.Tensor, np.ndarray]] = None,
+        x: Optional[Union[tf.Tensor, NDArray]] = None,
     ):
         if isinstance(cov_type, float):
            self.cov = ("sim", cov_type)
         elif cov_type in ["cov_full", "cov_diag"]:
-            x_np: np.ndarray = x.numpy() if tf.is_tensor(x) else x  # type: ignore
+            x_np: NDArray = x.numpy() if tf.is_tensor(x) else x  # type: ignore
            cov = covariance(x_np.reshape(x_np.shape[0], -1))  # type: ignore py38
            if cov_type == "cov_diag":  # infer standard deviation from covariance matrix
                cov = tf.math.sqrt(tf.linalg.diag_part(cov))
dataeval/_internal/models/tensorflow/trainer.py
@@ -11,12 +11,13 @@ from typing import Callable, Iterable, Optional, Tuple, cast
 import keras
 import numpy as np
 import tensorflow as tf
+from numpy.typing import NDArray
 
 
 def trainer(
     model: keras.Model,
-    x_train: np.ndarray,
-    y_train: Optional[np.ndarray] = None,
+    x_train: NDArray,
+    y_train: Optional[NDArray] = None,
     loss_fn: Optional[Callable[..., tf.Tensor]] = None,
     optimizer: keras.optimizers.Optimizer = keras.optimizers.Adam,
     preprocess_fn: Optional[Callable[[tf.Tensor], tf.Tensor]] = None,
dataeval/_internal/models/tensorflow/utils.py
@@ -21,6 +21,7 @@ from keras.layers (
     InputLayer,
     Reshape,
 )
+from numpy.typing import NDArray
 from tensorflow._api.v2.nn import relu, softmax, tanh
 
 from dataeval._internal.models.tensorflow.autoencoder import AE, AEGMM, VAE, VAEGMM
@@ -28,12 +29,12 @@ from dataeval._internal.models.tensorflow.pixelcnn import PixelCNN
 
 
 def predict_batch(
-    x: Union[list, np.ndarray, tf.Tensor],
+    x: Union[list, NDArray, tf.Tensor],
     model: Union[Callable, keras.Model],
     batch_size: int = int(1e10),
     preprocess_fn: Optional[Callable] = None,
     dtype: Union[Type[np.generic], tf.DType] = np.float32,
-) -> Union[np.ndarray, tf.Tensor, tuple, list]:
+) -> Union[NDArray, tf.Tensor, tuple, list]:
     """
     Make batch predictions on a model.
 
@@ -80,7 +81,7 @@ def predict_batch(
         else:
            raise TypeError(
                f"Model output type {type(preds_tmp)} not supported. The model output "
-                f"type needs to be one of list, tuple, np.ndarray or tf.Tensor."
+                f"type needs to be one of list, tuple, NDArray or tf.Tensor."
            )
     concat = np.concatenate if return_np else tf.concat
     out = cast(
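
`predict_batch` slices the input into `batch_size` chunks, runs the model on each, and concatenates the per-batch outputs (with the numpy/tensor handling shown in the hunk above). Reduced to plain numpy, the pattern is roughly:

import numpy as np

def predict_in_batches(x, model, batch_size=64):
    # Run the model on consecutive slices and stitch the outputs back together
    preds = []
    for start in range(0, len(x), batch_size):
        preds.append(np.asarray(model(x[start : start + batch_size])))
    return np.concatenate(preds, axis=0)

out = predict_in_batches(np.random.rand(10, 4), lambda batch: batch * 2, batch_size=3)
print(out.shape)  # (10, 4)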
dataeval/_internal/output.py (new file)
@@ -0,0 +1,82 @@
+import inspect
+from datetime import datetime, timezone
+from functools import wraps
+from typing import Dict, List, Optional
+
+import numpy as np
+
+from dataeval import __version__
+
+
+class OutputMetadata:
+    _name: str
+    _execution_time: str
+    _execution_duration: float
+    _arguments: Dict[str, str]
+    _state: Dict[str, str]
+    _version: str
+
+    def dict(self) -> Dict:
+        return {k: v for k, v in self.__dict__.items() if not k.startswith("_")}
+
+    def meta(self) -> Dict:
+        return {k.removeprefix("_"): v for k, v in self.__dict__.items() if k.startswith("_")}
+
+
+def set_metadata(module_name: str = "", state_attr: Optional[List[str]] = None):
+    def decorator(fn):
+        @wraps(fn)
+        def wrapper(*args, **kwargs):
+            def fmt(v):
+                if np.isscalar(v):
+                    return v
+                if hasattr(v, "shape"):
+                    return f"{v.__class__.__name__}: shape={getattr(v, 'shape')}"
+                if hasattr(v, "__len__"):
+                    return f"{v.__class__.__name__}: len={len(v)}"
+                return f"{v.__class__.__name__}"
+
+            time = datetime.now(timezone.utc)
+            result = fn(*args, **kwargs)
+            duration = (datetime.now(timezone.utc) - time).total_seconds()
+            fn_params = inspect.signature(fn).parameters
+            # set all params with defaults then update params with mapped arguments and explicit keyword args
+            arguments = {k: None if v.default is inspect.Parameter.empty else v.default for k, v in fn_params.items()}
+            arguments.update(zip(fn_params, args))
+            arguments.update(kwargs)
+            arguments = {k: fmt(v) for k, v in arguments.items()}
+            state = (
+                {k: fmt(getattr(args[0], k)) for k in state_attr if "self" in arguments}
+                if "self" in arguments and state_attr
+                else {}
+            )
+            name = args[0].__class__.__name__ if "self" in arguments else fn.__name__
+            metadata = {
+                "_name": f"{module_name}.{name}",
+                "_execution_time": time,
+                "_execution_duration": duration,
+                "_arguments": {k: v for k, v in arguments.items() if k != "self"},
+                "_state": state,
+                "_version": __version__,
+            }
+            for k, v in metadata.items():
+                object.__setattr__(result, k, v)
+            return result
+
+        return wrapper
+
+    return decorator
+
+
+def populate_defaults(d: dict, c: type) -> dict:
+    def default(t):
+        name = t._name if hasattr(t, "_name") else t.__name__  # py3.9 : _name, py3.10 : __name__
+        if name == "Dict":
+            return {}
+        if name == "List":
+            return []
+        if name == "ndarray":
+            return np.array([])
+        raise TypeError("Unrecognized annotation type")
+
+    return {k: d[k] if k in d else default(t) for k, t in c.__annotations__.items()}
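
The new output module appears intended to let metric results expose their values via `dict()` and execution metadata (name, timing, arguments, state, version) via `meta()`, with `set_metadata` attaching that metadata to whatever the wrapped function returns. A usage sketch with a hypothetical output dataclass and metric function (dataeval's real output classes live elsewhere in this release):

from dataclasses import dataclass

import numpy as np

from dataeval._internal.output import OutputMetadata, set_metadata

@dataclass(frozen=True)
class ExampleOutput(OutputMetadata):  # hypothetical output type
    score: float

@set_metadata("dataeval.metrics")
def example_metric(data: np.ndarray) -> ExampleOutput:  # hypothetical metric
    return ExampleOutput(score=float(data.mean()))

result = example_metric(np.ones((4, 3)))
print(result.dict())  # {'score': 1.0}
print(result.meta())  # name, execution_time, execution_duration, arguments, state, version

`populate_defaults`, for its part, fills missing keys of a dict from a class's annotations (Dict to {}, List to [], ndarray to an empty array) before an output object is constructed.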