dataeval 0.61.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (36)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +45 -16
  3. dataeval/_internal/detectors/drift/base.py +15 -12
  4. dataeval/_internal/detectors/drift/cvm.py +12 -8
  5. dataeval/_internal/detectors/drift/ks.py +7 -3
  6. dataeval/_internal/detectors/drift/mmd.py +15 -12
  7. dataeval/_internal/detectors/drift/uncertainty.py +6 -5
  8. dataeval/_internal/detectors/duplicates.py +35 -11
  9. dataeval/_internal/detectors/linter.py +85 -16
  10. dataeval/_internal/detectors/ood/ae.py +7 -5
  11. dataeval/_internal/detectors/ood/aegmm.py +6 -5
  12. dataeval/_internal/detectors/ood/base.py +15 -13
  13. dataeval/_internal/detectors/ood/llr.py +8 -5
  14. dataeval/_internal/detectors/ood/vae.py +6 -4
  15. dataeval/_internal/detectors/ood/vaegmm.py +6 -4
  16. dataeval/_internal/interop.py +43 -0
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +2 -84
  19. dataeval/_internal/metrics/ber.py +77 -53
  20. dataeval/_internal/metrics/coverage.py +80 -55
  21. dataeval/_internal/metrics/divergence.py +62 -54
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +292 -163
  24. dataeval/_internal/metrics/stats.py +48 -35
  25. dataeval/_internal/metrics/uap.py +31 -26
  26. dataeval/_internal/metrics/utils.py +237 -2
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/_internal/workflows/__init__.py +0 -0
  29. dataeval/metrics/__init__.py +25 -5
  30. dataeval/utils/__init__.py +9 -0
  31. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -2
  32. dataeval-0.64.0.dist-info/RECORD +60 -0
  33. dataeval/_internal/metrics/hash.py +0 -79
  34. dataeval-0.61.0.dist-info/RECORD +0 -55
  35. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  36. {dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/stats.py
@@ -1,26 +1,29 @@
+ from abc import abstractmethod
  from enum import Flag
  from typing import Any, Callable, Dict, Generic, Iterable, List, Optional, Sequence, TypeVar, Union

  import numpy as np
+ from numpy.typing import ArrayLike
  from scipy.stats import entropy, kurtosis, skew

  from dataeval._internal.flags import ImageHash, ImageProperty, ImageStatistics, ImageStatsFlags, ImageVisuals
- from dataeval._internal.metrics.base import MetricMixin
- from dataeval._internal.metrics.hash import pchash, xxhash
- from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, rescale
+ from dataeval._internal.interop import to_numpy_iter
+ from dataeval._internal.metrics.base import EvaluateMixin
+ from dataeval._internal.metrics.utils import edge_filter, get_bitdepth, normalize_image_shape, pchash, rescale, xxhash

  QUARTILES = (0, 25, 50, 75, 100)

- TBatch = TypeVar("TBatch", bound=Sequence)
+ TBatch = TypeVar("TBatch", bound=Sequence[ArrayLike])
  TFlag = TypeVar("TFlag", bound=Flag)


- class BaseStatsMetric(MetricMixin, Generic[TBatch, TFlag]):
+ class BaseStatsMetric(EvaluateMixin, Generic[TBatch, TFlag]):
      def __init__(self, flags: TFlag):
          self.flags = flags
          self.results = []

-     def update(self, preds: TBatch, targets=None) -> None:
+     @abstractmethod
+     def update(self, images: TBatch) -> None:
          """
          Updates internal metric cache for later calculation

@@ -66,6 +69,16 @@ class BaseStatsMetric(MetricMixin, Generic[TBatch, TFlag]):
          )
          return [flag.name.lower() for flag in flags if flag.name is not None]

+     def evaluate(self, images: TBatch) -> Dict[str, Any]:
+         """Calculate metric results given a single batch of images"""
+         if self.results:
+             raise RuntimeError("Call reset before calling evaluate")
+
+         self.update(images)
+         results = self.compute()
+         self.reset()
+         return results
+

  class ImageHashMetric(BaseStatsMetric):
      """
@@ -80,12 +93,12 @@ class ImageHashMetric(BaseStatsMetric):
      def __init__(self, flags: ImageHash = ImageHash.ALL):
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for data in preds:
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
              results = self._map(
                  {
-                     ImageHash.XXHASH: lambda: xxhash(data),
-                     ImageHash.PCHASH: lambda: pchash(data),
+                     ImageHash.XXHASH: lambda: xxhash(image),
+                     ImageHash.PCHASH: lambda: pchash(image),
                  }
              )
              self.results.append(results)
@@ -104,16 +117,16 @@ class ImagePropertyMetric(BaseStatsMetric):
      def __init__(self, flags: ImageProperty = ImageProperty.ALL):
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for data in preds:
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
              results = self._map(
                  {
-                     ImageProperty.WIDTH: lambda: np.int32(data.shape[-1]),
-                     ImageProperty.HEIGHT: lambda: np.int32(data.shape[-2]),
-                     ImageProperty.SIZE: lambda: np.int32(data.shape[-1] * data.shape[-2]),
-                     ImageProperty.ASPECT_RATIO: lambda: data.shape[-1] / np.int32(data.shape[-2]),
-                     ImageProperty.CHANNELS: lambda: data.shape[-3],
-                     ImageProperty.DEPTH: lambda: get_bitdepth(data).depth,
+                     ImageProperty.WIDTH: lambda: np.int32(image.shape[-1]),
+                     ImageProperty.HEIGHT: lambda: np.int32(image.shape[-2]),
+                     ImageProperty.SIZE: lambda: np.int32(image.shape[-1] * image.shape[-2]),
+                     ImageProperty.ASPECT_RATIO: lambda: image.shape[-1] / np.int32(image.shape[-2]),
+                     ImageProperty.CHANNELS: lambda: image.shape[-3],
+                     ImageProperty.DEPTH: lambda: get_bitdepth(image).depth,
                  }
              )
              self.results.append(results)
@@ -132,14 +145,14 @@ class ImageVisualsMetric(BaseStatsMetric):
      def __init__(self, flags: ImageVisuals = ImageVisuals.ALL):
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for data in preds:
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
              results = self._map(
                  {
-                     ImageVisuals.BRIGHTNESS: lambda: np.mean(rescale(data)),
-                     ImageVisuals.BLURRINESS: lambda: np.std(edge_filter(np.mean(data, axis=0))),
-                     ImageVisuals.MISSING: lambda: np.sum(np.isnan(data)),
-                     ImageVisuals.ZERO: lambda: np.int32(np.count_nonzero(data == 0)),
+                     ImageVisuals.BRIGHTNESS: lambda: np.mean(rescale(image)),
+                     ImageVisuals.BLURRINESS: lambda: np.std(edge_filter(np.mean(image, axis=0))),
+                     ImageVisuals.MISSING: lambda: np.sum(np.isnan(image)),
+                     ImageVisuals.ZERO: lambda: np.int32(np.count_nonzero(image == 0)),
                  }
              )
              self.results.append(results)
@@ -158,9 +171,9 @@ class ImageStatisticsMetric(BaseStatsMetric):
      def __init__(self, flags: ImageStatistics = ImageStatistics.ALL):
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for data in preds:
-             scaled = rescale(data)
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
+             scaled = rescale(image)
              if (ImageStatistics.HISTOGRAM | ImageStatistics.ENTROPY) & self.flags:
                  hist = np.histogram(scaled, bins=256, range=(0, 1))[0]

@@ -192,10 +205,10 @@ class ChannelStatisticsMetric(BaseStatsMetric):
      def __init__(self, flags: ImageStatistics = ImageStatistics.ALL):
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for data in preds:
-             scaled = rescale(data)
-             flattened = scaled.reshape(data.shape[0], -1)
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
+             scaled = rescale(image)
+             flattened = scaled.reshape(image.shape[0], -1)

              if (ImageStatistics.HISTOGRAM | ImageStatistics.ENTROPY) & self.flags:
                  hist = np.apply_along_axis(lambda x: np.histogram(x, bins=256, range=(0, 1))[0], 1, flattened)
@@ -253,8 +266,8 @@ class ImageStats(BaseAggregateMetric):
          super().__init__(flags)
          self._length = 0

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for image in preds:
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
              self._length += 1
              img = normalize_image_shape(image)
              for metric in self._metrics_dict:
@@ -295,8 +308,8 @@ class ChannelStats(BaseAggregateMetric):
      def __init__(self, flags: Optional[ImageStatistics] = None) -> None:
          super().__init__(flags)

-     def update(self, preds: Iterable[np.ndarray], targets=None) -> None:
-         for image in preds:
+     def update(self, images: Iterable[ArrayLike]) -> None:
+         for image in to_numpy_iter(images):
              img = normalize_image_shape(image)
              for metric in self._metrics_dict:
                  metric.update([img])
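All of the stats classes above now take a single images argument of ArrayLike batches, converted internally through to_numpy_iter, replacing the old preds/targets signature. A minimal usage sketch of the public ImageStats class under the new signature (this assumes ImageStats can be constructed with default flags and still exposes a compute() method, neither of which is shown in this diff):

    import numpy as np
    from dataeval.metrics import ImageStats

    images = np.random.random((8, 3, 32, 32))  # batch of 8 CxHxW images
    stats = ImageStats()
    stats.update(images)       # ArrayLike input, converted via to_numpy_iter
    results = stats.compute()  # aggregated per-image statistics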
dataeval/_internal/metrics/uap.py
@@ -4,42 +4,47 @@ FR Test Statistic based estimate for the upperbound
  average precision using empirical mean precision
  """

- from typing import Dict
+ from typing import NamedTuple

- import numpy as np
+ from numpy.typing import ArrayLike
  from sklearn.metrics import average_precision_score

- from dataeval._internal.metrics.base import EvaluateMixin
+ from dataeval._internal.interop import to_numpy


- class UAP(EvaluateMixin):
+ class UAPOutput(NamedTuple):
      """
-     FR Test Statistic based estimate of the empirical mean precision
+     Attributes
+     ----------
+     uap : float
+         The empirical mean precision estimate
+     """
+
+     uap: float
+
+
+ def uap(labels: ArrayLike, scores: ArrayLike) -> UAPOutput:
+     """
+     FR Test Statistic based estimate of the empirical mean precision for
+     the upperbound average precision

      Parameters
      ----------
-     labels : np.ndarray
+     labels : ArrayLike
          A numpy array of n_samples of class labels with M unique classes.
-
-     scores : np.ndarray
+     scores : ArrayLike
          A 2D array of class probabilities per image
+
+     Returns
+     -------
+     Dict[str, float]
+         uap : The empirical mean precision estimate
+
+     Raises
+     ------
+     ValueError
+         If unique classes M < 2
      """

-     def __init__(self, labels: np.ndarray, scores: np.ndarray) -> None:
-         self.labels = labels
-         self.scores = scores
-
-     def evaluate(self) -> Dict[str, float]:
-         """
-         Returns
-         -------
-         Dict[str, float]
-             uap : The empirical mean precision estimate
-
-         Raises
-         ------
-         ValueError
-             If unique classes M < 2
-         """
-         uap = float(average_precision_score(self.labels, self.scores, average="weighted"))
-         return {"uap": uap}
+     precision = float(average_precision_score(to_numpy(labels), to_numpy(scores), average="weighted"))
+     return UAPOutput(precision)
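The class-based UAP(labels, scores).evaluate() call is replaced by a plain uap() function returning a UAPOutput NamedTuple. A hedged sketch of calling it (the one-hot step below reflects scikit-learn's requirement that average_precision_score receive binary or multilabel-indicator targets, and may not be needed depending on how labels are already encoded; the data is illustrative):

    import numpy as np
    from sklearn.preprocessing import label_binarize
    from dataeval.metrics import uap

    rng = np.random.default_rng(0)
    class_labels = rng.integers(0, 3, size=100)           # 3 unique classes
    scores = rng.dirichlet(np.ones(3), size=100)          # per-class probabilities
    one_hot = label_binarize(class_labels, classes=[0, 1, 2])

    result = uap(one_hot, scores)
    print(result.uap)                                     # empirical mean precision estimate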
dataeval/_internal/metrics/utils.py
@@ -1,15 +1,180 @@
- from typing import Any, Literal, NamedTuple, Tuple, Union
+ from typing import Any, Callable, Dict, List, Literal, NamedTuple, Optional, Sequence, Tuple, Union

  import numpy as np
+ import xxhash as xxh
+ from PIL import Image
+ from scipy.fftpack import dct
  from scipy.signal import convolve2d
  from scipy.sparse import csr_matrix
  from scipy.sparse.csgraph import minimum_spanning_tree as mst
  from scipy.spatial.distance import pdist, squareform
+ from scipy.stats import entropy as sp_entropy
  from sklearn.neighbors import NearestNeighbors

  EPSILON = 1e-5
  EDGE_KERNEL = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype=np.int8)
  BIT_DEPTH = (1, 8, 12, 16, 32)
+ HASH_SIZE = 8
+ MAX_FACTOR = 4
+
+
+ def get_method(method_map: Dict[str, Callable], method: str) -> Callable:
+     if method not in method_map:
+         raise ValueError(f"Specified method {method} is not a valid method: {method_map}.")
+     return method_map[method]
+
+
+ def get_counts(
+     data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
+ ) -> tuple[Dict, Dict]:
+     """
+     Initialize dictionary of histogram counts --- treat categorical values
+     as histogram bins.
+
+     Parameters
+     ----------
+     subset_mask: Optional[np.ndarray[bool]]
+         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+     Returns
+     -------
+     counts: Dict
+         histogram counts per metadata factor in `factors`. Each
+         factor will have a different number of bins. Counts get reused
+         across metrics, so hist_counts are cached but only if computed
+         globally, i.e. without masked samples.
+     """
+
+     hist_counts, hist_bins = {}, {}
+     # np.where needed to satisfy linter
+     mask = np.where(subset_mask if subset_mask is not None else np.ones(data.shape[0], dtype=bool))
+
+     for cdx, fn in enumerate(names):
+         # linter doesn't like double indexing
+         col_data = data[mask, cdx].squeeze()
+         if is_categorical[cdx]:
+             # if discrete, use unique values as bins
+             bins, cnts = np.unique(col_data, return_counts=True)
+         else:
+             bins = hist_bins.get(fn, "auto")
+             cnts, bins = np.histogram(col_data, bins=bins, density=True)
+
+         hist_counts[fn] = cnts
+         hist_bins[fn] = bins
+
+     return hist_counts, hist_bins
+
+
+ def entropy(
+     data: np.ndarray,
+     names: List[str],
+     is_categorical: List[bool],
+     normalized: bool = False,
+     subset_mask: Optional[np.ndarray] = None,
+ ) -> np.ndarray:
+     """
+     Meant for use with Bias metrics, Balance, Diversity, ClasswiseBalance,
+     and Classwise Diversity.
+
+     Compute entropy for discrete/categorical variables and, through standard
+     histogram binning, for continuous variables.
+
+     Parameters
+     ----------
+     normalized: bool
+         Flag that determines whether or not to normalize entropy by log(num_bins)
+     subset_mask: Optional[np.ndarray[bool]]
+         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+
+     Notes
+     -----
+     For continuous variables, histogram bins are chosen automatically. See
+     numpy.histogram for details.
+
+     Returns
+     -------
+     ent: np.ndarray[float]
+         Entropy estimate per column of X
+
+     See Also
+     --------
+     numpy.histogram
+     scipy.stats.entropy
+     """
+
+     num_factors = len(names)
+     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+
+     ev_index = np.empty(num_factors)
+     for col, cnts in enumerate(hist_counts.values()):
+         # entropy in nats, normalizes counts
+         ev_index[col] = sp_entropy(cnts)
+         if normalized:
+             if len(cnts) == 1:
+                 # log(0)
+                 ev_index[col] = 0
+             else:
+                 ev_index[col] /= np.log(len(cnts))
+     return ev_index
+
+
+ def get_num_bins(
+     data: np.ndarray, names: List[str], is_categorical: List[bool], subset_mask: Optional[np.ndarray] = None
+ ) -> np.ndarray:
+     """
+     Number of bins or unique values for each metadata factor, used to
+     normalize entropy/diversity.
+
+     Parameters
+     ----------
+     subset_mask: Optional[np.ndarray[bool]]
+         Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
+     """
+     # likely cached
+     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+     num_bins = np.empty(len(hist_counts))
+     for idx, cnts in enumerate(hist_counts.values()):
+         num_bins[idx] = len(cnts)
+
+     return num_bins
+
+
+ def infer_categorical(X: np.ndarray, threshold: float = 0.5) -> np.ndarray:
+     """
+     Compute fraction of feature values that are unique --- intended to be used
+     for inferring whether variables are categorical.
+     """
+     if X.ndim == 1:
+         X = np.expand_dims(X, axis=1)
+     num_samples = X.shape[0]
+     pct_unique = np.empty(X.shape[1])
+     for col in range(X.shape[1]):  # type: ignore
+         uvals = np.unique(X[:, col], axis=0)
+         pct_unique[col] = len(uvals) / num_samples
+     return pct_unique < threshold
+
+
+ def preprocess_metadata(class_labels: Sequence[int], metadata: List[Dict]) -> Tuple[np.ndarray, List[str], List[bool]]:
+     # convert class_labels and list of metadata dicts to dict of ndarrays
+     metadata_dict: Dict[str, np.ndarray] = {
+         "class_label": np.asarray(class_labels, dtype=int),
+         **{k: np.array([d[k] for d in metadata]) for k in metadata[0]},
+     }
+
+     # map columns of dict that are not numeric (e.g. string) to numeric values
+     # that mutual information and diversity functions can accommodate. Each
+     # unique string receives a unique integer value.
+     for k, v in metadata_dict.items():
+         # if not numeric
+         if not np.issubdtype(v.dtype, np.number):
+             _, mapped_vals = np.unique(v, return_inverse=True)
+             metadata_dict[k] = mapped_vals
+
+     data = np.stack(list(metadata_dict.values()), axis=-1)
+     names = list(metadata_dict.keys())
+     is_categorical = [infer_categorical(metadata_dict[var], 0.25)[0] for var in names]
+
+     return data, names, is_categorical


  def minimum_spanning_tree(X: np.ndarray) -> Any:
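The helpers above feed the new balance and diversity metrics: preprocess_metadata flattens per-image metadata dicts (plus class labels) into a numeric factor matrix, and entropy then scores each factor. A rough sketch of how they compose, using the internal import path shown in this diff and made-up metadata values:

    from dataeval._internal.metrics.utils import entropy, preprocess_metadata

    class_labels = [0, 1, 0, 1, 1, 0]
    metadata = [
        {"weather": "sunny", "altitude": 100.0},
        {"weather": "rainy", "altitude": 150.0},
        {"weather": "sunny", "altitude": 120.0},
        {"weather": "sunny", "altitude": 90.0},
        {"weather": "rainy", "altitude": 160.0},
        {"weather": "sunny", "altitude": 110.0},
    ]

    # factor matrix, factor names, and per-factor categorical flags
    data, names, is_categorical = preprocess_metadata(class_labels, metadata)

    # normalized entropy per factor (class_label, weather, altitude)
    ent = entropy(data, names, is_categorical, normalized=True)
    print(dict(zip(names, ent)))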
@@ -89,7 +254,7 @@ def compute_neighbors(

      See Also
      --------
-     :func:`sklearn.neighbors.NearestNeighbors`
+     sklearn.neighbors.NearestNeighbors
      """

      nbrs = NearestNeighbors(n_neighbors=k + 1, algorithm=algorithm).fit(B)
@@ -156,3 +321,73 @@ def edge_filter(image: np.ndarray, offset: float = 0.5) -> np.ndarray:
      edges = convolve2d(image, EDGE_KERNEL, mode="same", boundary="symm") + offset
      np.clip(edges, 0, 255, edges)
      return edges
+
+
+ def pchash(image: np.ndarray) -> str:
+     """
+     Performs a perceptual hash on an image by resizing to a square NxN image
+     using the Lanczos algorithm where N is 32x32 or the largest multiple of
+     8 that is smaller than the input image dimensions. The resampled image
+     is compressed using a discrete cosine transform and the lowest frequency
+     component is encoded as a bit array of greater or less than median value
+     and returned as a hex string.
+
+     Parameters
+     ----------
+     image : np.ndarray
+         An image as a numpy array in CxHxW format
+
+     Returns
+     -------
+     str
+         The hex string hash of the image using perceptual hashing
+     """
+     # Verify that the image is at least larger than an 8x8 image
+     min_dim = min(image.shape[-2:])
+     if min_dim < HASH_SIZE + 1:
+         raise ValueError(f"Image must be larger than {HASH_SIZE}x{HASH_SIZE} for fuzzy hashing.")
+
+     # Calculates the dimensions of the resized square image
+     resize_dim = HASH_SIZE * min((min_dim - 1) // HASH_SIZE, MAX_FACTOR)
+
+     # Normalizes the image to CxHxW and takes the mean over all the channels
+     normalized = np.mean(normalize_image_shape(image), axis=0).squeeze()
+
+     # Rescales the pixel values to an 8-bit 0-255 image
+     rescaled = rescale(normalized, 8).astype(np.uint8)
+
+     # Resizes the image using the Lanczos algorithm to a square image
+     im = np.array(Image.fromarray(rescaled).resize((resize_dim, resize_dim), Image.Resampling.LANCZOS))
+
+     # Performs discrete cosine transforms to compress the image information and takes the lowest frequency component
+     transform = dct(dct(im.T).T)[:HASH_SIZE, :HASH_SIZE]
+
+     # Encodes the transform as a bit array over the median value
+     diff = transform > np.median(transform)
+
+     # Pads the front of the bit array to a multiple of 8 with False
+     padded = np.full(int(np.ceil(diff.size / 8) * 8), False)
+     padded[-diff.size :] = diff.ravel()
+
+     # Converts the bit array to a hex string and strips leading 0s
+     hash_hex = np.packbits(padded).tobytes().hex().lstrip("0")
+     return hash_hex if hash_hex else "0"
+
+
+ def xxhash(image: np.ndarray) -> str:
+     """
+     Performs a fast non-cryptographic hash using the xxhash algorithm
+     (xxhash.com) against the image as a flattened bytearray. The hash
+     is returned as a hex string.
+
+     Parameters
+     ----------
+     image : np.ndarray
+         An image as a numpy array
+
+     Returns
+     -------
+     str
+         The hex string hash of the image using the xxHash algorithm
+     """
+     return xxh.xxh3_64_hexdigest(image.ravel().tobytes())
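pchash and xxhash, previously in dataeval/_internal/metrics/hash.py, now live alongside the other image utilities. A small illustration of calling them directly (internal API; the random image is a placeholder):

    import numpy as np
    from dataeval._internal.metrics.utils import pchash, xxhash

    image = (np.random.random((3, 64, 64)) * 255).astype(np.uint8)  # CxHxW

    exact_hash = xxhash(image)    # fast content hash of the raw bytes
    perceptual = pchash(image)    # DCT-based hash; similar images hash alike
    print(exact_hash, perceptual)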
dataeval/_internal/utils.py
@@ -0,0 +1,64 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List
+
+ from torch.utils.data import Dataset
+
+
+ def read_dataset(dataset: Dataset) -> List[List[Any]]:
+     """
+     Extract information from a dataset at each index into a individual lists of each information position
+
+     Parameters
+     ----------
+     dataset : torch.utils.data.Dataset
+         Input dataset
+
+     Returns
+     -------
+     List[List[Any]]
+         All objects in individual lists based on return position from dataset
+
+     Warning
+     -------
+     No type checking is done between lists or data inside lists
+
+     See Also
+     --------
+     torch.utils.data.Dataset
+
+     Examples
+     --------
+     >>> import numpy as np
+
+     >>> data = np.ones((10, 3, 3))
+     >>> labels = np.ones((10,))
+     >>> class ICDataset:
+     ...     def __init__(self, data, labels):
+     ...         self.data = data
+     ...         self.labels = labels
+
+     ...     def __getitem__(self, idx):
+     ...         return self.data[idx], self.labels[idx]
+
+     >>> ds = ICDataset(data, labels)
+
+     >>> result = read_dataset(ds)
+     >>> assert len(result) == 2
+     True
+     >>> assert result[0].shape == (10, 3, 3)  # 10 3x3 images
+     True
+     >>> assert result[1].shape == (10,)  # 10 labels
+     True
+     """
+
+     ddict: Dict[int, List] = defaultdict(list)
+
+     for data in dataset:
+         # Convert to tuple if single return (e.g. images only)
+         if not isinstance(data, tuple):
+             data = (data,)
+
+         for i, d in enumerate(data):
+             ddict[i].append(d)
+
+     return list(ddict.values())
File without changes
dataeval/metrics/__init__.py
@@ -1,7 +1,27 @@
- from dataeval._internal.metrics.ber import BER
- from dataeval._internal.metrics.divergence import Divergence
- from dataeval._internal.metrics.parity import Parity
+ from typing import List
+
+ __all__: List[str] = []
+
+ from dataeval._internal.metrics.balance import balance, balance_classwise
+ from dataeval._internal.metrics.ber import ber
+ from dataeval._internal.metrics.coverage import coverage
+ from dataeval._internal.metrics.divergence import divergence
+ from dataeval._internal.metrics.diversity import diversity, diversity_classwise
+ from dataeval._internal.metrics.parity import parity, parity_metadata
  from dataeval._internal.metrics.stats import ChannelStats, ImageStats
- from dataeval._internal.metrics.uap import UAP
+ from dataeval._internal.metrics.uap import uap

- __all__ = ["BER", "Divergence", "Parity", "UAP", "ChannelStats", "ImageStats"]
+ __all__ += [
+     "balance",
+     "balance_classwise",
+     "ber",
+     "coverage",
+     "divergence",
+     "diversity",
+     "diversity_classwise",
+     "parity",
+     "parity_metadata",
+     "ChannelStats",
+     "ImageStats",
+     "uap",
+ ]
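The public metrics surface shifts from classes to functions: BER, Divergence, Parity, and UAP give way to lowercase ber, divergence, parity, and uap, with new balance, coverage, and diversity entries alongside. A before/after sketch of the calling convention, using the uap change shown earlier in this diff:

    # 0.61.0 (class-based)
    # from dataeval.metrics import UAP
    # UAP(labels, scores).evaluate()   # -> {"uap": ...}

    # 0.64.0 (functional)
    from dataeval.metrics import uap
    result = uap(labels, scores)       # -> UAPOutput(uap=...)

The other exported functions presumably follow the same class-to-function shift, though their signatures are not shown in this diff.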
dataeval/utils/__init__.py
@@ -0,0 +1,9 @@
+ from importlib.util import find_spec
+ from typing import List
+
+ __all__: List[str] = []
+
+ if find_spec("torch") is not None:  # pragma: no cover
+     from dataeval._internal.utils import read_dataset
+
+     __all__ += ["read_dataset"]
{dataeval-0.61.0.dist-info → dataeval-0.64.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.61.0
+ Version: 0.64.0
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
@@ -23,7 +23,6 @@ Provides-Extra: all
  Provides-Extra: tensorflow
  Provides-Extra: torch
  Requires-Dist: hdbscan (>=0.8.36)
- Requires-Dist: maite
  Requires-Dist: matplotlib ; extra == "torch" or extra == "all"
  Requires-Dist: numpy (>1.24.3)
  Requires-Dist: nvidia-cudnn-cu11 (>=8.6.0.163) ; extra == "tensorflow" or extra == "torch" or extra == "all"