dataeval 0.76.0__py3-none-any.whl → 0.81.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. dataeval/__init__.py +3 -3
  2. dataeval/{output.py → _output.py} +14 -0
  3. dataeval/config.py +77 -0
  4. dataeval/detectors/__init__.py +1 -1
  5. dataeval/detectors/drift/__init__.py +6 -6
  6. dataeval/detectors/drift/{base.py → _base.py} +41 -30
  7. dataeval/detectors/drift/{cvm.py → _cvm.py} +21 -28
  8. dataeval/detectors/drift/{ks.py → _ks.py} +20 -26
  9. dataeval/detectors/drift/{mmd.py → _mmd.py} +33 -19
  10. dataeval/detectors/drift/{torch.py → _torch.py} +2 -1
  11. dataeval/detectors/drift/{uncertainty.py → _uncertainty.py} +23 -7
  12. dataeval/detectors/drift/updates.py +1 -1
  13. dataeval/detectors/linters/__init__.py +0 -3
  14. dataeval/detectors/linters/duplicates.py +17 -8
  15. dataeval/detectors/linters/outliers.py +52 -43
  16. dataeval/detectors/ood/ae.py +29 -8
  17. dataeval/detectors/ood/base.py +5 -4
  18. dataeval/detectors/ood/metadata_ks_compare.py +1 -1
  19. dataeval/detectors/ood/mixin.py +20 -5
  20. dataeval/detectors/ood/output.py +1 -1
  21. dataeval/detectors/ood/vae.py +73 -0
  22. dataeval/metadata/__init__.py +5 -0
  23. dataeval/metadata/_ood.py +238 -0
  24. dataeval/metrics/__init__.py +1 -1
  25. dataeval/metrics/bias/__init__.py +5 -4
  26. dataeval/metrics/bias/{balance.py → _balance.py} +67 -17
  27. dataeval/metrics/bias/{coverage.py → _coverage.py} +41 -35
  28. dataeval/metrics/bias/{diversity.py → _diversity.py} +17 -12
  29. dataeval/metrics/bias/{parity.py → _parity.py} +89 -63
  30. dataeval/metrics/estimators/__init__.py +14 -4
  31. dataeval/metrics/estimators/{ber.py → _ber.py} +42 -11
  32. dataeval/metrics/estimators/_clusterer.py +104 -0
  33. dataeval/metrics/estimators/{divergence.py → _divergence.py} +18 -13
  34. dataeval/metrics/estimators/{uap.py → _uap.py} +4 -4
  35. dataeval/metrics/stats/__init__.py +7 -7
  36. dataeval/metrics/stats/{base.py → _base.py} +52 -16
  37. dataeval/metrics/stats/{boxratiostats.py → _boxratiostats.py} +6 -9
  38. dataeval/metrics/stats/{datasetstats.py → _datasetstats.py} +10 -14
  39. dataeval/metrics/stats/{dimensionstats.py → _dimensionstats.py} +6 -5
  40. dataeval/metrics/stats/{hashstats.py → _hashstats.py} +6 -6
  41. dataeval/metrics/stats/{labelstats.py → _labelstats.py} +25 -25
  42. dataeval/metrics/stats/{pixelstats.py → _pixelstats.py} +5 -4
  43. dataeval/metrics/stats/{visualstats.py → _visualstats.py} +9 -8
  44. dataeval/typing.py +54 -0
  45. dataeval/utils/__init__.py +2 -2
  46. dataeval/utils/_array.py +169 -0
  47. dataeval/utils/_bin.py +199 -0
  48. dataeval/utils/_clusterer.py +144 -0
  49. dataeval/utils/_fast_mst.py +189 -0
  50. dataeval/utils/{image.py → _image.py} +6 -4
  51. dataeval/utils/_method.py +18 -0
  52. dataeval/utils/{shared.py → _mst.py} +3 -65
  53. dataeval/utils/{plot.py → _plot.py} +4 -4
  54. dataeval/utils/data/__init__.py +22 -0
  55. dataeval/utils/data/_embeddings.py +105 -0
  56. dataeval/utils/data/_images.py +65 -0
  57. dataeval/utils/data/_metadata.py +352 -0
  58. dataeval/utils/data/_selection.py +119 -0
  59. dataeval/utils/{dataset/split.py → data/_split.py} +13 -14
  60. dataeval/utils/data/_targets.py +73 -0
  61. dataeval/utils/data/_types.py +58 -0
  62. dataeval/utils/data/collate.py +103 -0
  63. dataeval/utils/data/datasets/__init__.py +17 -0
  64. dataeval/utils/data/datasets/_base.py +254 -0
  65. dataeval/utils/data/datasets/_cifar10.py +134 -0
  66. dataeval/utils/data/datasets/_fileio.py +168 -0
  67. dataeval/utils/data/datasets/_milco.py +153 -0
  68. dataeval/utils/data/datasets/_mixin.py +56 -0
  69. dataeval/utils/data/datasets/_mnist.py +183 -0
  70. dataeval/utils/data/datasets/_ships.py +123 -0
  71. dataeval/utils/data/datasets/_voc.py +352 -0
  72. dataeval/utils/data/selections/__init__.py +15 -0
  73. dataeval/utils/data/selections/_classfilter.py +60 -0
  74. dataeval/utils/data/selections/_indices.py +26 -0
  75. dataeval/utils/data/selections/_limit.py +26 -0
  76. dataeval/utils/data/selections/_reverse.py +18 -0
  77. dataeval/utils/data/selections/_shuffle.py +29 -0
  78. dataeval/utils/metadata.py +198 -376
  79. dataeval/utils/torch/{gmm.py → _gmm.py} +4 -2
  80. dataeval/utils/torch/{internal.py → _internal.py} +21 -51
  81. dataeval/utils/torch/models.py +43 -2
  82. dataeval/workflows/sufficiency.py +10 -9
  83. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/METADATA +44 -15
  84. dataeval-0.81.0.dist-info/RECORD +94 -0
  85. dataeval/detectors/linters/clusterer.py +0 -512
  86. dataeval/detectors/linters/merged_stats.py +0 -49
  87. dataeval/detectors/ood/metadata_least_likely.py +0 -119
  88. dataeval/interop.py +0 -69
  89. dataeval/utils/dataset/__init__.py +0 -7
  90. dataeval/utils/dataset/datasets.py +0 -412
  91. dataeval/utils/dataset/read.py +0 -63
  92. dataeval-0.76.0.dist-info/RECORD +0 -67
  93. /dataeval/{log.py → _log.py} +0 -0
  94. /dataeval/utils/torch/{blocks.py → _blocks.py} +0 -0
  95. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/LICENSE.txt +0 -0
  96. {dataeval-0.76.0.dist-info → dataeval-0.81.0.dist-info}/WHEEL +0 -0
@@ -2,40 +2,86 @@ from __future__ import annotations
2
2
 
3
3
  __all__ = []
4
4
 
5
+ import contextlib
5
6
  import warnings
6
7
  from dataclasses import dataclass
7
8
  from typing import Any, Generic, TypeVar
8
9
 
9
10
  import numpy as np
10
- from numpy.typing import ArrayLike, NDArray
11
+ from numpy.typing import NDArray
11
12
  from scipy.stats import chisquare
12
13
  from scipy.stats.contingency import chi2_contingency, crosstab
13
14
 
14
- from dataeval.interop import as_numpy, to_numpy
15
- from dataeval.output import Output, set_metadata
16
- from dataeval.utils.metadata import Metadata
15
+ from dataeval._output import Output, set_metadata
16
+ from dataeval.typing import ArrayLike
17
+ from dataeval.utils._array import as_numpy
18
+ from dataeval.utils.data import Metadata
19
+
20
+ with contextlib.suppress(ImportError):
21
+ import pandas as pd
17
22
 
18
23
  TData = TypeVar("TData", np.float64, NDArray[np.float64])
19
24
 
20
25
 
21
26
  @dataclass(frozen=True)
22
- class ParityOutput(Generic[TData], Output):
27
+ class BaseParityOutput(Generic[TData], Output):
28
+ score: TData
29
+ p_value: TData
30
+
31
+ def to_dataframe(self) -> pd.DataFrame:
32
+ """
33
+ Exports the parity output results to a pandas DataFrame.
34
+
35
+ Returns
36
+ -------
37
+ pd.DataFrame
38
+ """
39
+ import pandas as pd
40
+
41
+ return pd.DataFrame(
42
+ index=self.factor_names, # type: ignore - list[str] is documented as acceptable index type
43
+ data={
44
+ "score": self.score.round(2),
45
+ "p-value": self.p_value.round(2),
46
+ },
47
+ )
48
+
49
+
50
+ @dataclass(frozen=True)
51
+ class LabelParityOutput(BaseParityOutput[np.float64]):
52
+ """
53
+ Output class for :func:`.label_parity` :term:`bias<Bias>` metrics.
54
+
55
+ Attributes
56
+ ----------
57
+ score : np.float64
58
+ chi-squared score(s) of the test
59
+ p_value : np.float64
60
+ p-value(s) of the test
61
+ """
62
+
63
+
64
+ @dataclass(frozen=True)
65
+ class ParityOutput(BaseParityOutput[NDArray[np.float64]]):
23
66
  """
24
- Output class for :func:`parity` and :func:`label_parity` :term:`bias<Bias>` metrics.
67
+ Output class for :func:`.parity` :term:`bias<Bias>` metrics.
25
68
 
26
69
  Attributes
27
70
  ----------
28
- score : np.float64 | NDArray[np.float64]
71
+ score : NDArray[np.float64]
29
72
  chi-squared score(s) of the test
30
- p_value : np.float64 | NDArray[np.float64]
73
+ p_value : NDArray[np.float64]
31
74
  p-value(s) of the test
32
- metadata_names : list[str] | None
75
+ factor_names : list[str]
33
76
  Names of each metadata factor
77
+ insufficient_data: dict
78
+ Dictionary of metadata factors with less than 5 class occurrences per value
34
79
  """
35
80
 
36
- score: TData
37
- p_value: TData
38
- metadata_names: list[str] | None
81
+ # score: NDArray[np.float64]
82
+ # p_value: NDArray[np.float64]
83
+ factor_names: list[str]
84
+ insufficient_data: dict[str, dict[int, dict[str, int]]]
39
85
 
40
86
 
41
87
  def normalize_expected_dist(expected_dist: NDArray[Any], observed_dist: NDArray[Any]) -> NDArray[Any]:
@@ -109,7 +155,7 @@ def validate_dist(label_dist: NDArray[Any], label_name: str) -> None:
109
155
  raise ValueError(f"No labels found in the {label_name} dataset")
110
156
  if np.any(label_dist < 5):
111
157
  warnings.warn(
112
- f"Labels {np.where(label_dist<5)[0]} in {label_name}"
158
+ f"Labels {np.where(label_dist < 5)[0]} in {label_name}"
113
159
  " dataset have frequencies less than 5. This may lead"
114
160
  " to invalid chi-squared evaluation.",
115
161
  UserWarning,
@@ -121,7 +167,7 @@ def label_parity(
121
167
  expected_labels: ArrayLike,
122
168
  observed_labels: ArrayLike,
123
169
  num_classes: int | None = None,
124
- ) -> ParityOutput[np.float64]:
170
+ ) -> LabelParityOutput:
125
171
  """
126
172
  Calculate the chi-square statistic to assess the :term:`parity<Parity>` \
127
173
  between expected and observed label distributions.
@@ -142,7 +188,7 @@ def label_parity(
142
188
 
143
189
  Returns
144
190
  -------
145
- ParityOutput[np.float64]
191
+ LabelParityOutput
146
192
  chi-squared score and :term:`P-Value` of the test
147
193
 
148
194
  Raises
@@ -171,7 +217,7 @@ def label_parity(
171
217
  >>> expected_labels = rng.choice([0, 1, 2, 3, 4], (100))
172
218
  >>> observed_labels = rng.choice([2, 3, 0, 4, 1], (100))
173
219
  >>> label_parity(expected_labels, observed_labels)
174
- ParityOutput(score=14.007374204742625, p_value=0.0072715574616218, metadata_names=None)
220
+ LabelParityOutput(score=14.007374204742625, p_value=0.0072715574616218)
175
221
  """
176
222
 
177
223
  # Calculate
@@ -179,8 +225,8 @@ def label_parity(
179
225
  num_classes = 0
180
226
 
181
227
  # Calculate the class frequencies associated with the datasets
182
- observed_dist = np.bincount(to_numpy(observed_labels), minlength=num_classes)
183
- expected_dist = np.bincount(to_numpy(expected_labels), minlength=num_classes)
228
+ observed_dist = np.bincount(as_numpy(observed_labels), minlength=num_classes)
229
+ expected_dist = np.bincount(as_numpy(expected_labels), minlength=num_classes)
184
230
 
185
231
  # Validate
186
232
  validate_dist(observed_dist, "observed")
@@ -202,11 +248,11 @@ def label_parity(
202
248
  )
203
249
 
204
250
  cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
205
- return ParityOutput(cs, p, None)
251
+ return LabelParityOutput(cs, p)
206
252
 
207
253
 
208
254
  @set_metadata
209
- def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
255
+ def parity(metadata: Metadata) -> ParityOutput:
210
256
  """
211
257
  Calculate chi-square statistics to assess the linear relationship \
212
258
  between multiple factors and class labels.
@@ -218,7 +264,7 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
218
264
  Parameters
219
265
  ----------
220
266
  metadata : Metadata
221
- Preprocessed metadata from :func:`dataeval.utils.metadata.preprocess`
267
+ Preprocessed metadata
222
268
 
223
269
  Returns
224
270
  -------
@@ -250,24 +296,21 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
250
296
  --------
251
297
  Randomly creating some "continuous" and categorical variables using ``np.random.default_rng``
252
298
 
253
- >>> from dataeval.utils.metadata import preprocess
254
- >>> rng = np.random.default_rng(175)
255
- >>> labels = rng.choice([0, 1, 2], (100))
256
- >>> metadata_dict = [
257
- ... {
258
- ... "age": list(rng.choice([25, 30, 35, 45], (100))),
259
- ... "income": list(rng.choice([50000, 65000, 80000], (100))),
260
- ... "gender": list(rng.choice(["M", "F"], (100))),
261
- ... }
262
- ... ]
263
- >>> continuous_factor_bincounts = {"age": 4, "income": 3}
264
- >>> metadata = preprocess(metadata_dict, labels, continuous_factor_bincounts)
299
+ >>> metadata = generate_random_metadata(
300
+ ... labels=["doctor", "artist", "teacher"],
301
+ ... factors={
302
+ ... "age": [25, 30, 35, 45],
303
+ ... "income": [50000, 65000, 80000],
304
+ ... "gender": ["M", "F"]},
305
+ ... length=100,
306
+ ... random_seed=175)
307
+ >>> metadata.continuous_factor_bins = {"age": 4, "income": 3}
265
308
  >>> parity(metadata)
266
- ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), metadata_names=['age', 'income', 'gender'])
309
+ ParityOutput(score=array([7.35731943, 5.46711299, 0.51506212]), p_value=array([0.28906231, 0.24263543, 0.77295762]), factor_names=['age', 'income', 'gender'], insufficient_data={'age': {3: {'artist': 4}, 4: {'artist': 4, 'teacher': 3}}, 'income': {1: {'artist': 3}}})
267
310
  """ # noqa: E501
268
311
  chi_scores = np.zeros(metadata.discrete_data.shape[1])
269
312
  p_values = np.zeros_like(chi_scores)
270
- not_enough_data = {}
313
+ insufficient_data = {}
271
314
  for i, col_data in enumerate(metadata.discrete_data.T):
272
315
  # Builds a contingency matrix where entry at index (r,c) represents
273
316
  # the frequency of current_factor_name achieving value unique_factor_values[r]
@@ -281,14 +324,14 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
281
324
  current_factor_name = metadata.discrete_factor_names[i]
282
325
  for int_factor, int_class in zip(counts[0], counts[1]):
283
326
  if contingency_matrix[int_factor, int_class] > 0:
284
- factor_category = unique_factor_values[int_factor]
285
- if current_factor_name not in not_enough_data:
286
- not_enough_data[current_factor_name] = {}
287
- if factor_category not in not_enough_data[current_factor_name]:
288
- not_enough_data[current_factor_name][factor_category] = []
289
- not_enough_data[current_factor_name][factor_category].append(
290
- (metadata.class_names[int_class], int(contingency_matrix[int_factor, int_class]))
291
- )
327
+ factor_category = unique_factor_values[int_factor].item()
328
+ if current_factor_name not in insufficient_data:
329
+ insufficient_data[current_factor_name] = {}
330
+ if factor_category not in insufficient_data[current_factor_name]:
331
+ insufficient_data[current_factor_name][factor_category] = {}
332
+ class_name = metadata.class_names[int_class]
333
+ class_count = contingency_matrix[int_factor, int_class].item()
334
+ insufficient_data[current_factor_name][factor_category][class_name] = class_count
292
335
 
293
336
  # This deletes rows containing only zeros,
294
337
  # because scipy.stats.chi2_contingency fails when there are rows containing only zeros.
@@ -301,24 +344,7 @@ def parity(metadata: Metadata) -> ParityOutput[NDArray[np.float64]]:
301
344
  chi_scores[i] = chi2
302
345
  p_values[i] = p
303
346
 
304
- if not_enough_data:
305
- factor_msg = []
306
- for factor, fact_dict in not_enough_data.items():
307
- stacked_msg = []
308
- for key, value in fact_dict.items():
309
- msg = []
310
- for item in value:
311
- msg.append(f"label {item[0]}: {item[1]} occurrences")
312
- flat_msg = "\n\t\t".join(msg)
313
- stacked_msg.append(f"value {key} - {flat_msg}\n\t")
314
- factor_msg.append(factor + " - " + "".join(stacked_msg))
315
-
316
- message = "\n".join(factor_msg)
317
-
318
- warnings.warn(
319
- f"The following factors did not meet the recommended 5 occurrences for each value-label combination. \n\
320
- Recommend rerunning parity after adjusting the following factor-value-label combinations: \n{message}",
321
- UserWarning,
322
- )
347
+ if insufficient_data:
348
+ warnings.warn("Some factors did not meet the recommended 5 occurrences for each value-label combination.")
323
349
 
324
- return ParityOutput(chi_scores, p_values, metadata.discrete_factor_names)
350
+ return ParityOutput(chi_scores, p_values, metadata.discrete_factor_names, insufficient_data)
@@ -2,8 +2,18 @@
2
2
  Estimators calculate performance bounds and the statistical distance between datasets.
3
3
  """
4
4
 
5
- __all__ = ["ber", "divergence", "uap", "BEROutput", "DivergenceOutput", "UAPOutput"]
5
+ __all__ = [
6
+ "ber",
7
+ "clusterer",
8
+ "divergence",
9
+ "uap",
10
+ "BEROutput",
11
+ "ClustererOutput",
12
+ "DivergenceOutput",
13
+ "UAPOutput",
14
+ ]
6
15
 
7
- from dataeval.metrics.estimators.ber import BEROutput, ber
8
- from dataeval.metrics.estimators.divergence import DivergenceOutput, divergence
9
- from dataeval.metrics.estimators.uap import UAPOutput, uap
16
+ from dataeval.metrics.estimators._ber import BEROutput, ber
17
+ from dataeval.metrics.estimators._clusterer import ClustererOutput, clusterer
18
+ from dataeval.metrics.estimators._divergence import DivergenceOutput, divergence
19
+ from dataeval.metrics.estimators._uap import UAPOutput, uap
@@ -16,19 +16,21 @@ from dataclasses import dataclass
16
16
  from typing import Literal
17
17
 
18
18
  import numpy as np
19
- from numpy.typing import ArrayLike, NDArray
19
+ from numpy.typing import NDArray
20
20
  from scipy.sparse import coo_matrix
21
21
  from scipy.stats import mode
22
22
 
23
- from dataeval.interop import as_numpy
24
- from dataeval.output import Output, set_metadata
25
- from dataeval.utils.shared import compute_neighbors, get_classes_counts, get_method, minimum_spanning_tree
23
+ from dataeval._output import Output, set_metadata
24
+ from dataeval.typing import ArrayLike
25
+ from dataeval.utils._array import as_numpy, ensure_embeddings
26
+ from dataeval.utils._method import get_method
27
+ from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
26
28
 
27
29
 
28
30
  @dataclass(frozen=True)
29
31
  class BEROutput(Output):
30
32
  """
31
- Output class for :func:`ber` estimator metric.
33
+ Output class for :func:`.ber` estimator metric.
32
34
 
33
35
  Attributes
34
36
  ----------
@@ -116,18 +118,21 @@ def knn_lowerbound(value: float, classes: int, k: int) -> float:
116
118
  return ((classes - 1) / classes) * (1 - np.sqrt(max(0, 1 - ((classes / (classes - 1)) * value))))
117
119
 
118
120
 
121
+ _BER_FN_MAP = {"KNN": ber_knn, "MST": ber_mst}
122
+
123
+
119
124
  @set_metadata
120
- def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
125
+ def ber(embeddings: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN", "MST"] = "KNN") -> BEROutput:
121
126
  """
122
127
  An estimator for Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` \
123
128
  using FR or KNN test statistic basis.
124
129
 
125
130
  Parameters
126
131
  ----------
127
- images : ArrayLike (N, ... )
128
- Array of images or image :term:`embeddings<Embeddings>`
132
+ embeddings : ArrayLike (N, ... )
133
+ Array of image :term:`embeddings<Embeddings>`
129
134
  labels : ArrayLike (N, 1)
130
- Array of labels for each image or image embedding
135
+ Array of labels for each image
131
136
  k : int, default 1
132
137
  Number of nearest neighbors for KNN estimator -- ignored by MST estimator
133
138
  method : Literal["KNN", "MST"], default "KNN"
@@ -152,8 +157,34 @@ def ber(images: ArrayLike, labels: ArrayLike, k: int = 1, method: Literal["KNN",
152
157
  >>> ber(images, labels)
153
158
  BEROutput(ber=0.04, ber_lower=0.020416847668728033)
154
159
  """
155
- ber_fn = get_method({"KNN": ber_knn, "MST": ber_mst}, method)
156
- X = as_numpy(images)
160
+ ber_fn = get_method(_BER_FN_MAP, method)
161
+ X = ensure_embeddings(embeddings, dtype=np.float64)
157
162
  y = as_numpy(labels)
158
163
  upper, lower = ber_fn(X, y, k)
159
164
  return BEROutput(upper, lower)
165
+
166
+
167
+ def get_classes_counts(labels: NDArray[np.int_]) -> tuple[int, int]:
168
+ """
169
+ Returns the number of classes and the total count from an array of labels
170
+
171
+ Parameters
172
+ ----------
173
+ labels : NDArray
174
+ Numpy labels array
175
+
176
+ Returns
177
+ -------
178
+ Classes and counts
179
+
180
+ Raises
181
+ ------
182
+ ValueError
183
+ If the number of unique classes is less than 2
184
+ """
185
+ classes, counts = np.unique(labels, return_counts=True)
186
+ M = len(classes)
187
+ if M < 2:
188
+ raise ValueError("Label vector contains less than 2 classes!")
189
+ N = int(np.sum(counts))
190
+ return M, N
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = []
4
+
5
+ from dataclasses import dataclass
6
+
7
+ import numpy as np
8
+ from numpy.typing import NDArray
9
+
10
+ from dataeval._output import Output
11
+ from dataeval.typing import ArrayLike
12
+ from dataeval.utils._array import as_numpy
13
+
14
+
15
+ @dataclass(frozen=True)
16
+ class ClustererOutput(Output):
17
+ """
18
+ Output class for :func:`.clusterer`.
19
+
20
+ Attributes
21
+ ----------
22
+ clusters : NDArray[int]
23
+ Assigned clusters
24
+ mst : NDArray[int]
25
+ The minimum spanning tree of the data
26
+ linkage_tree : NDArray[float]
27
+ The linkage array of the data
28
+ condensed_tree : NDArray[float]
29
+ The condensed tree of the data
30
+ membership_strengths : NDArray[float]
31
+ The strength of the data point belonging to the assigned cluster
32
+ """
33
+
34
+ clusters: NDArray[np.int_]
35
+ mst: NDArray[np.double]
36
+ linkage_tree: NDArray[np.double]
37
+ condensed_tree: NDArray[np.double]
38
+ membership_strengths: NDArray[np.double]
39
+
40
+ def find_outliers(self) -> NDArray[np.int_]:
41
+ """
42
+ Retrieves Outliers based on when the sample was added to the cluster
43
+ and how far it was from the cluster when it was added
44
+
45
+ Returns
46
+ -------
47
+ NDArray[int]
48
+ A numpy array of the outlier indices
49
+ """
50
+ return np.nonzero(self.clusters == -1)[0]
51
+
52
+ def find_duplicates(self) -> tuple[list[list[int]], list[list[int]]]:
53
+ """
54
+ Finds duplicate and near duplicate data based on cluster average distance
55
+
56
+ Returns
57
+ -------
58
+ Tuple[List[List[int]], List[List[int]]]
59
+ The exact :term:`duplicates<Duplicates>` and near duplicates as lists of related indices
60
+ """
61
+ # Delay load numba compiled functions
62
+ from dataeval.utils._clusterer import compare_links_to_cluster_std, sorted_union_find
63
+
64
+ exact_indices, near_indices = compare_links_to_cluster_std(self.mst, self.clusters)
65
+ exact_dupes = sorted_union_find(exact_indices)
66
+ near_dupes = sorted_union_find(near_indices)
67
+
68
+ return [[int(ii) for ii in il] for il in exact_dupes], [[int(ii) for ii in il] for il in near_dupes]
69
+
70
+
71
+ def clusterer(data: ArrayLike) -> ClustererOutput:
72
+ """
73
+ Uses hierarchical clustering on the flattened data and returns clustering
74
+ information.
75
+
76
+ Parameters
77
+ ----------
78
+ data : ArrayLike, shape - (N, ...)
79
+ A dataset in an ArrayLike format. Function expects the data to have 2
80
+ or more dimensions which will flatten to (N, P) where N is the number of
81
+ observations in a P-dimensional space.
82
+
83
+ Returns
84
+ -------
85
+ :class:`.ClustererOutput`
86
+
87
+ Note
88
+ ----
89
+ The clusterer works best when the length of the feature dimension, P, is
90
+ less than 500. If flattening a CxHxW image results in a dimension larger
91
+ than 500, then it is recommended to reduce the dimensions.
92
+
93
+ Example
94
+ -------
95
+ >>> clusterer(clusterer_images).clusters
96
+ array([ 2, 0, 0, 0, 0, 0, 4, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
97
+ 4, 2, 0, 0, 1, 2, 0, 1, 3, 0, 3, 3, 4, 0, 0, 3, 0,
98
+ 3, -1, 0, 0, 2, 4, 3, 4, 0, 1, 0, -1, 3, 0, 0, 0])
99
+ """
100
+ # Delay load numba compiled functions
101
+ from dataeval.utils._clusterer import cluster
102
+
103
+ c = cluster(data)
104
+ return ClustererOutput(c.clusters, c.mst, c.linkage_tree, as_numpy(c.condensed_tree), c.membership_strengths)
@@ -11,17 +11,19 @@ from dataclasses import dataclass
11
11
  from typing import Literal
12
12
 
13
13
  import numpy as np
14
- from numpy.typing import ArrayLike, NDArray
14
+ from numpy.typing import NDArray
15
15
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.output import Output, set_metadata
18
- from dataeval.utils.shared import compute_neighbors, get_method, minimum_spanning_tree
16
+ from dataeval._output import Output, set_metadata
17
+ from dataeval.typing import ArrayLike
18
+ from dataeval.utils._array import ensure_embeddings
19
+ from dataeval.utils._method import get_method
20
+ from dataeval.utils._mst import compute_neighbors, minimum_spanning_tree
19
21
 
20
22
 
21
23
  @dataclass(frozen=True)
22
24
  class DivergenceOutput(Output):
23
25
  """
24
- Output class for :func:`divergence` estimator metric.
26
+ Output class for :func:`.divergence` estimator metric.
25
27
 
26
28
  Attributes
27
29
  ----------
@@ -78,18 +80,21 @@ def divergence_fnn(data: NDArray[np.float64], labels: NDArray[np.int_]) -> int:
78
80
  return errors
79
81
 
80
82
 
83
+ _DIVERGENCE_FN_MAP = {"FNN": divergence_fnn, "MST": divergence_mst}
84
+
85
+
81
86
  @set_metadata
82
- def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
87
+ def divergence(emb_a: ArrayLike, emb_b: ArrayLike, method: Literal["FNN", "MST"] = "FNN") -> DivergenceOutput:
83
88
  """
84
89
  Calculates the :term:`divergence` and any errors between the datasets.
85
90
 
86
91
  Parameters
87
92
  ----------
88
- data_a : ArrayLike, shape - (N, P)
89
- A dataset in an ArrayLike format to compare.
93
+ emb_a : ArrayLike, shape - (N, P)
94
+ Image embeddings in an ArrayLike format to compare.
90
95
  Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
91
- data_b : ArrayLike, shape - (N, P)
92
- A dataset in an ArrayLike format to compare.
96
+ emb_b : ArrayLike, shape - (N, P)
97
+ Image embeddings in an ArrayLike format to compare.
93
98
  Function expects the data to have 2 dimensions, N number of observations in a P-dimensional space.
94
99
  method : Literal["MST", "FNN"], default "FNN"
95
100
  Method used to estimate dataset :term:`divergence<Divergence>`
@@ -125,9 +130,9 @@ def divergence(data_a: ArrayLike, data_b: ArrayLike, method: Literal["FNN", "MST
125
130
  >>> divergence(datasetA, datasetB)
126
131
  DivergenceOutput(divergence=0.28, errors=36)
127
132
  """
128
- div_fn = get_method({"FNN": divergence_fnn, "MST": divergence_mst}, method)
129
- a = as_numpy(data_a)
130
- b = as_numpy(data_b)
133
+ div_fn = get_method(_DIVERGENCE_FN_MAP, method)
134
+ a = ensure_embeddings(emb_a, dtype=np.float64)
135
+ b = ensure_embeddings(emb_b, dtype=np.float64)
131
136
  N = a.shape[0]
132
137
  M = b.shape[0]
133
138
 
@@ -10,17 +10,17 @@ __all__ = []
10
10
 
11
11
  from dataclasses import dataclass
12
12
 
13
- from numpy.typing import ArrayLike
14
13
  from sklearn.metrics import average_precision_score
15
14
 
16
- from dataeval.interop import as_numpy
17
- from dataeval.output import Output, set_metadata
15
+ from dataeval._output import Output, set_metadata
16
+ from dataeval.typing import ArrayLike
17
+ from dataeval.utils._array import as_numpy
18
18
 
19
19
 
20
20
  @dataclass(frozen=True)
21
21
  class UAPOutput(Output):
22
22
  """
23
- Output class for :func:`uap` estimator metric.
23
+ Output class for :func:`.uap` estimator metric.
24
24
 
25
25
  Attributes
26
26
  ----------
@@ -21,15 +21,15 @@ __all__ = [
21
21
  "visualstats",
22
22
  ]
23
23
 
24
- from dataeval.metrics.stats.boxratiostats import boxratiostats
25
- from dataeval.metrics.stats.datasetstats import (
24
+ from dataeval.metrics.stats._boxratiostats import boxratiostats
25
+ from dataeval.metrics.stats._datasetstats import (
26
26
  ChannelStatsOutput,
27
27
  DatasetStatsOutput,
28
28
  channelstats,
29
29
  datasetstats,
30
30
  )
31
- from dataeval.metrics.stats.dimensionstats import DimensionStatsOutput, dimensionstats
32
- from dataeval.metrics.stats.hashstats import HashStatsOutput, hashstats
33
- from dataeval.metrics.stats.labelstats import LabelStatsOutput, labelstats
34
- from dataeval.metrics.stats.pixelstats import PixelStatsOutput, pixelstats
35
- from dataeval.metrics.stats.visualstats import VisualStatsOutput, visualstats
31
+ from dataeval.metrics.stats._dimensionstats import DimensionStatsOutput, dimensionstats
32
+ from dataeval.metrics.stats._hashstats import HashStatsOutput, hashstats
33
+ from dataeval.metrics.stats._labelstats import LabelStatsOutput, labelstats
34
+ from dataeval.metrics.stats._pixelstats import PixelStatsOutput, pixelstats
35
+ from dataeval.metrics.stats._visualstats import VisualStatsOutput, visualstats