dataeval 0.73.0__tar.gz → 0.73.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {dataeval-0.73.0 → dataeval-0.73.1}/PKG-INFO +1 -1
  2. {dataeval-0.73.0 → dataeval-0.73.1}/pyproject.toml +6 -4
  3. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/__init__.py +3 -3
  4. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/__init__.py +1 -1
  5. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/__init__.py +1 -1
  6. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/base.py +2 -2
  7. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/clusterer.py +1 -1
  8. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/__init__.py +1 -1
  9. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/balance.py +29 -19
  10. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/coverage.py +11 -11
  11. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/diversity.py +79 -50
  12. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/metadata.py +133 -51
  13. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/parity.py +30 -24
  14. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/__init__.py +2 -2
  15. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/shared.py +1 -1
  16. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/split_dataset.py +12 -6
  17. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/datasets.py +2 -2
  18. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/workflows/__init__.py +1 -1
  19. {dataeval-0.73.0 → dataeval-0.73.1}/LICENSE.txt +0 -0
  20. {dataeval-0.73.0 → dataeval-0.73.1}/README.md +0 -0
  21. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/cvm.py +0 -0
  22. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/ks.py +0 -0
  23. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/mmd.py +0 -0
  24. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/torch.py +0 -0
  25. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/uncertainty.py +0 -0
  26. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/drift/updates.py +0 -0
  27. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/__init__.py +0 -0
  28. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/duplicates.py +0 -0
  29. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/merged_stats.py +0 -0
  30. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/linters/outliers.py +0 -0
  31. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/ae.py +0 -0
  32. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/aegmm.py +0 -0
  33. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/base.py +0 -0
  34. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/llr.py +0 -0
  35. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_ks_compare.py +0 -0
  36. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_least_likely.py +0 -0
  37. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/metadata_ood_mi.py +0 -0
  38. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/vae.py +0 -0
  39. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/detectors/ood/vaegmm.py +0 -0
  40. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/interop.py +0 -0
  41. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/__init__.py +0 -0
  42. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/bias/__init__.py +0 -0
  43. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/__init__.py +0 -0
  44. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/ber.py +0 -0
  45. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/divergence.py +0 -0
  46. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/estimators/uap.py +0 -0
  47. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/__init__.py +0 -0
  48. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/base.py +0 -0
  49. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/boxratiostats.py +0 -0
  50. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/datasetstats.py +0 -0
  51. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/dimensionstats.py +0 -0
  52. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/hashstats.py +0 -0
  53. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/labelstats.py +0 -0
  54. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/pixelstats.py +0 -0
  55. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/metrics/stats/visualstats.py +0 -0
  56. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/output.py +0 -0
  57. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/py.typed +0 -0
  58. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/image.py +0 -0
  59. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/lazy.py +0 -0
  60. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/metadata.py +0 -0
  61. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/__init__.py +0 -0
  62. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/gmm.py +0 -0
  63. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/loss.py +0 -0
  64. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/models.py +0 -0
  65. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/trainer.py +0 -0
  66. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/_internal/utils.py +0 -0
  67. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/tensorflow/loss/__init__.py +0 -0
  68. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/__init__.py +0 -0
  69. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/blocks.py +0 -0
  70. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/models.py +0 -0
  71. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/trainer.py +0 -0
  72. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/utils/torch/utils.py +0 -0
  73. {dataeval-0.73.0 → dataeval-0.73.1}/src/dataeval/workflows/sufficiency.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.73.0
+ Version: 0.73.1
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT

pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "dataeval"
- version = "0.73.0" # dynamic
+ version = "0.73.1" # dynamic
  description = "DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks"
  license = "MIT"
  readme = "README.md"
@@ -69,8 +69,7 @@ all = ["matplotlib", "markupsafe", "tensorflow", "tensorflow_probability", "tf-k
  optional = true

  [tool.poetry.group.dev.dependencies]
- tox = {version = "*"}
- tox-uv = {version = "*"}
+ nox = {version = "*", extras = ["uv"]}
  uv = {version = "*"}
  poetry = {version = "*"}
  poetry-lock-groups-plugin = {version = "*"}
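Note: the dev group above swaps tox and tox-uv for nox with its uv extra. A minimal noxfile.py in the spirit of this change might look like the sketch below; the session name and test command are illustrative assumptions, not taken from this package.

```python
# noxfile.py -- illustrative sketch only; session names and commands are assumptions.
import nox

# nox[uv] pulls in the uv backend, which creates session virtualenvs with uv.
nox.options.default_venv_backend = "uv"


@nox.session
def test(session: nox.Session) -> None:
    """Install the package plus pytest and run the test suite."""
    session.install(".")
    session.install("pytest")
    session.run("pytest")
```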
@@ -122,7 +121,6 @@ files = ["src/dataeval/__init__.py"]
  name = "dataeval"

  [tool.poetry2conda.dependencies]
- nvidia-cudnn-cu11 = { name = "cudnn" }
  tensorflow_probability = { name = "tensorflow-probability" }
  torch = { name = "pytorch" }
  xxhash = { name = "python-xxhash" }
@@ -145,6 +143,9 @@ parallel = true
  exclude_also = [
    "raise NotImplementedError",
    "if TYPE_CHECKING:",
+   "if _IS_TENSORFLOW_AVAILABLE",
+   "if _IS_TORCH_AVAILABLE",
+   "if _IS_TORCHVISION_AVAILABLE",
  ]
  include = ["*/src/dataeval/*"]
  omit = [
@@ -164,6 +165,7 @@ exclude = [
    "*env*",
    "output",
    "_build",
+   ".nox",
    ".tox",
    "prototype",
  ]
src/dataeval/__init__.py
@@ -1,4 +1,4 @@
- __version__ = "0.73.0"
+ __version__ = "0.73.1"

  from importlib.util import find_spec

@@ -12,12 +12,12 @@ from dataeval import detectors, metrics  # noqa: E402

  __all__ = ["detectors", "metrics"]

- if _IS_TORCH_AVAILABLE:  # pragma: no cover
+ if _IS_TORCH_AVAILABLE:
      from dataeval import workflows

      __all__ += ["workflows"]

- if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:  # pragma: no cover
+ if _IS_TENSORFLOW_AVAILABLE or _IS_TORCH_AVAILABLE:
      from dataeval import utils

      __all__ += ["utils"]
src/dataeval/detectors/__init__.py
@@ -7,7 +7,7 @@ from dataeval.detectors import drift, linters

  __all__ = ["drift", "linters"]

- if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+ if _IS_TENSORFLOW_AVAILABLE:
      from dataeval.detectors import ood

      __all__ += ["ood"]

src/dataeval/detectors/drift/__init__.py
@@ -10,7 +10,7 @@ from dataeval.detectors.drift.ks import DriftKS

  __all__ = ["DriftCVM", "DriftKS", "DriftOutput", "updates"]

- if _IS_TORCH_AVAILABLE:  # pragma: no cover
+ if _IS_TORCH_AVAILABLE:
      from dataeval.detectors.drift.mmd import DriftMMD, DriftMMDOutput
      from dataeval.detectors.drift.torch import preprocess_drift
      from dataeval.detectors.drift.uncertainty import DriftUncertainty

src/dataeval/detectors/drift/base.py
@@ -18,7 +18,7 @@ from typing import Any, Callable, Literal, TypeVar
  import numpy as np
  from numpy.typing import ArrayLike, NDArray

- from dataeval.interop import as_numpy, to_numpy
+ from dataeval.interop import as_numpy
  from dataeval.output import OutputMetadata, set_metadata

  R = TypeVar("R")
@@ -196,7 +196,7 @@ class BaseDrift:
          if correction not in ["bonferroni", "fdr"]:
              raise ValueError("`correction` must be `bonferroni` or `fdr`.")

-         self._x_ref = to_numpy(x_ref)
+         self._x_ref = as_numpy(x_ref)
          self.x_ref_preprocessed: bool = x_ref_preprocessed

          # Other attributes

src/dataeval/detectors/linters/clusterer.py
@@ -480,7 +480,7 @@ class Clusterer:
              samples = self.clusters[level][cluster_id].samples
              if len(samples) >= self._min_num_samples_per_cluster:
                  duplicates_std.append(self.clusters[level][cluster_id].dist_std)
-         diag_mask = np.ones_like(self._sqdmat, dtype=bool)
+         diag_mask = np.ones_like(self._sqdmat, dtype=np.bool_)
          np.fill_diagonal(diag_mask, 0)
          diag_mask = np.triu(diag_mask)
src/dataeval/detectors/ood/__init__.py
@@ -4,7 +4,7 @@ Out-of-distribution (OOD)` detectors identify data that is different from the da

  from dataeval import _IS_TENSORFLOW_AVAILABLE

- if _IS_TENSORFLOW_AVAILABLE:  # pragma: no cover
+ if _IS_TENSORFLOW_AVAILABLE:
      from dataeval.detectors.ood.ae import OOD_AE
      from dataeval.detectors.ood.aegmm import OOD_AEGMM
      from dataeval.detectors.ood.base import OODOutput, OODScoreOutput

src/dataeval/metrics/bias/balance.py
@@ -11,7 +11,7 @@ import numpy as np
  from numpy.typing import ArrayLike, NDArray
  from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

- from dataeval.metrics.bias.metadata import entropy, heatmap, preprocess_metadata
+ from dataeval.metrics.bias.metadata import CLASS_LABEL, entropy, heatmap, preprocess_metadata
  from dataeval.output import OutputMetadata, set_metadata

  with contextlib.suppress(ImportError):
@@ -31,9 +31,9 @@ class BalanceOutput(OutputMetadata):
          Estimate of inter/intra-factor mutual information
      classwise : NDArray[np.float64]
          Estimate of mutual information between metadata factors and individual class labels
-     class_list: NDArray
+     class_list : NDArray
          Array of the class labels present in the dataset
-     metadata_names: list[str]
+     metadata_names : list[str]
          Names of each metadata factor
      """
@@ -54,9 +54,9 @@

          Parameters
          ----------
-         row_labels : ArrayLike | None, default None
+         row_labels : ArrayLike or None, default None
              List/Array containing the labels for rows in the histogram
-         col_labels : ArrayLike | None, default None
+         col_labels : ArrayLike or None, default None
              List/Array containing the labels for columns in the histogram
          plot_classwise : bool, default False
              Whether to plot per-class balance instead of global balance
@@ -116,19 +116,29 @@ def validate_num_neighbors(num_neighbors: int) -> int:


  @set_metadata("dataeval.metrics")
- def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neighbors: int = 5) -> BalanceOutput:
+ def balance(
+     class_labels: ArrayLike,
+     metadata: Mapping[str, ArrayLike],
+     num_neighbors: int = 5,
+     continuous_factor_bincounts: Mapping[str, int] | None = None,
+ ) -> BalanceOutput:
      """
      Mutual information (MI) between factors (class label, metadata, label/image properties)

      Parameters
      ----------
-     class_labels: ArrayLike
+     class_labels : ArrayLike
          List of class labels for each image
-     metadata: Mapping[str, ArrayLike]
+     metadata : Mapping[str, ArrayLike]
          Dict of lists of metadata factors for each image
-     num_neighbors: int, default 5
+     num_neighbors : int, default 5
          Number of nearest neighbors to use for computing MI between discrete
          and continuous variables.
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
+         The factors in metadata that have continuous values and the array of bin counts to
+         discretize values into. All factors are treated as having discrete values unless they
+         are specified as keys in this dictionary. Each element of this array must occur as a key
+         in metadata.

      Returns
      -------
@@ -148,7 +158,7 @@ def balance(class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], num_neig
      -------
      Return balance (mutual information) of factors with class_labels

-     >>> bal = balance(class_labels, metadata)
+     >>> bal = balance(class_labels, metadata, continuous_factor_bincounts=continuous_factor_bincounts)
      >>> bal.balance
      array([0.99999822, 0.13363788, 0.04505382, 0.02994455])

@@ -165,6 +175,7 @@
      array([[0.99999822, 0.13363788, 0.        , 0.        ],
             [0.99999822, 0.13363788, 0.        , 0.        ]])

+
      See Also
      --------
      sklearn.feature_selection.mutual_info_classif
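Taken together, the new signature lets callers opt individual factors into discretization. A hypothetical end-to-end call (the factor names and values below are invented for illustration):

```python
import numpy as np

from dataeval.metrics.bias import balance

# Invented example: 6 images, one discrete factor and one continuous factor.
class_labels = np.array([0, 0, 1, 1, 0, 1])
metadata = {
    "sensor": [0, 1, 1, 0, 1, 0],                # discrete, left as-is
    "altitude": [1.2, 0.4, 3.1, 2.8, 0.9, 2.2],  # continuous, binned below
}

# Only "altitude" is discretized (into 5 bins); "sensor" stays discrete.
bal = balance(
    class_labels,
    metadata,
    num_neighbors=5,
    continuous_factor_bincounts={"altitude": 5},
)
print(bal.balance)    # MI of the class label with each factor
print(bal.classwise)  # per-class MI estimates
```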
@@ -178,9 +189,9 @@
      mi[:] = np.nan

      for idx in range(num_factors):
-         tgt = data[:, idx].astype(int)
+         tgt = data[:, idx].astype(np.intp)

-         if is_categorical[idx]:
+         if continuous_factor_bincounts and names[idx] not in continuous_factor_bincounts:
              mi[idx, :] = mutual_info_classif(
                  data,
                  tgt,
@@ -197,7 +208,7 @@
                  random_state=0,
              )

-     ent_all = entropy(data, names, is_categorical, normalized=False)
+     ent_all = entropy(data, names, continuous_factor_bincounts, normalized=False)
      norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
      # in principle MI should be symmetric, but it is not in practice.
      nmi = 0.5 * (mi + mi.T) / norm_factor
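In formula form, the three lines above symmetrize the MI matrix and normalize it by the arithmetic mean of the factor entropies, with a small ε guarding against division by zero:

```latex
\mathrm{NMI}_{ij} = \frac{\tfrac{1}{2}\left(\mathrm{MI}_{ij} + \mathrm{MI}_{ji}\right)}{\tfrac{1}{2}\left(H_i + H_j\right) + \varepsilon}, \qquad \varepsilon = 10^{-6}
```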
@@ -205,7 +216,7 @@
      factors = nmi[1:, 1:]

      # unique class labels
-     class_idx = names.index("class_label")
+     class_idx = names.index(CLASS_LABEL)
      u_cls = np.unique(data[:, class_idx])
      num_classes = len(u_cls)

@@ -214,12 +225,11 @@
      classwise_mi[:] = np.nan

      # categorical variables, excluding class label
-     cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
+     cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(np.intp)

-     tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(int)
-     ent_tgt_bin = entropy(
-         tgt_bin, names=[str(idx) for idx in range(num_classes)], is_categorical=[True for idx in range(num_classes)]
-     )
+     tgt_bin = np.stack([data[:, class_idx] == cls for cls in u_cls]).T.astype(np.intp)
+     names = [str(idx) for idx in range(num_classes)]
+     ent_tgt_bin = entropy(tgt_bin, names, continuous_factor_bincounts)

      # classification MI for discrete/categorical features
      for idx in range(num_classes):
src/dataeval/metrics/bias/coverage.py
@@ -5,7 +5,7 @@ __all__ = ["CoverageOutput", "coverage"]
  import contextlib
  import math
  from dataclasses import dataclass
- from typing import Any, Literal
+ from typing import Literal

  import numpy as np
  from numpy.typing import ArrayLike, NDArray
@@ -27,9 +27,9 @@

      Attributes
      ----------
-     indices : NDArray
+     indices : NDArray[np.intp]
          Array of uncovered indices
-     radii : NDArray
+     radii : NDArray[np.float64]
          Array of critical value radii
      critical_value : float
          Radius for :term:`coverage<Coverage>`
@@ -39,11 +39,7 @@
      radii: NDArray[np.float64]
      critical_value: float

-     def plot(
-         self,
-         images: NDArray[Any],
-         top_k: int = 6,
-     ) -> Figure:
+     def plot(self, images: ArrayLike, top_k: int = 6) -> Figure:
          """
          Plot the top k images together for visualization

@@ -53,6 +49,10 @@
              Original images (not embeddings) in (N, C, H, W) or (N, H, W) format
          top_k : int, default 6
              Number of images to plot (plotting assumes groups of 3)
+
+         Returns
+         -------
+         matplotlib.figure.Figure
          """
          # Determine which images to plot
          highest_uncovered_indices = self.indices[:top_k]
@@ -82,12 +82,12 @@ def coverage(
      embeddings : ArrayLike, shape - (N, P)
          A dataset in an ArrayLike format.
          Function expects the data to have 2 dimensions, N number of observations in a P-dimesionial space.
-     radius_type : Literal["adaptive", "naive"], default "adaptive"
+     radius_type : {"adaptive", "naive"}, default "adaptive"
          The function used to determine radius.
-     k: int, default 20
+     k : int, default 20
          Number of observations required in order to be covered.
          [1] suggests that a minimum of 20-50 samples is necessary.
-     percent: float, default 0.01
+     percent : float, default 0.01
          Percent of observations to be considered uncovered. Only applies to adaptive radius.

      Returns
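For reference, a hypothetical call that matches the documented parameters (the embeddings are invented, and the import path assumes coverage is re-exported from dataeval.metrics.bias):

```python
import numpy as np

from dataeval.metrics.bias import coverage

# Invented embeddings: 500 observations in a 16-dimensional space.
embeddings = np.random.default_rng(0).uniform(size=(500, 16))

cvg = coverage(embeddings, radius_type="adaptive", k=20, percent=0.01)
print(cvg.indices[:6])     # least-covered observations
print(cvg.critical_value)  # radius used to decide coverage

# With matplotlib installed, plot() now accepts any ArrayLike of original images:
# fig = cvg.plot(images, top_k=6)
```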
src/dataeval/metrics/bias/diversity.py
@@ -10,6 +10,7 @@ import numpy as np
  from numpy.typing import ArrayLike, NDArray

  from dataeval.metrics.bias.metadata import (
+     CLASS_LABEL,
      diversity_bar_plot,
      entropy,
      get_counts,
@@ -35,9 +36,9 @@
          :term:`Diversity` index for classes and factors
      classwise : NDArray[np.float64]
          Classwise diversity index [n_class x n_factor]
-     class_list: NDArray[np.int64]
+     class_list : NDArray[np.int64]
          Class labels for each value in the dataset
-     metadata_names: list[str]
+     metadata_names : list[str]
          Names of each metadata factor
      """
@@ -45,12 +46,11 @@
      classwise: NDArray[np.float64]
      class_list: NDArray[Any]
      metadata_names: list[str]
-     method: Literal["shannon", "simpson"]

      def plot(
          self,
-         row_labels: list[Any] | NDArray[Any] | None = None,
-         col_labels: list[Any] | NDArray[Any] | None = None,
+         row_labels: ArrayLike | list[Any] | None = None,
+         col_labels: ArrayLike | list[Any] | None = None,
          plot_classwise: bool = False,
      ) -> Figure:
          """
@@ -58,9 +58,9 @@

          Parameters
          ----------
-         row_labels : ArrayLike | None, default None
+         row_labels : ArrayLike or None, default None
              List/Array containing the labels for rows in the histogram
-         col_labels : ArrayLike | None, default None
+         col_labels : ArrayLike or None, default None
              List/Array containing the labels for columns in the histogram
          plot_classwise : bool, default False
              Whether to plot per-class balance instead of global balance
@@ -77,7 +77,7 @@
                  col_labels,
                  xlabel="Factors",
                  ylabel="Class",
-                 cbarlabel=f"Normalized {self.method.title()} Index",
+                 cbarlabel=f"Normalized {self.meta()['arguments']['method'].title()} Index",
              )

          else:
@@ -92,7 +92,7 @@
  def diversity_shannon(
      data: NDArray[Any],
      names: list[str],
-     is_categorical: list[bool],
+     continuous_factor_bincounts: Mapping[str, int] | None = None,
      subset_mask: NDArray[np.bool_] | None = None,
  ) -> NDArray[np.float64]:
      """
@@ -106,14 +106,16 @@

      Parameters
      ----------
-     data: NDArray
+     data : NDArray
          Array containing numerical values for metadata factors
-     names: list[str]
+     names : list[str]
          Names of metadata factors -- keys of the metadata dictionary
-     is_categorical: list[bool]
-         List of flags to identify whether variables are categorical (True) or
-         continuous (False)
-     subset_mask: NDArray[np.bool_] | None
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
+         The factors in names that have continuous values and the array of bin counts to
+         discretize values into. All factors are treated as having discrete values unless they
+         are specified as keys in this dictionary. Each element of this array must occur as a key
+         in names.
+     subset_mask : NDArray[np.bool_] or None, default None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

      Note
@@ -122,18 +124,32 @@

      Returns
      -------
-     diversity_index: NDArray
+     diversity_index : NDArray[np.float64]
          Diversity index per column of X

      See Also
      --------
      numpy.histogram
      """
+     hist_cache = {}

      # entropy computed using global auto bins so that we can properly normalize
-     ent_unnormalized = entropy(data, names, is_categorical, normalized=False, subset_mask=subset_mask)
+     ent_unnormalized = entropy(
+         data,
+         names,
+         continuous_factor_bincounts,
+         normalized=False,
+         subset_mask=subset_mask,
+         hist_cache=hist_cache,
+     )
      # normalize by global counts rather than classwise counts
-     num_bins = get_num_bins(data, names, is_categorical=is_categorical, subset_mask=subset_mask)
+     num_bins = get_num_bins(
+         data,
+         names,
+         continuous_factor_bincounts=continuous_factor_bincounts,
+         subset_mask=subset_mask,
+         hist_cache=hist_cache,
+     )
      ent_norm = np.empty(ent_unnormalized.shape)
      ent_norm[num_bins != 1] = ent_unnormalized[num_bins != 1] / np.log(num_bins[num_bins != 1])
      ent_norm[num_bins == 1] = 0
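The new hist_cache dict is threaded through both entropy and get_num_bins so each factor's histogram is computed once per diversity_shannon call rather than once per callee. Reduced to a self-contained sketch (cached_histogram is a hypothetical stand-in, not a dataeval function):

```python
from typing import Any

import numpy as np
from numpy.typing import NDArray


def cached_histogram(
    name: str,
    values: NDArray[Any],
    bins: int,
    hist_cache: dict[str, NDArray[Any]],
) -> NDArray[Any]:
    # Compute each factor's histogram once; later callers reuse the counts.
    if name not in hist_cache:
        hist_cache[name] = np.histogram(values, bins=bins)[0]
    return hist_cache[name]


hist_cache: dict[str, NDArray[Any]] = {}
values = np.random.default_rng(0).normal(size=100)
first = cached_histogram("altitude", values, bins=5, hist_cache=hist_cache)
again = cached_histogram("altitude", values, bins=5, hist_cache=hist_cache)
assert first is again  # the second call is a cache hit
```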
@@ -143,7 +159,7 @@
  def diversity_simpson(
      data: NDArray[Any],
      names: list[str],
-     is_categorical: list[bool],
+     continuous_factor_bincounts: Mapping[str, int] | None = None,
      subset_mask: NDArray[np.bool_] | None = None,
  ) -> NDArray[np.float64]:
      """
@@ -157,14 +173,16 @@

      Parameters
      ----------
-     data: NDArray
+     data : NDArray
          Array containing numerical values for metadata factors
-     names: list[str]
+     names : list[str]
          Names of metadata factors -- keys of the metadata dictionary
-     is_categorical: list[bool]
-         List of flags to identify whether variables are categorical (True) or
-         continuous (False)
-     subset_mask: NDArray[np.bool_] | None
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
+         The factors in names that have continuous values and the array of bin counts to
+         discretize values into. All factors are treated as having discrete values unless they
+         are specified as keys in this dictionary. Each element of this array must occur as a key
+         in names.
+     subset_mask : NDArray[np.bool_] or None, default None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts

      Note
@@ -175,35 +193,39 @@

      Returns
      -------
-     NDArray
+     diversity_index : NDArray[np.float64]
          Diversity index per column of X

      See Also
      --------
      numpy.histogram
      """
+     hist_cache = {}

-     hist_counts, _ = get_counts(data, names, is_categorical, subset_mask)
+     hist_counts = get_counts(data, names, continuous_factor_bincounts, subset_mask, hist_cache=hist_cache)
      # normalize by global counts, not classwise counts
-     num_bins = get_num_bins(data, names, is_categorical)
+     num_bins = get_num_bins(data, names, continuous_factor_bincounts, hist_cache=hist_cache)

      ev_index = np.empty(len(names))
      # loop over columns for convenience
      for col, cnts in enumerate(hist_counts.values()):
          # relative frequencies
-         p_i = cnts / cnts.sum()
+         p_i = cnts / np.sum(cnts)
          # inverse Simpson index normalized by (number of bins)
-         s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+         s_0 = 1 / np.sum(p_i**2)  # / num_bins[col]
          if num_bins[col] == 1:
              ev_index[col] = 0
          else:
-             ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
+             ev_index[col] = (s_0 - 1) / (num_bins[col] - 1)
      return ev_index


  @set_metadata()
  def diversity(
-     class_labels: ArrayLike, metadata: Mapping[str, ArrayLike], method: Literal["shannon", "simpson"] = "simpson"
+     class_labels: ArrayLike,
+     metadata: Mapping[str, ArrayLike],
+     continuous_factor_bincounts: Mapping[str, int] | None = None,
+     method: Literal["simpson", "shannon"] = "simpson",
  ) -> DiversityOutput:
      """
      Compute :term:`diversity<Diversity>` and classwise diversity for discrete/categorical variables and,
@@ -216,11 +238,16 @@

      Parameters
      ----------
-     class_labels: ArrayLike
+     class_labels : ArrayLike
          List of class labels for each image
-     metadata: Mapping[str, ArrayLike]
+     metadata : Mapping[str, ArrayLike]
          Dict of list of metadata factors for each image
-     method: Literal["shannon", "simpson"], default "simpson"
+     continuous_factor_bincounts : Mapping[str, int] or None, default None
+         The factors in metadata that have continuous values and the array of bin counts to
+         discretize values into. All factors are treated as having discrete values unless they
+         are specified as keys in this dictionary. Each element of this array must occur as a key
+         in metadata.
+     method : {"simpson", "shannon"}, default "simpson"
          Indicates which diversity index should be computed

      Note
@@ -239,40 +266,42 @@
      -------
      Compute Simpson diversity index of metadata and class labels

-     >>> div_simp = diversity(class_labels, metadata, method="simpson")
+     >>> div_simp = diversity(class_labels, metadata, continuous_factor_bincounts, method="simpson")
      >>> div_simp.diversity_index
-     array([0.18103448, 0.18103448, 0.88636364])
+     array([0.72413793, 0.72413793, 0.88636364])

      >>> div_simp.classwise
-     array([[0.17241379, 0.39473684],
-            [0.2       , 0.2       ]])
+     array([[0.68965517, 0.69230769],
+            [0.8       , 1.        ]])

      Compute Shannon diversity index of metadata and class labels

-     >>> div_shan = diversity(class_labels, metadata, method="shannon")
+     >>> div_shan = diversity(class_labels, metadata, continuous_factor_bincounts, method="shannon")
      >>> div_shan.diversity_index
-     array([0.37955133, 0.37955133, 0.96748876])
+     array([0.8812909 , 0.8812909 , 0.96748876])

      >>> div_shan.classwise
-     array([[0.43156028, 0.83224889],
-            [0.57938016, 0.57938016]])
+     array([[0.86312057, 0.91651644],
+            [0.91829583, 1.        ]])

      See Also
      --------
      numpy.histogram
      """
      diversity_fn = get_method({"simpson": diversity_simpson, "shannon": diversity_shannon}, method)
-     data, names, is_categorical, unique_labels = preprocess_metadata(class_labels, metadata)
-     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+     data, names, _, unique_labels = preprocess_metadata(class_labels, metadata)
+     diversity_index = diversity_fn(data, names, continuous_factor_bincounts)
+
+     class_idx = names.index(CLASS_LABEL)
+     class_lbl = data[:, class_idx]

-     class_idx = names.index("class_label")
-     u_classes = np.unique(data[:, class_idx])
+     u_classes = np.unique(class_lbl)
      num_factors = len(names)
      diversity = np.empty((len(u_classes), num_factors))
      diversity[:] = np.nan
      for idx, cls in enumerate(u_classes):
-         subset_mask = data[:, class_idx] == cls
-         diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
+         subset_mask = class_lbl == cls
+         diversity[idx, :] = diversity_fn(data, names, continuous_factor_bincounts, subset_mask)
      div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)

-     return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()), method)
+     return DiversityOutput(diversity_index, div_no_class, unique_labels, list(metadata.keys()))
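A closing note on the evenness rewrite in diversity_simpson: with B bins and relative frequencies p_i, the old and new expressions are algebraically identical, because the old s_0 folded a 1/B factor in only to multiply it back out:

```latex
\frac{s_0^{\text{old}} B - 1}{B - 1}
= \frac{\frac{1}{B \sum_i p_i^2} \, B - 1}{B - 1}
= \frac{\frac{1}{\sum_i p_i^2} - 1}{B - 1}
= \frac{s_0^{\text{new}} - 1}{B - 1}
```

The changed doctest values above therefore come from the new continuous_factor_bincounts binning, not from this simplification.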