dataeval 0.66.0__py3-none-any.whl → 0.68.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataeval/__init__.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.66.0"
+ __version__ = "0.68.0"
  
  from importlib.util import find_spec
  
dataeval/_internal/detectors/duplicates.py CHANGED
@@ -1,28 +1,37 @@
  from __future__ import annotations
  
  from dataclasses import dataclass
- from typing import Iterable
+ from typing import Generic, Iterable, Sequence, TypeVar, cast
  
  from numpy.typing import ArrayLike
  
+ from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
  from dataeval._internal.flags import ImageStat
  from dataeval._internal.metrics.stats import StatsOutput, imagestats
  from dataeval._internal.output import OutputMetadata, set_metadata
  
+ DuplicateGroup = list[int]
+ DatasetDuplicateGroupMap = dict[int, DuplicateGroup]
+ TIndexCollection = TypeVar("TIndexCollection", DuplicateGroup, DatasetDuplicateGroupMap)
+ 
  
  @dataclass(frozen=True)
- class DuplicatesOutput(OutputMetadata):
+ class DuplicatesOutput(Generic[TIndexCollection], OutputMetadata):
      """
      Attributes
      ----------
-     exact : List[List[int]]
+     exact : list[list[int] | dict[int, list[int]]]
          Indices of images that are exact matches
-     near: List[List[int]]
+     near: list[list[int] | dict[int, list[int]]]
          Indices of images that are near matches
+ 
+     - For a single dataset, indices are returned as a list of index groups.
+     - For multiple datasets, indices are returned as dictionaries where the key is the
+       index of the dataset, and the value is the list index groups from that dataset.
      """
  
-     exact: list[list[int]]
-     near: list[list[int]]
+     exact: list[TIndexCollection]
+     near: list[TIndexCollection]
  
  
  class Duplicates:
@@ -54,18 +63,18 @@ class Duplicates:
      def _get_duplicates(self) -> dict[str, list[list[int]]]:
          stats_dict = self.stats.dict()
          if "xxhash" in stats_dict:
-             exact = {}
+             exact_dict: dict[int, list] = {}
              for i, value in enumerate(stats_dict["xxhash"]):
-                 exact.setdefault(value, []).append(i)
-             exact = [v for v in exact.values() if len(v) > 1]
+                 exact_dict.setdefault(value, []).append(i)
+             exact = [sorted(v) for v in exact_dict.values() if len(v) > 1]
          else:
              exact = []
  
          if "pchash" in stats_dict and not self.only_exact:
-             near = {}
+             near_dict: dict[int, list] = {}
              for i, value in enumerate(stats_dict["pchash"]):
-                 near.setdefault(value, []).append(i)
-             near = [v for v in near.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
+                 near_dict.setdefault(value, []).append(i)
+             near = [sorted(v) for v in near_dict.values() if len(v) > 1 and not any(set(v).issubset(x) for x in exact)]
          else:
              near = []
  
@@ -75,14 +84,14 @@ class Duplicates:
          }
  
      @set_metadata("dataeval.detectors", ["only_exact"])
-     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> DuplicatesOutput:
+     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> DuplicatesOutput:
          """
          Returns duplicate image indices for both exact matches and near matches
  
          Parameters
          ----------
-         data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput
-             A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
+         data : Iterable[ArrayLike], shape - (N, C, H, W) | StatsOutput | Sequence[StatsOutput]
+             A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
  
          Returns
          -------
@@ -98,12 +107,32 @@ class Duplicates:
          >>> dups.evaluate(images)
          DuplicatesOutput(exact=[[3, 20], [16, 37]], near=[[3, 20, 22], [12, 18], [13, 36], [14, 31], [17, 27], [19, 38, 47]])
          """  # noqa: E501
-         if isinstance(data, StatsOutput):
-             if not data.xxhash:
+ 
+         stats, dataset_steps = combine_stats(data)
+ 
+         if isinstance(stats, StatsOutput):
+             if not stats.xxhash:
                  raise ValueError("StatsOutput must include xxhash information of the images.")
-             if not self.only_exact and not data.pchash:
+             if not self.only_exact and not stats.pchash:
                  raise ValueError("StatsOutput must include pchash information of the images for near matches.")
-             self.stats = data
+             self.stats = stats
          else:
-             self.stats = imagestats(data, ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH))
-         return DuplicatesOutput(**self._get_duplicates())
+             flags = ImageStat.XXHASH | (ImageStat(0) if self.only_exact else ImageStat.PCHASH)
+             self.stats = imagestats(cast(Iterable[ArrayLike], data), flags)
+ 
+         duplicates = self._get_duplicates()
+ 
+         # split up results from combined dataset into individual dataset buckets
+         if dataset_steps:
+             dup_list: list[list[int]]
+             for dup_type, dup_list in duplicates.items():
+                 dup_list_dict = []
+                 for idxs in dup_list:
+                     dup_dict = {}
+                     for idx in idxs:
+                         k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                         dup_dict.setdefault(k, []).append(v)
+                     dup_list_dict.append(dup_dict)
+                 duplicates[dup_type] = dup_list_dict
+ 
+         return DuplicatesOutput(**duplicates)
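
A usage sketch of the new multi-dataset path: hash each dataset separately with imagestats, then pass the sequence of StatsOutput objects to Duplicates.evaluate. The public import paths and the printed grouping are assumptions based on this diff, not verified output:

import numpy as np

from dataeval.detectors.linters import Duplicates  # assumed public re-export
from dataeval.flags import ImageStat
from dataeval.metrics.stats import imagestats  # assumed public re-export

rng = np.random.default_rng(0)
ds_a = rng.integers(0, 256, size=(8, 1, 16, 16), dtype=np.uint8)
# dataset B reuses two images from dataset A, so cross-dataset duplicates exist
ds_b = np.concatenate([ds_a[:2], rng.integers(0, 256, size=(4, 1, 16, 16), dtype=np.uint8)])

stats_a = imagestats(ds_a, ImageStat.XXHASH | ImageStat.PCHASH)
stats_b = imagestats(ds_b, ImageStat.XXHASH | ImageStat.PCHASH)

# With a Sequence[StatsOutput], each duplicate group comes back as a
# {dataset_index: [image_indices]} dict instead of a flat index list.
result = Duplicates().evaluate([stats_a, stats_b])
print(result.exact)  # e.g. [{0: [0], 1: [0]}, {0: [1], 1: [1]}]
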
dataeval/_internal/detectors/merged_stats.py ADDED
@@ -0,0 +1,78 @@
+ from __future__ import annotations
+ 
+ from typing import Sequence, cast
+ from warnings import warn
+ 
+ import numpy as np
+ 
+ from dataeval._internal.metrics.stats import StatsOutput
+ from dataeval._internal.output import populate_defaults
+ 
+ 
+ def add_stats(a: StatsOutput, b: StatsOutput) -> StatsOutput:
+     if not isinstance(a, StatsOutput) or not isinstance(b, StatsOutput):
+         raise TypeError(f"Cannot add object of type {type(a)} and type {type(b)}.")
+ 
+     a_dict = a.dict()
+     b_dict = b.dict()
+     a_keys = set(a_dict)
+     b_keys = set(b_dict)
+ 
+     missing_keys = a_keys - b_keys
+     if missing_keys:
+         raise ValueError(f"Required keys are missing: {missing_keys}.")
+ 
+     extra_keys = b_keys - a_keys
+     if extra_keys:
+         warn(f"Extraneous keys will be dropped: {extra_keys}.")
+ 
+     # perform add of multi-channel stats
+     if "ch_idx_map" in a_dict:
+         for k, v in a_dict.items():
+             if k == "ch_idx_map":
+                 offset = sum([len(idxs) for idxs in v.values()])
+                 for ch_k, ch_v in b_dict[k].items():
+                     if ch_k not in v:
+                         v[ch_k] = []
+                     a_dict[k][ch_k].extend([idx + offset for idx in ch_v])
+             else:
+                 for ch_k in b_dict[k]:
+                     if ch_k not in v:
+                         v[ch_k] = b_dict[k][ch_k]
+                     else:
+                         v[ch_k] = np.concatenate((v[ch_k], b_dict[k][ch_k]), axis=1)
+     else:
+         for k in a_dict:
+             if isinstance(a_dict[k], list):
+                 a_dict[k].extend(b_dict[k])
+             else:
+                 a_dict[k] = np.concatenate((a_dict[k], b_dict[k]))
+ 
+     return StatsOutput(**populate_defaults(a_dict, StatsOutput))
+ 
+ 
+ def combine_stats(stats) -> tuple[StatsOutput | None, list[int]]:
+     dataset_steps = []
+ 
+     if isinstance(stats, StatsOutput):
+         return stats, dataset_steps
+ 
+     output = None
+     if isinstance(stats, Sequence) and isinstance(stats[0], StatsOutput):
+         stats = cast(Sequence[StatsOutput], stats)
+         cur_len = 0
+         for s in stats:
+             output = s if output is None else add_stats(output, s)
+             cur_len += len(s)
+             dataset_steps.append(cur_len)
+ 
+     return output, dataset_steps
+ 
+ 
+ def get_dataset_step_from_idx(idx: int, dataset_steps: list[int]) -> tuple[int, int]:
+     last_step = 0
+     for i, step in enumerate(dataset_steps):
+         if idx < step:
+             return i, idx - last_step
+         last_step = step
+     return -1, idx
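
A worked example of the index bookkeeping above: combine_stats records cumulative dataset lengths in dataset_steps, and get_dataset_step_from_idx maps a flat index in the merged stats back to a (dataset, local index) pair. The lengths below are illustrative:

from dataeval._internal.detectors.merged_stats import get_dataset_step_from_idx

dataset_steps = [8, 14]  # dataset 0 contributed 8 images, dataset 1 contributed 6

assert get_dataset_step_from_idx(3, dataset_steps) == (0, 3)     # inside dataset 0
assert get_dataset_step_from_idx(8, dataset_steps) == (1, 0)     # first image of dataset 1
assert get_dataset_step_from_idx(13, dataset_steps) == (1, 5)    # last image of dataset 1
assert get_dataset_step_from_idx(99, dataset_steps) == (-1, 99)  # past the end
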
dataeval/_internal/detectors/outliers.py CHANGED
@@ -1,27 +1,39 @@
  from __future__ import annotations
  
  from dataclasses import dataclass
- from typing import Iterable, Literal
+ from typing import Iterable, Literal, Sequence, cast
+ from warnings import warn
  
  import numpy as np
  from numpy.typing import ArrayLike, NDArray
  
+ from dataeval._internal.detectors.merged_stats import combine_stats, get_dataset_step_from_idx
  from dataeval._internal.flags import ImageStat, to_distinct, verify_supported
  from dataeval._internal.metrics.stats import StatsOutput, imagestats
  from dataeval._internal.output import OutputMetadata, set_metadata
  
+ IndexIssueMap = dict[int, dict[str, float]]
+ DatasetIndexIssueMap = dict[int, IndexIssueMap]
+ """
+ Mapping of image indices to a dictionary of issue types and calculated values
+ """
+ 
  
  @dataclass(frozen=True)
  class OutliersOutput(OutputMetadata):
      """
      Attributes
      ----------
-     issues : Dict[int, Dict[str, float]]
-         Dictionary containing the indices of outliers and a dictionary showing
-         the issues and calculated values for the given index.
+     issues : dict[int, dict[str, float]] | dict[int, dict[int, dict[str, float]]]
+         Indices of image outliers with their associated issue type and calculated values.
+ 
+         - For a single dataset, a dictionary containing the indices of outliers and
+           a dictionary showing the issues and calculated values for the given index.
+         - For multiple datasets, a map of dataset indices to the indices of outliers
+           and their associated issues and calculated values.
      """
  
-     issues: dict[int, dict[str, float]]
+     issues: IndexIssueMap | DatasetIndexIssueMap
  
  
  def _get_outlier_mask(
@@ -64,7 +76,7 @@ class Outliers:
  
      Attributes
      ----------
-     stats : Dict[str, Any]
+     stats : dict[str, Any]
          Dictionary to hold the value of each metric for each image
  
      See Also
@@ -135,14 +147,14 @@ class Outliers:
          return dict(sorted(flagged_images.items()))
  
      @set_metadata("dataeval.detectors", ["flags", "outlier_method", "outlier_threshold"])
-     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput) -> OutliersOutput:
+     def evaluate(self, data: Iterable[ArrayLike] | StatsOutput | Sequence[StatsOutput]) -> OutliersOutput:
          """
          Returns indices of outliers with the issues identified for each
  
          Parameters
          ----------
-         data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput
-             A dataset of images in an ArrayLike format or the output from an imagestats metric analysis
+         data : Iterable[ArrayLike], shape - (C, H, W) | StatsOutput | Sequence[StatsOutput]
+             A dataset of images in an ArrayLike format or the output(s) from an imagestats metric analysis
  
          Returns
          -------
@@ -157,13 +169,29 @@ class Outliers:
          >>> outliers.evaluate(images)
          OutliersOutput(issues={18: {'brightness': 0.78}, 25: {'brightness': 0.98}})
          """
-         if isinstance(data, StatsOutput):
-             flags = set(to_distinct(self.flags).values())
-             stats = set(data.dict())
-             missing = flags - stats
+         stats, dataset_steps = combine_stats(data)
+ 
+         if isinstance(stats, StatsOutput):
+             selected_flags = set(to_distinct(self.flags).values())
+             provided = set(stats.dict())
+             missing = selected_flags - provided
              if missing:
-                 raise ValueError(f"StatsOutput is missing {missing} from the required stats: {flags}.")
-             self.stats = data
+                 warn(
+                     f"StatsOutput provided {provided} and is missing {missing} \
+                     from the selected stat flags: {selected_flags}."
+                 )
+             self.stats = stats
          else:
-             self.stats = imagestats(data, self.flags)
-         return OutliersOutput(self._get_outliers())
+             self.stats = imagestats(cast(Iterable[ArrayLike], data), self.flags)
+ 
+         outliers = self._get_outliers()
+ 
+         # split up results from combined dataset into individual dataset buckets
+         if dataset_steps:
+             out_dict = {}
+             for idx, issue in outliers.items():
+                 k, v = get_dataset_step_from_idx(idx, dataset_steps)
+                 out_dict.setdefault(k, {})[v] = issue
+             outliers = out_dict
+ 
+         return OutliersOutput(outliers)
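
The same flat-to-per-dataset remapping drives the new OutliersOutput shape. A self-contained sketch of the rebucketing loop in evaluate(), using the helper from the merged_stats module above and made-up issue values:

from dataeval._internal.detectors.merged_stats import get_dataset_step_from_idx

outliers = {3: {"brightness": 0.78}, 9: {"blurriness": 0.51}}
dataset_steps = [8, 14]  # two datasets of 8 and 6 images

out_dict = {}
for idx, issue in outliers.items():
    k, v = get_dataset_step_from_idx(idx, dataset_steps)
    out_dict.setdefault(k, {})[v] = issue

print(out_dict)  # {0: {3: {'brightness': 0.78}}, 1: {1: {'blurriness': 0.51}}}
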
dataeval/_internal/metrics/balance.py CHANGED
@@ -17,11 +17,17 @@ class BalanceOutput(OutputMetadata):
      """
      Attributes
      ----------
-     mutual_information : NDArray[np.float64]
+     balance : NDArray[np.float64]
          Estimate of mutual information between metadata factors and class label
+     factors : NDArray[np.float64]
+         Estimate of inter/intra-factor mutual information
+     classwise : NDArray[np.float64]
+         Estimate of mutual information between metadata factors and individual class labels
      """
  
-     mutual_information: NDArray[np.float64]
+     balance: NDArray[np.float64]
+     factors: NDArray[np.float64]
+     classwise: NDArray[np.float64]
  
  
  def validate_num_neighbors(num_neighbors: int) -> int:
@@ -77,17 +83,22 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      -------
      Return balance (mutual information) of factors with class_labels
  
-     >>> balance(class_labels, metadata).mutual_information[0]
-     array([0.99999822, 0.13363788, 0.        , 0.02994455])
+     >>> bal = balance(class_labels, metadata)
+     >>> bal.balance
+     array([0.99999822, 0.13363788, 0.04505382, 0.02994455])
  
-     Return balance (mutual information) of metadata factors with class_labels
-     and each other
+     Return intra/interfactor balance (mutual information)
  
-     >>> balance(class_labels, metadata).mutual_information
-     array([[0.99999822, 0.13363788, 0.        , 0.02994455],
-            [0.13363788, 0.99999843, 0.01389763, 0.09725766],
-            [0.        , 0.01389763, 0.48549233, 0.15314612],
-            [0.02994455, 0.09725766, 0.15314612, 0.99999856]])
+     >>> bal.factors
+     array([[0.99999843, 0.03510422, 0.09725766],
+            [0.03510422, 0.08433558, 0.15621459],
+            [0.09725766, 0.15621459, 0.99999856]])
+ 
+     Return classwise balance (mutual information) of factors with individual class_labels
+ 
+     >>> bal.classwise
+     array([[0.99999822, 0.13363788, 0.        , 0.        ],
+            [0.99999822, 0.13363788, 0.        , 0.        ]])
  
      See Also
      --------
@@ -102,13 +113,9 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      mi[:] = np.nan
  
      for idx in range(num_factors):
-         tgt = data[:, idx]
+         tgt = data[:, idx].astype(int)
  
          if is_categorical[idx]:
-             if tgt.dtype == float:
-                 # map to unique integers if categorical
-                 _, tgt = np.unique(tgt, return_inverse=True)
-             # categorical target
              mi[idx, :] = mutual_info_classif(
                  data,
                  tgt,
@@ -129,89 +136,40 @@ def balance(class_labels: Sequence[int], metadata: list[dict], num_neighbors: in
      norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
      # in principle MI should be symmetric, but it is not in practice.
      nmi = 0.5 * (mi + mi.T) / norm_factor
+     balance = nmi[0]
+     factors = nmi[1:, 1:]
  
-     return BalanceOutput(nmi)
- 
- 
- @set_metadata("dataeval.metrics")
- def balance_classwise(class_labels: Sequence[int], metadata: list[dict], num_neighbors: int = 5) -> BalanceOutput:
-     """
-     Compute mutual information (analogous to correlation) between metadata factors
-     (class label, metadata, label/image properties) with individual class labels.
- 
-     Parameters
-     ----------
-     class_labels: Sequence[int]
-         List of class labels for each image
-     metadata: List[Dict]
-         List of metadata factors for each image
-     num_neighbors: int, default 5
-         Number of nearest neighbors to use for computing MI between discrete
-         and continuous variables.
- 
-     Notes
-     -----
-     We use `mutual_info_classif` from sklearn since class label is categorical.
-     `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
-     seed. MI is computed differently for categorical and continuous variables, so we
-     have to specify with is_categorical.
- 
-     Returns
-     -------
-     BalanceOutput
-         (num_classes x num_factors) estimate of mutual information between
-         num_factors metadata factors and individual class labels.
- 
-     Example
-     -------
-     Return classwise balance (mutual information) of factors with individual class_labels
- 
-     >>> balance_classwise(class_labels, metadata).mutual_information
-     array([[0.13363788, 0.54085156, 0.        ],
-            [0.13363788, 0.54085156, 0.        ]])
- 
- 
-     See Also
-     --------
-     sklearn.feature_selection.mutual_info_classif
-     sklearn.feature_selection.mutual_info_regression
-     sklearn.metrics.mutual_info_score
-     compute_mutual_information
-     """
-     num_neighbors = validate_num_neighbors(num_neighbors)
-     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
-     num_factors = len(names)
  
      # unique class labels
      class_idx = names.index("class_label")
-     class_data = data[:, class_idx]
+     class_data = data[:, class_idx].astype(int)
      u_cls = np.unique(class_data)
      num_classes = len(u_cls)
  
-     data_no_class = np.concatenate((data[:, :class_idx], data[:, (class_idx + 1) :]), axis=1)
- 
      # assume class is a factor
-     mi = np.empty((num_classes, num_factors - 1))
-     mi[:] = np.nan
+     classwise_mi = np.empty((num_classes, num_factors))
+     classwise_mi[:] = np.nan
  
      # categorical variables, excluding class label
      cat_mask = np.concatenate((is_categorical[:class_idx], is_categorical[(class_idx + 1) :]), axis=0).astype(int)
  
+     tgt_bin = np.stack([class_data == cls for cls in u_cls]).T.astype(int)
+     ent_tgt_bin = entropy(
+         tgt_bin, names=[str(idx) for idx in range(num_classes)], is_categorical=[True for idx in range(num_classes)]
+     )
+ 
      # classification MI for discrete/categorical features
-     for idx, cls in enumerate(u_cls):
-         tgt = class_data == cls
+     for idx in range(num_classes):
+         # tgt = class_data == cls
          # units: nat
-         mi[idx, :] = mutual_info_classif(
-             data_no_class,
-             tgt,
+         classwise_mi[idx, :] = mutual_info_classif(
+             data,
+             tgt_bin[:, idx],
              discrete_features=cat_mask,  # type: ignore
             n_neighbors=num_neighbors,
             random_state=0,
         )
  
-     # let this recompute for all features including class label
-     ent_all = entropy(data, names, is_categorical)
-     ent_tgt = ent_all[class_idx]
-     ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
-     norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
-     nmi = mi / norm_factor
-     return BalanceOutput(nmi)
+     norm_factor = 0.5 * np.add.outer(ent_tgt_bin, ent_all) + 1e-6
+     classwise = classwise_mi / norm_factor
+ 
+     return BalanceOutput(balance, factors, classwise)
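
Under the merged API, balance() computes one normalized mutual-information matrix and slices it into the BalanceOutput fields. A small numpy sketch with illustrative values, where row and column 0 correspond to the class label:

import numpy as np

nmi = np.array(
    [
        [1.00, 0.13, 0.05, 0.03],  # class_label vs. itself and each factor
        [0.13, 1.00, 0.04, 0.10],
        [0.05, 0.04, 0.08, 0.16],
        [0.03, 0.10, 0.16, 1.00],
    ]
)

balance = nmi[0]       # class label vs. every factor: shape (num_factors,)
factors = nmi[1:, 1:]  # factor-to-factor block with the class label removed
print(balance.shape, factors.shape)  # (4,) (3, 3)
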
dataeval/_internal/metrics/coverage.py CHANGED
@@ -66,27 +66,22 @@ def coverage(
  
      Note
      ----
-     Embeddings should be on the unit interval.
+     Embeddings should be on the unit interval [0-1].
  
      Example
      -------
-     >>> coverage(embeddings)
-     CoverageOutput(indices=array([], dtype=int64), radii=array([0.59307666, 0.56956307, 0.56328616, 0.70660265, 0.57778087,
-            0.53738624, 0.58968217, 1.27721334, 0.84378694, 0.67767021,
-            0.69680335, 1.35532621, 0.59764166, 0.8691945 , 0.83627602,
-            0.84187303, 0.62212358, 1.09039732, 0.67956797, 0.60134383,
-            0.83713908, 0.91784263, 1.12901193, 0.73907618, 0.63943983,
-            0.61188447, 0.47872713, 0.57207771, 0.92885883, 0.54750511,
-            0.83015726, 1.20721778, 0.50421928, 0.98312246, 0.59764166,
-            0.61009202, 0.73864073, 1.0381061 , 0.77598609, 0.72984036,
-            0.67573006, 0.48056064, 1.00050879, 0.89532971, 0.58395529,
-            0.95954793, 0.60134383, 1.10096454, 0.51955314, 0.73038702]), critical_value=0)
+     >>> results = coverage(embeddings)
+     >>> results.indices
+     array([447, 412,   8,  32,  63])
+     >>> results.critical_value
+     0.8459038956941765
  
      Reference
      ---------
      This implementation is based on https://dl.acm.org/doi/abs/10.1145/3448016.3457315.
+ 
      [1] Seymour Sudman. 1976. Applied sampling. Academic Press New York (1976).
-     """  # noqa: E501
+     """
  
      # Calculate distance matrix, look at the (k+1)th farthest neighbor for each image.
      embeddings = to_numpy(embeddings)
@@ -105,8 +100,9 @@ def coverage(
          pvals = np.where(crit > rho)[0]
      elif radius_type == "adaptive":
          # Use data adaptive cutoff as rho
-         rho = int(n * percent)
-         pvals = np.argsort(crit)[::-1][:rho]
+         selection = int(max(n * percent, 1))
+         pvals = np.argsort(crit)[::-1][:selection]
+         rho = float(np.mean(np.sort(crit)[::-1][selection - 1 : selection + 1]))
      else:
          raise ValueError(f"{radius_type} is an invalid radius type. Expected 'adaptive' or 'naive'")
      return CoverageOutput(pvals, crit, rho)
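
The reworked "adaptive" branch changes what rho means: instead of reusing the selection count itself, it flags the largest percent of k-NN radii as uncovered and takes rho as the mean of the two sorted radii straddling the cutoff. A self-contained sketch on illustrative radii:

import numpy as np

crit = np.array([0.2, 0.9, 0.4, 0.8, 0.3])  # illustrative k-NN radii
n, percent = len(crit), 0.4

selection = int(max(n * percent, 1))        # at least one index is always flagged
pvals = np.argsort(crit)[::-1][:selection]  # indices of the largest radii
rho = float(np.mean(np.sort(crit)[::-1][selection - 1 : selection + 1]))
print(pvals, rho)  # [1 3] and rho ~ 0.6 (mean of 0.8 and 0.4)
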
dataeval/_internal/metrics/diversity.py CHANGED
@@ -17,9 +17,12 @@ class DiversityOutput(OutputMetadata):
      ----------
      diversity_index : NDArray[np.float64]
          Diversity index for classes and factors
+     classwise : NDArray[np.float64]
+         Classwise diversity index [n_class x n_factor]
      """
  
      diversity_index: NDArray[np.float64]
+     classwise: NDArray[np.float64]
  
  
  def diversity_shannon(
@@ -39,6 +42,13 @@ def diversity_shannon(
  
      Parameters
      ----------
+     data: NDArray
+         Array containing numerical values for metadata factors
+     names: list[str]
+         Names of metadata factors -- keys of the metadata dictionary
+     is_categorical: list[bool]
+         List of flags to identify whether variables are categorical (True) or
+         continuous (False)
      subset_mask: NDArray[np.bool_] | None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
  
@@ -76,14 +86,20 @@ def diversity_simpson(
      Compute diversity for discrete/categorical variables and, through standard
      histogram binning, for continuous variables.
  
-     We define diversity as a normalized form of the inverse Simpson diversity
-     index.
+     We define diversity as the inverse Simpson diversity index linearly rescaled to the unit interval.
  
      diversity = 1 implies that samples are evenly distributed across a particular factor
-     diversity = 1/num_categories implies that all samples belong to one category/bin
+     diversity = 0 implies that all samples belong to one category/bin
  
      Parameters
      ----------
+     data: NDArray
+         Array containing numerical values for metadata factors
+     names: list[str]
+         Names of metadata factors -- keys of the metadata dictionary
+     is_categorical: list[bool]
+         List of flags to identify whether variables are categorical (True) or
+         continuous (False)
      subset_mask: NDArray[np.bool_] | None
          Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
  
@@ -91,10 +107,7 @@ def diversity_simpson(
      -----
      For continuous variables, histogram bins are chosen automatically. See
      numpy.histogram for details.
-     The expression is undefined for q=1, but it approaches the Shannon entropy
-     in the limit.
-     If there is only one category, the diversity index takes a value of 1 =
-     1/N = 1/1. Entropy will take a value of 0.
+     If there is only one category, the diversity index takes a value of 0.
  
      Returns
      -------
@@ -116,8 +129,8 @@ def diversity_simpson(
          # relative frequencies
          p_i = cnts / cnts.sum()
          # inverse Simpson index normalized by (number of bins)
-         ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
- 
+         s_0 = 1 / np.sum(p_i**2) / num_bins[col]
+         ev_index[col] = (s_0 * num_bins[col] - 1) / (num_bins[col] - 1)
      return ev_index
  
  
@@ -129,9 +142,11 @@ def diversity(
      class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
  ) -> DiversityOutput:
      """
-     Compute diversity for discrete/categorical variables and, through standard
+     Compute diversity and classwise diversity for discrete/categorical variables and, through standard
      histogram binning, for continuous variables.
  
+     We define diversity as a normalized form of the inverse Simpson diversity index.
+ 
      diversity = 1 implies that samples are evenly distributed across a particular factor
      diversity = 0 implies that all samples belong to one category/bin
  
@@ -141,95 +156,51 @@ def diversity(
          List of class labels for each image
      metadata: List[Dict]
          List of metadata factors for each image
-     metric: Literal["shannon", "simpson"], default "simpson"
-         string variable indicating which diversity index should be used.
-         Permissible values include "simpson" and "shannon"
+     method: Literal["shannon", "simpson"], default "simpson"
+         Indicates which diversity index should be computed
  
      Notes
      -----
      - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
+     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
+     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
  
      Returns
      -------
      DiversityOutput
-         Diversity index per column of self.data or each factor in self.names
+         Diversity index per column of self.data or each factor in self.names and
+         classwise diversity [n_class x n_factor]
  
      Example
      -------
      Compute Simpson diversity index of metadata and class labels
  
-     >>> diversity(class_labels, metadata, method="simpson").diversity_index
-     array([0.34482759, 0.34482759, 0.90909091])
+     >>> div_simp = diversity(class_labels, metadata, method="simpson")
+     >>> div_simp.diversity_index
+     array([0.18103448, 0.18103448, 0.88636364])
+ 
+     >>> div_simp.classwise
+     array([[0.17241379, 0.39473684],
+            [0.2       , 0.2       ]])
  
      Compute Shannon diversity index of metadata and class labels
  
-     >>> diversity(class_labels, metadata, method="shannon").diversity_index
+     >>> div_shan = diversity(class_labels, metadata, method="shannon")
+     >>> div_shan.diversity_index
      array([0.37955133, 0.37955133, 0.96748876])
  
- 
-     See Also
-     --------
-     numpy.histogram
-     """
-     diversity_fn = get_method(DIVERSITY_FN_MAP, method)
-     data, names, is_categorical = preprocess_metadata(class_labels, metadata)
-     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
-     return DiversityOutput(diversity_index)
- 
- 
- @set_metadata("dataeval.metrics")
- def diversity_classwise(
-     class_labels: Sequence[int], metadata: list[dict], method: Literal["shannon", "simpson"] = "simpson"
- ) -> DiversityOutput:
-     """
-     Compute diversity for discrete/categorical variables and, through standard
-     histogram binning, for continuous variables.
- 
-     We define diversity as a normalized form of the inverse Simpson diversity
-     index.
- 
-     diversity = 1 implies that samples are evenly distributed across a particular factor
-     diversity = 1/num_categories implies that all samples belong to one category/bin
- 
-     Parameters
-     ----------
-     class_labels: Sequence[int]
-         List of class labels for each image
-     metadata: List[Dict]
-         List of metadata factors for each image
- 
-     Notes
-     -----
-     - For continuous variables, histogram bins are chosen automatically. See numpy.histogram for details.
-     - The expression is undefined for q=1, but it approaches the Shannon entropy in the limit.
-     - If there is only one category, the diversity index takes a value of 1 = 1/N = 1/1. Entropy will take a value of 0.
- 
-     Returns
-     -------
-     DiversityOutput
-         Diversity index [n_class x n_factor]
- 
-     Example
-     -------
-     Compute classwise Simpson diversity index of metadata and class labels
- 
-     >>> diversity_classwise(class_labels, metadata, method="simpson").diversity_index
-     array([[0.33793103, 0.51578947],
-            [0.36      , 0.36      ]])
- 
-     Compute classwise Shannon diversity index of metadata and class labels
- 
-     >>> diversity_classwise(class_labels, metadata, method="shannon").diversity_index
+     >>> div_shan.classwise
      array([[0.43156028, 0.83224889],
             [0.57938016, 0.57938016]])
  
- 
      See Also
      --------
      numpy.histogram
      """
      diversity_fn = get_method(DIVERSITY_FN_MAP, method)
      data, names, is_categorical = preprocess_metadata(class_labels, metadata)
+     diversity_index = diversity_fn(data, names, is_categorical, None).astype(np.float64)
+ 
      class_idx = names.index("class_label")
      class_lbl = data[:, class_idx]
  
@@ -241,4 +212,5 @@
          subset_mask = class_lbl == cls
          diversity[idx, :] = diversity_fn(data, names, is_categorical, subset_mask)
      div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
-     return DiversityOutput(div_no_class)
+ 
+     return DiversityOutput(diversity_index, div_no_class)
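
The diversity_simpson change in arithmetic form: the bin-normalized inverse Simpson index s_0 lies on [1/num_bins, 1], and the new expression rescales it linearly onto [0, 1], so a single-category factor now scores 0 rather than 1/num_categories:

import numpy as np

p_i = np.array([1.0, 0.0, 0.0])  # all samples fall into one of 3 bins
num_bins = len(p_i)

s_0 = 1 / np.sum(p_i**2) / num_bins               # old index: 1/3
rescaled = (s_0 * num_bins - 1) / (num_bins - 1)  # new index: 0.0
print(s_0, rescaled)
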
dataeval/_internal/metrics/stats.py CHANGED
@@ -89,6 +89,16 @@ class StatsOutput(OutputMetadata):
      def dict(self):
          return {k: v for k, v in self.__dict__.items() if not k.startswith("_") and len(v) > 0}
  
+     def __len__(self) -> int:
+         if self.ch_idx_map:
+             return sum([len(idxs) for idxs in self.ch_idx_map.values()])
+         else:
+             for a in self.__annotations__:
+                 attr = getattr(self, a, None)
+                 if attr is not None and hasattr(a, "__len__") and len(attr) > 0:
+                     return len(attr)
+             return 0
+ 
  
  QUARTILES = (0, 25, 50, 75, 100)
  
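StatsOutput.__len__ is what lets combine_stats accumulate per-dataset lengths into dataset_steps. A minimal sketch, assuming the public imagestats/ImageStat re-exports and that an ImageStat.MEAN flag exists (both assumptions, not confirmed by this diff):

import numpy as np

from dataeval.flags import ImageStat
from dataeval.metrics.stats import imagestats  # assumed public re-export

images = np.zeros((5, 1, 16, 16), dtype=np.uint8)
stats = imagestats(images, ImageStat.MEAN)  # ImageStat.MEAN assumed to exist
assert len(stats) == 5  # one entry per image
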
dataeval/_internal/output.py CHANGED
@@ -11,7 +11,7 @@ from dataeval import __version__
  
  class OutputMetadata:
      _name: str
-     _execution_time: str
+     _execution_time: datetime
      _execution_duration: float
      _arguments: dict[str, str]
      _state: dict[str, str]
dataeval/metrics/bias/__init__.py CHANGED
@@ -1,14 +1,12 @@
- from dataeval._internal.metrics.balance import balance, balance_classwise
+ from dataeval._internal.metrics.balance import balance
  from dataeval._internal.metrics.coverage import coverage
- from dataeval._internal.metrics.diversity import diversity, diversity_classwise
+ from dataeval._internal.metrics.diversity import diversity
  from dataeval._internal.metrics.parity import label_parity, parity
  
  __all__ = [
      "balance",
-     "balance_classwise",
      "coverage",
      "diversity",
-     "diversity_classwise",
      "label_parity",
      "parity",
  ]
dataeval-0.66.0.dist-info/METADATA → dataeval-0.68.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: dataeval
- Version: 0.66.0
+ Version: 0.68.0
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
  Home-page: https://dataeval.ai/
  License: MIT
dataeval-0.66.0.dist-info/RECORD → dataeval-0.68.0.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
- dataeval/__init__.py,sha256=dshMbJco8lxfbbIg0DO5fSDsvgu4DKPGE5PzA7pwvPQ,590
+ dataeval/__init__.py,sha256=fV-lc8AokA2hnkUSOdX-Bxy0xmEfPTXVFB3VcYAoiA8,590
  dataeval/_internal/detectors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/detectors/clusterer.py,sha256=hJwELUeAdZZ3OVLIfwalw2P7Zz13q2ZqrV6gx90s44E,20695
  dataeval/_internal/detectors/drift/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -8,7 +8,8 @@ dataeval/_internal/detectors/drift/ks.py,sha256=aoDx7ps-5vrSI8Q9ii6cwmKnAyaD8tjG
  dataeval/_internal/detectors/drift/mmd.py,sha256=xUMQDaLOcqc3Uq2xDvNR7hbt3WnmCR2etZlGCwYlu2c,7489
  dataeval/_internal/detectors/drift/torch.py,sha256=YhIN85MbUV3C4IJcRvqYdXSWLj5lUeEOb05T5DgB3xo,11552
  dataeval/_internal/detectors/drift/uncertainty.py,sha256=Ot8L42AnFbkij4J3Tis7VzXLv3hfBxoOWBP4UoCEnVs,5125
- dataeval/_internal/detectors/duplicates.py,sha256=BQMWHT4j3zMuzD-S9hUXuQjZDFsSrtG1GQiTjPEIJSI,3421
+ dataeval/_internal/detectors/duplicates.py,sha256=qkzbdWuJuUozFLqpnD6CYAGXQb7-aWw2mHr_cxXAfPo,4922
+ dataeval/_internal/detectors/merged_stats.py,sha256=WVPxz7n5fUkFKW3kobD_TkKkof51YjfIz4M_4CHh-1s,2517
  dataeval/_internal/detectors/ood/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/detectors/ood/ae.py,sha256=k8pZP7oPwVyQlv6YcoacNMzpmQZy7W222yYrdXGTYZI,2031
  dataeval/_internal/detectors/ood/aegmm.py,sha256=pffThqXRoLx3GuZXEQBd-xEy5DjAZHV7WSeP2HgM_TI,2403
@@ -16,17 +17,17 @@ dataeval/_internal/detectors/ood/base.py,sha256=Pw34uFEWOJZiG4ciM0ArUkqhiM8WCGl2
  dataeval/_internal/detectors/ood/llr.py,sha256=tCo8G7V8VaVuIZ09rg0ZXZmdE0N_zGm7vCfFUnGbGvo,10102
  dataeval/_internal/detectors/ood/vae.py,sha256=WbQugS-bBUTTqQ9PRLHBmSUtk7O2_PN4PBLJE9ieMjw,2921
  dataeval/_internal/detectors/ood/vaegmm.py,sha256=pVUSlVF2jo8uokyks2QzfBJnNtcFWmcF8EQl-azs2Bg,2832
- dataeval/_internal/detectors/outliers.py,sha256=e5Hr-MpRfCj96AknqN3Lizz4QoQPcEeY0ZofMVguKOg,6304
+ dataeval/_internal/detectors/outliers.py,sha256=tzIraHkooPA4gSb8lG0O3koVK-9fOQg8EPo3xvgL1Y4,7533
  dataeval/_internal/flags.py,sha256=FHRgm8NKB9AjQgPcAESYeSbqIszgxbSGfF0Xd_tSkyk,2169
  dataeval/_internal/interop.py,sha256=x4qj4EiBt5NthSxe8prSLrPDAEcipAdyyLwbNyCBaFk,1059
  dataeval/_internal/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- dataeval/_internal/metrics/balance.py,sha256=g-YYFpq0qy2xq4iHjBKZDMjOn5R9Rit6sSb53anBeis,7744
+ dataeval/_internal/metrics/balance.py,sha256=eAHvgjiGCH893XSQLqh9j9wgvAECoNPVT8k0u_9Ijzg,6097
  dataeval/_internal/metrics/ber.py,sha256=Onsi47AbT9rMvng-Pbu8LIrYRfLpI13En1FxkFoMKQs,4668
- dataeval/_internal/metrics/coverage.py,sha256=9ZvcNjItE9rEyA2UHPE1K9zpTbbib4xqk8WpPpDN8ok,4037
+ dataeval/_internal/metrics/coverage.py,sha256=EZVES1rbZW2j_CtQv1VFfSO-UmWcrt5nmqxDErtrG14,3473
  dataeval/_internal/metrics/divergence.py,sha256=nmMUfr9FGnH798eb6xzEiMj4C42rQVthh5HeexiY6EE,4119
- dataeval/_internal/metrics/diversity.py,sha256=2xEkLnaRhPOvsd2DCTDT-dVvPPEZOH4PKm0vufrgBq4,8207
+ dataeval/_internal/metrics/diversity.py,sha256=nGjYQ-NLjb8mPt1PAYnvkWH4D58kjM39IPs2FULfis4,7503
  dataeval/_internal/metrics/parity.py,sha256=suv1Pf7gPj0_NxsS0_M6ewfUndsFJyEhbt5NPp6ktMI,15457
- dataeval/_internal/metrics/stats.py,sha256=Xbm7lLB0OZtsoxClMIrfULSqT8VymQiQmohJFtN7oz8,16332
+ dataeval/_internal/metrics/stats.py,sha256=-gLGn8Yy-Xx0kkaF-Z_3RitqPLZJhhbflksSjBRN3iY,16702
  dataeval/_internal/metrics/uap.py,sha256=w-wvXXnX16kUq-weaZD2SrJi22LJ8EjOFbOhPxeGejI,2043
  dataeval/_internal/metrics/utils.py,sha256=mSYa-3cHGcsQwPr7zbdpzrnK_8jIXCiAcu2HCcvrtaY,13007
  dataeval/_internal/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,7 +42,7 @@ dataeval/_internal/models/tensorflow/losses.py,sha256=pZH5RnlM9R0RrBde9Lgq32muwA
  dataeval/_internal/models/tensorflow/pixelcnn.py,sha256=lRpRNebMgkCJUnEk1xouVaTfS_YGMQgQhI01wNKAjeM,48420
  dataeval/_internal/models/tensorflow/trainer.py,sha256=xNY0Iw7Qa1TnCuy9N1b77_VduFoW_BhbZjfQCxOVby4,4082
  dataeval/_internal/models/tensorflow/utils.py,sha256=l6jXKMWyQAEI4LpAONq95Xwr7CPgrs408ypf9TuNxkY,8732
- dataeval/_internal/output.py,sha256=7JEmbrbsDs6jgzqXgKNN9h1dMdfcB2iOP2wBsGCwA1c,3044
+ dataeval/_internal/output.py,sha256=bFC2qJxXUc_daQwJHHa9KfFNLuxZANGb7Dpget_TXYs,3049
  dataeval/_internal/utils.py,sha256=gK0z4buuQoUYblkrCiRV9pIESzyikcY-3a08XsQkD7E,1585
  dataeval/_internal/workflows/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  dataeval/_internal/workflows/sufficiency.py,sha256=0k7Dbk3QmEGkZp2IW4OcZBcrxb4zAp9hC9nXGN1v1cY,18199
@@ -53,7 +54,7 @@ dataeval/detectors/linters/__init__.py,sha256=1yxsJw8CFpHsZwn_YUlWpb-4YBet5U6uB-
  dataeval/detectors/ood/__init__.py,sha256=ybWhwbMmWygIwE1A-nYihDfugrj3j0GiuABmVvD7264,583
  dataeval/flags/__init__.py,sha256=qo06_Tk0ul4lOhKSEs0HE2G6WBFvMwNJq77vRX1ynww,72
  dataeval/metrics/__init__.py,sha256=42szGyZrLekNU-T-rwJu-pUoDBdOoStuScB-mnGzjw4,81
- dataeval/metrics/bias/__init__.py,sha256=IV34GPYPOdpy3PtcCZYWaV9M9C8h_oYP56DliQcAYr0,427
+ dataeval/metrics/bias/__init__.py,sha256=xqpxCttgzz-hMZQI7_IlaNn4OGZaGVz3KKRd26GbSKE,335
  dataeval/metrics/estimators/__init__.py,sha256=fWQZUIxu88u5POYXN1yoFc-Hxx5B1fveEiiSXmK5kPk,210
  dataeval/metrics/stats/__init__.py,sha256=N5UvO7reDkYX1xFdAQjwALyJwcC2FAbruzd7ZYYW_4I,123
  dataeval/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -66,7 +67,7 @@ dataeval/torch/models/__init__.py,sha256=YnDnePYpRIKHyYn3F5qR1OObMSb-g0FGvI8X-uT
  dataeval/torch/trainer/__init__.py,sha256=Te-qElt8h-Zv8NN0r-VJOEdCPHTQ2yO3rd2MhRiZGZs,93
  dataeval/utils/__init__.py,sha256=ExQ1xj62MjcM9uIu1-g1P2fW0EPJpcIofnvxjQ908c4,172
  dataeval/workflows/__init__.py,sha256=gkU2B6yUiefexcYrBwqfZKNl8BvX8abUjfeNvVBXF4E,186
- dataeval-0.66.0.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
- dataeval-0.66.0.dist-info/METADATA,sha256=P04dHyQOp4_6lg0IkoUEXTGJAPPpgRwf5ZAwdYpuatc,4217
- dataeval-0.66.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- dataeval-0.66.0.dist-info/RECORD,,
+ dataeval-0.68.0.dist-info/LICENSE.txt,sha256=Kpzcfobf1HlqafF-EX6dQLw9TlJiaJzfgvLQFukyXYw,1060
+ dataeval-0.68.0.dist-info/METADATA,sha256=XWLDiMY9JE2dxIDnRnJMQMLS8GPWFH2mbMDXkeP7Y5Q,4217
+ dataeval-0.68.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ dataeval-0.68.0.dist-info/RECORD,,