dataeval 0.63.0__py3-none-any.whl → 0.64.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. dataeval/__init__.py +3 -3
  2. dataeval/_internal/detectors/clusterer.py +2 -1
  3. dataeval/_internal/detectors/drift/base.py +2 -1
  4. dataeval/_internal/detectors/drift/cvm.py +2 -1
  5. dataeval/_internal/detectors/drift/ks.py +2 -1
  6. dataeval/_internal/detectors/drift/mmd.py +4 -3
  7. dataeval/_internal/detectors/drift/uncertainty.py +1 -2
  8. dataeval/_internal/detectors/duplicates.py +2 -1
  9. dataeval/_internal/detectors/linter.py +1 -1
  10. dataeval/_internal/detectors/ood/ae.py +2 -1
  11. dataeval/_internal/detectors/ood/aegmm.py +2 -1
  12. dataeval/_internal/detectors/ood/base.py +2 -1
  13. dataeval/_internal/detectors/ood/llr.py +3 -2
  14. dataeval/_internal/detectors/ood/vae.py +2 -1
  15. dataeval/_internal/detectors/ood/vaegmm.py +2 -1
  16. dataeval/_internal/interop.py +2 -11
  17. dataeval/_internal/metrics/balance.py +180 -0
  18. dataeval/_internal/metrics/base.py +1 -83
  19. dataeval/_internal/metrics/ber.py +122 -48
  20. dataeval/_internal/metrics/coverage.py +83 -74
  21. dataeval/_internal/metrics/divergence.py +67 -67
  22. dataeval/_internal/metrics/diversity.py +206 -0
  23. dataeval/_internal/metrics/parity.py +300 -155
  24. dataeval/_internal/metrics/stats.py +7 -5
  25. dataeval/_internal/metrics/uap.py +37 -29
  26. dataeval/_internal/metrics/utils.py +393 -0
  27. dataeval/_internal/utils.py +64 -0
  28. dataeval/metrics/__init__.py +25 -6
  29. dataeval/utils/__init__.py +9 -0
  30. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/METADATA +1 -1
  31. dataeval-0.64.0.dist-info/RECORD +60 -0
  32. dataeval/_internal/functional/__init__.py +0 -0
  33. dataeval/_internal/functional/ber.py +0 -63
  34. dataeval/_internal/functional/coverage.py +0 -75
  35. dataeval/_internal/functional/divergence.py +0 -16
  36. dataeval/_internal/functional/hash.py +0 -79
  37. dataeval/_internal/functional/metadata.py +0 -136
  38. dataeval/_internal/functional/metadataparity.py +0 -190
  39. dataeval/_internal/functional/uap.py +0 -6
  40. dataeval/_internal/functional/utils.py +0 -158
  41. dataeval/_internal/maite/__init__.py +0 -0
  42. dataeval/_internal/maite/utils.py +0 -30
  43. dataeval/_internal/metrics/metadata.py +0 -610
  44. dataeval/_internal/metrics/metadataparity.py +0 -67
  45. dataeval-0.63.0.dist-info/RECORD +0 -68
  46. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/LICENSE.txt +0 -0
  47. {dataeval-0.63.0.dist-info → dataeval-0.64.0.dist-info}/WHEEL +0 -0
dataeval/_internal/metrics/metadata.py (deleted)
@@ -1,610 +0,0 @@
-import warnings
-from typing import Dict, List
-
-import numpy as np
-import torch
-from numpy.typing import ArrayLike, NDArray
-from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
-from torchmetrics import Metric
-
-from dataeval._internal.functional.metadata import _entropy, _get_counts, _get_num_bins, _infer_categorical
-
-
-def str_to_int(d: Dict) -> Dict:
-    """
-    Map columns of dict that are not numeric (e.g. string) to numeric values
-    that mutual information and diversity functions can accommodate. Each
-    unique string receives a unique integer value.
-
-    Parameters
-    ----------
-    d: Dict
-        Dictionary of ndarray feature values or descriptors.
-
-    Returns
-    -------
-    Dict
-        Dictionary with same keys and non-numeric values mapped to numeric values.
-    """
-    for key, val in d.items():
-        val = val.numpy() if torch.is_tensor(val) else val
-        val = np.array(val) if isinstance(val, list) else val
-        # if not numeric
-        if not np.issubdtype(val.dtype, np.number):
-            _, mapped_vals = np.unique(val, return_inverse=True)
-            d[key] = mapped_vals
-    return d
-
-
-def list_to_dict(list_of_dicts: List[Dict]) -> Dict:
-    """
-    Converts list of dicts to dict of ndarrays
-
-    Parameters
-    ----------
-    list_of_dicts: List[Dict]
-        list of dictionaries, typically of metadata factors
-
-    Returns
-    -------
-    Dict[np.ndarray]
-        dictionary whose columns are np.ndarray
-    """
-    return {k: np.array([dic[k] for dic in list_of_dicts]) for k in list_of_dicts[0]}
-
-
-class BaseBiasMetric(Metric):
-    """
-    Base class for bias metrics with common functionality for consuming
-    metadata---subclasses torchmetrics.Metric
-
-    Attributes
-    ----------
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.names = []
-        self.data = np.empty(0)
-        self.is_categorical = []
-
-        # torchmetric 'compute' function operates on these states
-        self.add_state("metadata", default=[], dist_reduce_fx="cat")
-        self.add_state("class_label", default=[], dist_reduce_fx="cat")
-
-        self.num_factors = 0
-        self.num_samples = 0
-
-    def update(self, class_label: ArrayLike, metadata: List[Dict]):
-        self.metadata.extend(metadata)
-        self.class_label.append(class_label)
-
-    def _collect_data(self):
-        metadata_dict = {"class_label": np.concatenate(self.class_label).astype(int)}
-        metadata_dict = {**metadata_dict, **list_to_dict(self.metadata)}
-
-        # convert string variables to int
-        metadata_dict = str_to_int(metadata_dict)
-        self.data = np.stack(list(metadata_dict.values()), axis=-1)
-        self.names = list(metadata_dict.keys())
-
-        self.is_categorical = [_infer_categorical(metadata_dict[var], 0.25)[0] for var in self.names]
-
-        # class_label is also in self.names
-        self.num_factors = len(self.names)
-        self.num_samples = len(self.metadata)
-
-
-class BaseBalanceMetric(BaseBiasMetric):
-    """
-    Base class for balance (mutual information) metrics. Contains input
-    validation for balance metrics.
-    """
-
-    def __init__(self, num_neighbors: int):
-        super().__init__()
-        if not isinstance(num_neighbors, (int, float)):
-            raise TypeError(
-                f"Variable {num_neighbors} is not real-valued numeric type."
-                "num_neighbors should be an int, greater than 0 and less than"
-                "the number of samples in the dataset"
-            )
-        if num_neighbors < 1:
-            raise ValueError(
-                f"Invalid value for {num_neighbors}."
-                "Choose a value greater than 0 and less than number of samples"
-                "in the dataset."
-            )
-        if isinstance(num_neighbors, float):
-            num_neighbors = int(num_neighbors)
-            warnings.warn(f"Variable {num_neighbors} is currently type float and will be truncated to type int.")
-
-        self.num_neighbors = num_neighbors
-
-
-class Balance(BaseBalanceMetric):
-    """
-    Metadata balance measures distributional correlation between metadata
-    factors and class label to identify opportunities for shortcut learning or
-    sampling bias in the dataset.
-
-    Parameters
-    ----------
-    num_neighbors: int
-        number of nearest neighbors used for the computation of
-
-    Attributes
-    ----------
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-
-    Notes
-    -----
-    We use mutual_info_classif from sklearn since class label is categorical
-    mutual_info_classif outputs are consistent up to O(1e-4) and depend on
-    a random seed.
-    MI is computed differently for categorical and continuous variables,
-    and we attempt to infer whether a variable is categorical by the
-    fraction of unique values in the dataset.
-
-    See Also
-    --------
-    sklearn.feature_selection.mutual_info_classif
-    sklearn.feature_selection.mutual_info_regression
-    sklearn.metrics.mutual_info_score
-    """
-
-    def __init__(self, num_neighbors: int = 5):
-        super().__init__(num_neighbors=num_neighbors)
-
-    def compute(self) -> NDArray:
-        """
-        Mutual information (MI) between factors (class label, metadata, label/image properties)
-
-        Parameters
-        ----------
-        num_neighbors: int
-            Number of nearest neighbors to use for computing MI between discrete
-            and continuous variables.
-
-        Returns
-        -------
-        NDArray
-            (num_factors+1) x (num_factors+1) estimate of mutual information
-            between num_factors metadata factors and class label. Symmetry is enforced.
-
-        See Also
-        --------
-        sklearn.feature_selection.mutual_info_classif
-        sklearn.feature_selection.mutual_info_regression
-        sklearn.metrics.mutual_info_score
-        """
-        self._collect_data()
-        mi = np.empty((self.num_factors, self.num_factors))
-        mi[:] = np.nan
-
-        for idx, tgt_var in enumerate(self.names):
-            tgt = self.data[:, idx]
-
-            if self.is_categorical[idx]:
-                # categorical target
-                mi[idx, :] = mutual_info_classif(
-                    self.data,
-                    tgt,
-                    discrete_features=self.is_categorical,  # type: ignore
-                    n_neighbors=self.num_neighbors,
-                )
-            else:
-                # continuous variables
-                mi[idx, :] = mutual_info_regression(
-                    self.data,
-                    tgt,
-                    discrete_features=self.is_categorical,  # type: ignore
-                    n_neighbors=self.num_neighbors,
-                )
-
-        ent_all = _entropy(self.data, self.names, self.is_categorical, normalized=False)
-        norm_factor = 0.5 * np.add.outer(ent_all, ent_all) + 1e-6
-        # in principle MI should be symmetric, but it is not in practice.
-        nmi = 0.5 * (mi + mi.T) / norm_factor
-
-        return nmi
-
-
-class BalanceClasswise(BaseBalanceMetric):
-    """
-    Computes mutual information (analogous to correlation) between metadata
-    factors (class label, metadata, label/image properties) with individual
-    class labels.
-
-    Parameters
-    ----------
-    num_neighbors: int
-        Number of nearest neighbors to use for computing MI between discrete
-        and continuous variables.
-
-    Attributes
-    ----------
-    num_neighbors: int
-        Number of nearest neighbors to use for computing MI between discrete
-        and continuous variables.
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, num_neighbors: int = 5):
-        super().__init__(num_neighbors)
-
-    def compute(self) -> NDArray:
-        """
-        Compute mutual information between metadata factors (class label, metadata,
-        label/image properties) with individual class labels.
-
-        Parameters
-        ----------
-        num_neighbors: int
-            Number of nearest neighbors to use for computing MI between discrete
-            and continuous variables.
-
-        Notes
-        -----
-        We use mutual_info_classif from sklearn since class label is categorical
-        mutual_info_classif outputs are consistent up to O(1e-4) and depend on
-        a random seed
-        MI is computed differently for categorical and continuous variables,
-        so we have to specify with self.is_categorical.
-
-        Returns
-        -------
-        NDArray
-            (num_classes x num_factors) estimate of mutual information between
-            num_factors metadata factors and individual class labels.
-
-        See Also
-        --------
-        sklearn.feature_selection.mutual_info_classif
-        sklearn.feature_selection.mutual_info_regression
-        sklearn.metrics.mutual_info_score
-        compute_mutual_information
-        """
-
-        self._collect_data()
-        # unique class labels
-        class_idx = self.names.index("class_label")
-        class_data = self.data[:, class_idx]
-        u_cls = np.unique(class_data)
-        num_classes = len(u_cls)
-
-        data_no_class = np.concatenate((self.data[:, :class_idx], self.data[:, (class_idx + 1) :]), axis=1)
-
-        # assume class is a factor
-        mi = np.empty((num_classes, self.num_factors - 1))
-        mi[:] = np.nan
-
-        # categorical variables, excluding class label
-        cat_mask = np.concatenate(
-            (self.is_categorical[:class_idx], self.is_categorical[(class_idx + 1) :]), axis=0
-        ).astype(int)
-
-        # classification MI for discrete/categorical features
-        for idx, cls in enumerate(u_cls):
-            tgt = class_data == cls
-            # units: nat
-            mi[idx, :] = mutual_info_classif(
-                data_no_class,
-                tgt,
-                discrete_features=cat_mask,  # type: ignore
-                n_neighbors=self.num_neighbors,
-            )
-
-        # let this recompute for all features including class label
-        ent_all = _entropy(self.data, self.names, self.is_categorical)
-        ent_tgt = ent_all[class_idx]
-        ent_all = np.concatenate((ent_all[:class_idx], ent_all[(class_idx + 1) :]), axis=0)
-        norm_factor = 0.5 * np.add.outer(ent_tgt, ent_all) + 1e-6
-        nmi = mi / norm_factor
-        return nmi
-
-
-class BaseDiversityMetric(BaseBiasMetric):
-    """
-    Base class for Diversity and ClasswiseDiversity metrics.
-
-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, metric: str):
-        super().__init__()
-        allowed_metrics = ["simpson", "shannon"]
-        if metric.lower() not in allowed_metrics:
-            raise ValueError(f"metric '{metric}' should be one of {allowed_metrics}")
-        self.metric = metric
-
-    def _diversity_simpson(self, subset_mask: np.ndarray = np.empty(shape=0)) -> np.ndarray:
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the inverse Simpson diversity
-        index.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 1/num_categories implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        subset_mask: Optional[np.ndarray[bool]]
-            Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-        Notes
-        -----
-        For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-        The expression is undefined for q=1, but it approaches the Shannon entropy
-        in the limit.
-        If there is only one category, the diversity index takes a value of 1 =
-        1/N = 1/1. Entropy will take a value of 0.
-
-        Returns
-        -------
-        np.ndarray
-            Diversity index per column of X
-
-        See Also
-        --------
-        numpy.histogram
-        """
-
-        # hist_counts,_ = _get_counts(subset_mask)
-        hist_counts, _ = _get_counts(self.data, self.names, self.is_categorical, subset_mask)
-        # normalize by global counts, not classwise counts
-        num_bins = _get_num_bins(self.data, self.names, self.is_categorical)
-
-        ev_index = np.empty(self.num_factors)
-        # loop over columns for convenience
-        for col, cnts in enumerate(hist_counts.values()):
-            # relative frequencies
-            p_i = cnts / cnts.sum()
-            # inverse Simpson index normalized by (number of bins)
-            ev_index[col] = 1 / np.sum(p_i**2) / num_bins[col]
-
-        return ev_index
-
-    def _diversity_shannon(self, subset_mask: np.ndarray = np.empty(shape=0)) -> np.ndarray:
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the Shannon entropy.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 0 implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        subset_mask: Optional[np.ndarray[bool]]
-            Boolean mask of samples to bin (e.g. when computing per class). True -> include in histogram counts
-
-        Notes
-        -----
-        - For continuous variables, histogram bins are chosen automatically. See
-          numpy.histogram for details.
-
-        Returns
-        -------
-        diversity_index: np.ndarray
-            Diversity index per column of X
-
-        See Also
-        --------
-        numpy.histogram
-        """
-
-        # entropy computed using global auto bins so that we can properly normalize
-        ent_unnormalized = _entropy(
-            self.data, self.names, self.is_categorical, normalized=False, subset_mask=subset_mask
-        )
-        # normalize by global counts rather than classwise counts
-        num_bins = _get_num_bins(self.data, self.names, is_categorical=self.is_categorical, subset_mask=subset_mask)
-        return ent_unnormalized / np.log(num_bins)
-
-
-class DiversityClasswise(BaseDiversityMetric):
-    """
-    Classwise diversity index: evenness of the distribution of metadata factors
-    per class.
-
-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-
-    """
-
-    def __init__(self, metric="simpson"):
-        super().__init__(metric=metric)
-
-    def compute(self):
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        We define diversity as a normalized form of the inverse Simpson diversity
-        index.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 1/num_categories implies that all samples belong to one category/bin
-
-        Notes
-        -----
-        For continuous variables, histogram bins are chosen automatically. See
-        numpy.histogram for details.
-        The expression is undefined for q=1, but it approaches the Shannon entropy
-        in the limit.
-        If there is only one category, the diversity index takes a value of 1 =
-        1/N = 1/1. Entropy will take a value of 0.
-
-        Returns
-        -------
-        np.ndarray
-            Diversity index [n_class x n_factor]
-
-        See Also
-        --------
-        diversity_simpson
-        diversity_shannon
-        numpy.histogram
-        """
-        self._collect_data()
-
-        class_idx = self.names.index("class_label")
-        class_labels = self.data[:, class_idx]
-
-        u_classes = np.unique(class_labels)
-        num_factors = len(self.names)
-        diversity = np.empty((len(u_classes), num_factors))
-        diversity[:] = np.nan
-        for idx, cls in enumerate(u_classes):
-            subset_mask = class_labels == cls
-            if self.metric == "simpson":
-                diversity[idx, :] = self._diversity_simpson(subset_mask)
-            elif self.metric == "shannon":
-                diversity[idx, :] = self._diversity_shannon(subset_mask)
-        div_no_class = np.concatenate((diversity[:, :class_idx], diversity[:, (class_idx + 1) :]), axis=1)
-        return div_no_class
-
-
-class Diversity(BaseDiversityMetric):
-    """
-    Diversity index: evenness of the distribution of metadata factors to
-    identify imbalance or undersampled data categories.

-    Parameters
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-
-    Attributes
-    ----------
-    metric: str
-        string variable indicating which diversity index should be used.
-        Permissible values include "simpson" and "shannon"
-    data: np.ndarray
-        Array of metadata factors; string variables are converted to integers
-    names: List[str]
-        List of the names of metadata factor variables
-    is_categorical: List
-        List of boolean flags for categorical features. Mutual information is
-        computed differently for categorical/discrete and continuous variables
-    num_factors: int
-        Number of metadata factors in the dataset
-    num_samples: int
-        Number of samples in the dataset
-    """
-
-    def __init__(self, metric="simpson"):
-        super().__init__(metric=metric)
-
-    def compute(self):
-        """
-        Compute diversity for discrete/categorical variables and, through standard
-        histogram binning, for continuous variables.
-
-        diversity = 1 implies that samples are evenly distributed across a particular factor
-        diversity = 0 implies that all samples belong to one category/bin
-
-        Parameters
-        ----------
-        metric: str
-            The type of diversity index to return, currently ["simpson",
-            "shannon"]
-
-        Notes
-        -----
-        - For continuous variables, histogram bins are chosen automatically. See
-          numpy.histogram for details.
-
-        Returns
-        -------
-        diversity_index: np.ndarray
-            Diversity index per column of self.data or each factor in self.names
-
-        See Also
-        --------
-        numpy.histogram
-
-        """
-        self._collect_data()
-        if self.metric.lower() == "simpson":
-            return self._diversity_simpson()
-        elif self.metric.lower() == "shannon":
-            return self._diversity_shannon()
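The removed `Balance.compute` builds a pairwise mutual-information matrix with sklearn and then normalizes it by the average per-factor entropy, `nmi = 0.5 * (mi + mi.T) / (0.5 * (H_i + H_j) + 1e-6)`. A minimal standalone sketch of that normalization, assuming every factor is discrete (the removed code also routed continuous targets through `mutual_info_regression` and inferred categoricity via `_infer_categorical`; `balance_nmi` is an illustrative name, not a dataeval API):

```python
import numpy as np
from scipy.stats import entropy
from sklearn.feature_selection import mutual_info_classif


def balance_nmi(data: np.ndarray, num_neighbors: int = 5, seed: int = 0) -> np.ndarray:
    """Pairwise normalized mutual information between columns of `data`.

    Simplified sketch: every column (factor) is treated as discrete.
    """
    num_factors = data.shape[1]
    mi = np.full((num_factors, num_factors), np.nan)
    for idx in range(num_factors):
        # MI of each factor against factor `idx`, in nats
        mi[idx, :] = mutual_info_classif(
            data, data[:, idx], discrete_features=True,
            n_neighbors=num_neighbors, random_state=seed,
        )
    # per-factor Shannon entropy from empirical value counts
    ent = np.array([entropy(np.unique(col, return_counts=True)[1]) for col in data.T])
    norm_factor = 0.5 * np.add.outer(ent, ent) + 1e-6
    # MI should be symmetric in principle; enforce symmetry before normalizing
    return 0.5 * (mi + mi.T) / norm_factor
```

With the class label stacked alongside the metadata factors, as `_collect_data` does, off-diagonal entries near 1 flag factors that are nearly redundant with the label, i.e. candidates for shortcut learning.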
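Both removed diversity indices reduce to simple functions of a factor's empirical bin frequencies: a normalized inverse Simpson index, `(1 / Σ p_i²) / num_bins`, and Shannon entropy normalized by `log(num_bins)`. A self-contained sketch for a single discrete factor (the removed methods additionally bin continuous factors with `numpy.histogram` and normalize by global rather than per-class bin counts; the function names here are illustrative):

```python
import numpy as np


def diversity_simpson(values: np.ndarray) -> float:
    """Normalized inverse Simpson index: 1 when samples are spread evenly
    across bins, 1/num_bins when all samples share one bin."""
    _, counts = np.unique(values, return_counts=True)
    p_i = counts / counts.sum()
    return float(1 / np.sum(p_i**2) / len(counts))


def diversity_shannon(values: np.ndarray) -> float:
    """Shannon entropy normalized by log(num_bins): 1 when even, 0 when
    all samples share one bin."""
    _, counts = np.unique(values, return_counts=True)
    if len(counts) == 1:
        return 0.0  # single category: entropy is 0, per the removed docstring notes
    p_i = counts / counts.sum()
    return float(-np.sum(p_i * np.log(p_i)) / np.log(len(counts)))


# A heavily skewed factor scores low on both indices:
skewed = np.array([0] * 90 + [1] * 10)
print(diversity_simpson(skewed))  # ~0.61
print(diversity_shannon(skewed))  # ~0.47
```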
dataeval/_internal/metrics/metadataparity.py (deleted)
@@ -1,67 +0,0 @@
-from typing import Optional
-
-import numpy as np
-
-from dataeval._internal.functional.metadataparity import compute_parity, format_discretize_factors
-
-
-class MetadataParity:
-    def __init__(
-        self,
-        data_factors: dict[str, np.ndarray],
-        continuous_factor_names: Optional[np.ndarray] = None,
-        continuous_factor_bincounts: Optional[np.ndarray] = None,
-    ):
-        """
-        Sets up the internal list of metadata factors.
-
-        Parameters
-        ----------
-        data_factors: Dict[str, np.ndarray]
-            The dataset factors, which are per-image attributes including class label and metadata.
-            Each key of dataset_factors is a factor, whose value is the per-image factor values.
-        continuous_factor_names : np.ndarray, default None
-            The factors in data_factors that have continuous values.
-            All factors are treated as having discrete values unless they
-            are specified in this array. Each element of this array must occur as a key in data_factors.
-        continuous_factor_bincounts : np.ndarray, default None
-            Array of the bin counts to discretize values into for each factor in continuous_factor_names.
-        """
-
-        continuous_factor_names = (
-            np.array([], dtype=str) if continuous_factor_names is None else np.array(continuous_factor_names)
-        )
-        continuous_factor_bincounts = (
-            10 * np.ones(len(continuous_factor_names), dtype=int)
-            if continuous_factor_bincounts is None
-            else np.array(continuous_factor_bincounts)
-        )
-
-        self.metadata_factors, self.labels = format_discretize_factors(
-            data_factors, continuous_factor_names, continuous_factor_bincounts
-        )
-
-    def evaluate(self) -> dict[str, np.ndarray]:
-        """
-        Evaluates the statistical independence of metadata factors from class labels.
-        This performs a chi-square test, which provides a score and a p-value for
-        statistical independence between each pair of a metadata factor and a class label.
-        A high score with a low p-value suggests that a metadata factor is strongly
-        correlated with a class label.
-
-        Returns
-        -------
-        Dict[str, np.ndarray]
-            chi_square: np.ndarray
-                Array of length (num_factors) whose (i)th element corresponds to
-                the chi-square score for the relationship between factor i
-                and the class labels in the dataset.
-            p_values: np.ndarray
-                Array of length (num_factors) whose (i)th element corresponds to
-                the p-value for the chi-square test for the relationship between
-                factor i and the class labels in the dataset.
-        """
-        chi_square, p_values = compute_parity(self.metadata_factors, self.labels)
-
-        formatted_output = {"chi_squares": chi_square, "p_values": p_values}
-        return formatted_output
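`MetadataParity.evaluate` delegates to `compute_parity` in the (also removed) functional module; per the docstring it runs a chi-square test of independence between each discretized factor and the class labels. A hedged sketch of that computation using `scipy.stats.chi2_contingency`; the contingency-table construction and the function name are assumptions, not the removed implementation:

```python
import numpy as np
from scipy.stats import chi2_contingency


def compute_parity_sketch(factors: dict, labels: np.ndarray) -> dict:
    """Chi-square test of independence between each discretized factor and
    the class labels, matching the documented outputs of compute_parity()."""
    chi_squares = np.empty(len(factors))
    p_values = np.empty(len(factors))
    for i, values in enumerate(factors.values()):
        # Contingency table: rows are factor values, columns are class labels
        _, factor_idx = np.unique(values, return_inverse=True)
        _, label_idx = np.unique(labels, return_inverse=True)
        table = np.zeros((factor_idx.max() + 1, label_idx.max() + 1), dtype=int)
        np.add.at(table, (factor_idx, label_idx), 1)
        chi_squares[i], p_values[i], _, _ = chi2_contingency(table)
    return {"chi_squares": chi_squares, "p_values": p_values}
```

As the docstring above notes, a large score with a small p-value for factor i indicates that the factor's values are not independent of the class label.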