dataeval 1.0.5__tar.gz → 1.0.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. {dataeval-1.0.5 → dataeval-1.0.6}/PKG-INFO +1 -1
  2. {dataeval-1.0.5 → dataeval-1.0.6}/pyproject.toml +18 -1
  3. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_metadata.py +1 -1
  4. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_version.py +2 -2
  5. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_balance.py +19 -16
  6. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_diversity.py +4 -2
  7. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_parity.py +0 -2
  8. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_ber.py +10 -2
  9. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_clusterer.py +1 -1
  10. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_coverage.py +31 -13
  11. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_feature_distance.py +4 -3
  12. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_parity.py +3 -3
  13. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_metadata_insights.py +24 -20
  14. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_mst.py +1 -1
  15. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_mutual_info.py +20 -20
  16. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_uap.py +2 -2
  17. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_sufficiency.py +1 -1
  18. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_duplicates.py +1 -1
  19. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_outliers.py +2 -2
  20. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/scope/_prioritize.py +6 -3
  21. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_base.py +2 -1
  22. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_domain_classifier.py +25 -9
  23. {dataeval-1.0.5 → dataeval-1.0.6}/.gitignore +0 -0
  24. {dataeval-1.0.5 → dataeval-1.0.6}/LICENSE +0 -0
  25. {dataeval-1.0.5 → dataeval-1.0.6}/README.md +0 -0
  26. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/__init__.py +0 -0
  27. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_embeddings.py +0 -0
  28. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_experimental.py +0 -0
  29. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_helpers.py +0 -0
  30. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_log.py +0 -0
  31. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_warm_cache.py +0 -0
  32. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/__init__.py +0 -0
  33. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/config.py +0 -0
  34. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/__init__.py +0 -0
  35. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_bin.py +0 -0
  36. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/__init__.py +0 -0
  37. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_base.py +0 -0
  38. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_cache.py +0 -0
  39. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
  40. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_hashstats.py +0 -0
  41. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
  42. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_register.py +0 -0
  43. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_registry.py +0 -0
  44. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_visualstats.py +0 -0
  45. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_completeness.py +0 -0
  46. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_compute_ratios.py +0 -0
  47. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_compute_stats.py +0 -0
  48. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_divergence.py +0 -0
  49. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_diversity.py +0 -0
  50. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
  51. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
  52. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_mst.py +0 -0
  53. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_hash.py +0 -0
  54. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_errors.py +0 -0
  55. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_stats.py +0 -0
  56. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_nullmodel.py +0 -0
  57. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_parity.py +0 -0
  58. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_rank.py +0 -0
  59. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/exceptions.py +0 -0
  60. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/__init__.py +0 -0
  61. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_bovw.py +0 -0
  62. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_flatten.py +0 -0
  63. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_onnx.py +0 -0
  64. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_torch.py +0 -0
  65. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_uncertainty.py +0 -0
  66. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/flags.py +0 -0
  67. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/__init__.py +0 -0
  68. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_aggregator.py +0 -0
  69. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_output.py +0 -0
  70. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/schedules.py +0 -0
  71. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/protocols.py +0 -0
  72. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/py.typed +0 -0
  73. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/__init__.py +0 -0
  74. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_shared.py +0 -0
  75. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/scope/__init__.py +0 -0
  76. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/__init__.py +0 -0
  77. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_classbalance.py +0 -0
  78. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_classfilter.py +0 -0
  79. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_indices.py +0 -0
  80. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_limit.py +0 -0
  81. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_reverse.py +0 -0
  82. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_select.py +0 -0
  83. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_shuffle.py +0 -0
  84. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/__init__.py +0 -0
  85. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/__init__.py +0 -0
  86. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_chunk.py +0 -0
  87. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
  88. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
  89. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_mmd.py +0 -0
  90. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
  91. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_univariate.py +0 -0
  92. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/__init__.py +0 -0
  93. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_base.py +0 -0
  94. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
  95. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
  96. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/__init__.py +0 -0
  97. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
  98. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
  99. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_reconstruction.py +0 -0
  100. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/update_strategies.py +0 -0
  101. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/types.py +0 -0
  102. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/__init__.py +0 -0
  103. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/_internal.py +0 -0
  104. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/data.py +0 -0
  105. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/losses.py +0 -0
  106. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/models.py +0 -0
  107. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/onnx.py +0 -0
  108. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/preprocessing.py +0 -0
  109. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/thresholds.py +0 -0
  110. {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/training.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dataeval
3
- Version: 1.0.5
3
+ Version: 1.0.6
4
4
  Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
5
5
  Project-URL: Homepage, https://dataeval.ai/
6
6
  Project-URL: Repository, https://github.com/aria-ml/dataeval/
@@ -114,17 +114,23 @@ docs = [
114
114
  "sphinx-tabs>=3.4.7",
115
115
  "Sphinx>=7.2.6,<9.0.0", # sphinx-immaterial <= 0.13.9 is not compatible with sphinx >=9.0
116
116
  "torchmetrics>=1.0.0",
117
- "torchvision>=0.17.0",
118
117
  "markupsafe>=3,<3.0.2",
119
118
  "jupytext>=1.19.1",
120
119
  ]
121
120
  security = [ # keep in sync with [tool.uv.constraint-dependencies]
122
121
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
123
122
  "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
123
+ "onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
124
+ # CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
125
+ # CVE-2026-27489: Vulnerable to Path Traversal via Symlink
126
+ # GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_data
124
127
  "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
128
+ "poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
125
129
  "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
126
130
  "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
127
131
  # CVE-2026-24049: (wheel) privilege escalation via unpack
132
+ "tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
133
+ # CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
128
134
  ]
129
135
  dev = [
130
136
  { include-group = "base" },
@@ -150,10 +156,17 @@ conflicts = [
150
156
  constraint-dependencies = [
151
157
  "cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
152
158
  "filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
159
+ "onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
160
+ # CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
161
+ # CVE-2026-27489: Vulnerable to Path Traversal via Symlink
162
+ # GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_data
153
163
  "pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
164
+ "poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
154
165
  "protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
155
166
  "setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
156
167
  # CVE-2026-24049: (wheel) privilege escalation via unpack
168
+ "tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
169
+ # CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
157
170
  ]
158
171
 
159
172
  [[tool.uv.index]]
@@ -211,6 +224,9 @@ version-file = "src/dataeval/_version.py"
211
224
  [tool.poetry]
212
225
  version = "0.0.0" # unused
213
226
 
227
+ [tool.poetry.dependencies]
228
+ python = ">=3.10,<3.15"
229
+
214
230
  [tool.pyproject2conda.dependencies]
215
231
  numpy = { skip = true, packages = "numpy>=1.24.2" }
216
232
  scikit-learn = { skip = true, packages = "scikit-learn>=1.5.0" }
@@ -307,6 +323,7 @@ max-complexity = 5
307
323
  convention = "numpy"
308
324
 
309
325
  [tool.ruff.format]
326
+ preview = true
310
327
  quote-style = "double"
311
328
  indent-style = "space"
312
329
  skip-magic-trailing-comma = false
@@ -650,7 +650,7 @@ class Metadata(Array, FeatureExtractor):
650
650
  -------
651
651
  Sequence[str]
652
652
  List of factor names that passed filtering and preprocessing steps.
653
- Order matches columns in factor_data and binned_data.
653
+ Order matches columns in factor_data.
654
654
 
655
655
  Notes
656
656
  -----
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
18
18
  commit_id: str | None
19
19
  __commit_id__: str | None
20
20
 
21
- __version__ = version = '1.0.5'
22
- __version_tuple__ = version_tuple = (1, 0, 5)
21
+ __version__ = version = '1.0.6'
22
+ __version_tuple__ = version_tuple = (1, 0, 6)
23
23
 
24
24
  __commit_id__ = commit_id = None
@@ -22,28 +22,30 @@ class BalanceOutput(DictOutput):
22
22
  """
23
23
  Output class for the :class:`.Balance` :term:`bias<Bias>` evaluator.
24
24
 
25
- Contains three polars DataFrames with mutual information scores and threshold flags.
25
+ Contains three polars DataFrames with normalized mutual information scores and threshold flags.
26
26
 
27
27
  Attributes
28
28
  ----------
29
29
  balance : pl.DataFrame
30
- DataFrame with global class-to-factor mutual information:
30
+ DataFrame with global class-to-factor normalized mutual information:
31
31
 
32
- - factor_name: str - Name of the metadata factor
33
- - mi_value: float - Mutual information value between this factor and class labels
32
+ - factor_name: str - Name of the metadata factor. Includes "class_label"
33
+ which represents the self-information (always 1.0).
34
+ - mi_value: float - Normalized mutual information value between this
35
+ factor and class labels
34
36
  factors : pl.DataFrame
35
- DataFrame with inter-factor mutual information correlations:
37
+ DataFrame with inter-factor normalized mutual information correlations:
36
38
 
37
39
  - factor1: str - Name of the first factor
38
40
  - factor2: str - Name of the second factor
39
- - mi_value: float - Mutual information value
41
+ - mi_value: float - Normalized mutual information value
40
42
  - is_correlated: bool - True if mi_value > factor_correlation_threshold
41
43
  classwise : pl.DataFrame
42
- DataFrame with per-class-to-factor mutual information:
44
+ DataFrame with per-class-to-factor normalized mutual information:
43
45
 
44
46
  - class_name: str - Name of the class
45
47
  - factor_name: str - Name of the metadata factor
46
- - mi_value: float - Mutual information value
48
+ - mi_value: float - Normalized mutual information value
47
49
  - is_imbalanced: bool - True if mi_value > class_imbalance_threshold
48
50
  """
49
51
 
@@ -58,21 +60,21 @@ class BalanceOutput(DictOutput):
58
60
 
59
61
  class Balance(Evaluator):
60
62
  """
61
- Computes mutual information (MI) between factors (class label, metadata, label/image properties).
63
+ Computes normalized mutual information (NMI) between factors (class label, metadata, label/image properties).
62
64
 
63
65
  Identifies imbalanced classes and highly correlated metadata factors based on
64
- mutual information thresholds.
66
+ NMI thresholds.
65
67
 
66
68
  Parameters
67
69
  ----------
68
70
  num_neighbors : int, default 5
69
71
  Number of points to consider as neighbors
70
72
  class_imbalance_threshold : float, default 0.3
71
- Threshold for identifying imbalanced classes. Classes with MI above this
73
+ Threshold for identifying imbalanced classes. Classes with NMI above this
72
74
  threshold with any metadata factor are considered imbalanced.
73
75
  factor_correlation_threshold : float, default 0.5
74
76
  Threshold for identifying highly correlated metadata factors. Factor pairs
75
- with MI above this threshold are considered highly correlated.
77
+ with NMI above this threshold are considered highly correlated.
76
78
 
77
79
  Attributes
78
80
  ----------
@@ -89,7 +91,8 @@ class Balance(Evaluator):
89
91
  -----
90
92
  We use `mutual_info_classif` from sklearn since class label is categorical.
91
93
  `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
92
- seed. MI is computed differently for categorical and continuous variables.
94
+ seed. MI is computed differently for categorical and continuous variables, and
95
+ in all cases normalized or transformed to [0, 1] prior to being returned.
93
96
 
94
97
  Examples
95
98
  --------
@@ -149,7 +152,7 @@ class Balance(Evaluator):
149
152
  @set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
150
153
  def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
151
154
  """
152
- Compute mutual information between factors and identify imbalanced classes.
155
+ Compute normalized mutual information between factors and identify imbalanced classes.
153
156
 
154
157
  Parameters
155
158
  ----------
@@ -160,7 +163,7 @@ class Balance(Evaluator):
160
163
  Returns
161
164
  -------
162
165
  BalanceOutput
163
- Three DataFrames containing MI scores and threshold flags:
166
+ Three DataFrames containing NMI scores and threshold flags:
164
167
 
165
168
  - balance: Global class-to-factor mutual information
166
169
  - factors: Inter-factor mutual information
@@ -168,7 +171,7 @@ class Balance(Evaluator):
168
171
 
169
172
  Example
170
173
  -------
171
- Return balance (mutual information) of factors with class_labels
174
+ Return balance (NMI) of factors with class_labels
172
175
 
173
176
  >>> from dataeval import Metadata
174
177
  >>> metadata = Metadata(dataset)
@@ -56,7 +56,7 @@ class Diversity(Evaluator):
56
56
  Through standard histogram binning, for continuous variables.
57
57
 
58
58
  The method specified defines diversity as the inverse Simpson diversity index linearly rescaled to
59
- the unit interval, or the normalized form of the Shannon entropy.
59
+ the unit interval [0, 1], or the normalized form of the Shannon entropy.
60
60
 
61
61
  diversity = 1 implies that samples are evenly distributed across a particular factor
62
62
  diversity = 0 implies that all samples belong to one category/bin
@@ -66,7 +66,9 @@ class Diversity(Evaluator):
66
66
  Parameters
67
67
  ----------
68
68
  method : "simpson" or "shannon", default "simpson"
69
- The methodology used for defining diversity
69
+ The methodology used for defining diversity. When "simpson" is used,
70
+ the index is linearly rescaled so that 1.0 represents maximum diversity
71
+ (even distribution) and 0.0 represents minimum diversity (all samples in one bin).
70
72
  threshold : float, default 0.5
71
73
  Threshold for identifying low diversity. Factors with diversity values
72
74
  at or below this threshold are flagged as having low diversity.
@@ -118,8 +118,6 @@ class Parity(Evaluator):
118
118
 
119
119
  >>> config = Parity.Config(score_threshold=0.4, p_value_threshold=0.01)
120
120
  >>> parity = Parity(config=config)
121
-
122
- output = parity(metadata.binned_data, metadata.class_labels.tolist())
123
121
  """
124
122
 
125
123
  class Config(EvaluatorConfig):
@@ -78,6 +78,8 @@ def ber_mst(embeddings: ArrayND[float], class_labels: Array1D[int]) -> BERResult
78
78
  """
79
79
  Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree.
80
80
 
81
+ BER bounds the irreducible classification error given the current feature
82
+ representation — the error attributable to class overlap in embedding space.
81
83
  Uses the Friedman-Rafsky (FR) test statistic with a minimum spanning tree (MST) basis.
82
84
 
83
85
  Parameters
@@ -137,7 +139,13 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
137
139
  """
138
140
  Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using KNN.
139
141
 
140
- Uses KNN test statistic basis.
142
+ BER bounds the irreducible classification error given the current feature
143
+ representation — the error attributable to class overlap in embedding space.
144
+ Uses KNN test statistic basis. The estimator's behavior depends on the value of k:
145
+ - k=1: Uses 1-NN for the lower bound and 2-NN for the upper bound.
146
+ - k=2: Uses 2-NN for the lower bound and 3-NN for the upper bound.
147
+ - 2<k<=5: Uses k-NN for the lower bound and (k+1)-NN for the upper bound.
148
+ - k>5: Only available for binary classification; uses k-NN for both bounds with specialized asymptotic weights.
141
149
 
142
150
  Parameters
143
151
  ----------
@@ -146,7 +154,7 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
146
154
  class_labels : Array1D[int]
147
155
  Array of class labels for each image. Can be a 1D list, or array-like object.
148
156
  k : int
149
- Number of nearest neighbors for KNN estimator
157
+ Number of nearest neighbors for KNN estimator. Should be between 1 and the number of samples.
150
158
 
151
159
  Returns
152
160
  -------
@@ -241,7 +241,7 @@ class _HDBSCANSorter:
241
241
  n_samples_per_cluster = np.bincount(labels)
242
242
  _logger.debug(
243
243
  "HDBSCAN clustering complete: %d clusters, samples per cluster: min=%d, max=%d, mean=%.1f",
244
- clst.unique_clusters,
244
+ len(clst.unique_clusters),
245
245
  np.min(n_samples_per_cluster),
246
246
  np.max(n_samples_per_cluster),
247
247
  np.mean(n_samples_per_cluster),
@@ -6,8 +6,8 @@ from typing import TypedDict
6
6
 
7
7
  import numpy as np
8
8
  from numpy.typing import NDArray
9
- from scipy.spatial.distance import pdist, squareform
10
9
 
10
+ from dataeval.core._mst import _compute_nearest_neighbors
11
11
  from dataeval.types import Array2D
12
12
  from dataeval.utils._internal import as_numpy, ensure_embeddings, flatten_samples
13
13
 
@@ -22,19 +22,24 @@ class CoverageResult(TypedDict):
22
22
  ----------
23
23
  uncovered_indices : NDArray[np.intp]
24
24
  Array of indices for uncovered observations
25
- critical_value_radii : NDArray[np.float64]
25
+ critical_value_radii : NDArray[np.float32]
26
26
  Array of critical value radii for each observation
27
27
  coverage_radius : float
28
28
  The radius threshold for coverage
29
29
  """
30
30
 
31
31
  uncovered_indices: NDArray[np.intp]
32
- critical_value_radii: NDArray[np.float64]
32
+ critical_value_radii: NDArray[np.float32]
33
33
  coverage_radius: float
34
34
 
35
35
 
36
- def _validate_inputs(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float64]:
37
- embeddings = ensure_embeddings(embeddings, dtype=np.float64, unit_interval=True)
36
+ def _validate_inputs(
37
+ embeddings: NDArray[np.float64],
38
+ num_observations: int,
39
+ force_unit_interval: bool = False,
40
+ ) -> NDArray[np.float64]:
41
+ unit_interval = "force" if force_unit_interval else True
42
+ embeddings = ensure_embeddings(embeddings, dtype=np.float64, unit_interval=unit_interval)
38
43
  if len(embeddings) <= num_observations:
39
44
  raise ValueError(
40
45
  f"Length of embeddings ({len(embeddings)}) is less than or equal to the specified number of \
@@ -43,15 +48,17 @@ def _validate_inputs(embeddings: NDArray[np.float64], num_observations: int) ->
43
48
  return embeddings
44
49
 
45
50
 
46
- def _calculate_critical_value_radii(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float64]:
47
- embeddings_matrix = squareform(pdist(flatten_samples(embeddings))).astype(np.float64)
48
- sorted_dists = np.sort(embeddings_matrix, axis=1)
49
- return sorted_dists[:, num_observations]
51
+ def _calculate_critical_value_radii(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float32]:
52
+ _, embeddings_matrix = _compute_nearest_neighbors(
53
+ flatten_samples(embeddings), None, num_observations, return_distances=True
54
+ )
55
+ return embeddings_matrix[:, -1]
50
56
 
51
57
 
52
58
  def coverage_naive(
53
59
  embeddings: Array2D[float],
54
60
  num_observations: int,
61
+ force_unit_interval: bool = False,
55
62
  ) -> CoverageResult:
56
63
  """
57
64
  Evaluate :term:`coverage<Coverage>` using a naive radius calculation method.
@@ -68,6 +75,9 @@ def coverage_naive(
68
75
  num_observations : int
69
76
  Number of observations required in order to be covered.
70
77
  [1] suggests that a minimum of 20-50 samples is necessary.
78
+ force_unit_interval : bool, default False
79
+ If True, embeddings will be automatically rescaled to the unit interval [0, 1].
80
+ If False, a ValueError is raised if embeddings are outside [0, 1].
71
81
 
72
82
  Returns
73
83
  -------
@@ -81,7 +91,7 @@ def coverage_naive(
81
91
  Raises
82
92
  ------
83
93
  ValueError
84
- If embeddings are not unit interval [0-1]
94
+ If embeddings are not within the unit interval [0, 1] and force_unit_interval is False
85
95
  ValueError
86
96
  If length of :term:`embeddings<Embeddings>` is less than or equal to num_observations
87
97
 
@@ -101,7 +111,9 @@ def coverage_naive(
101
111
  """
102
112
  _logger.info("Starting coverage_naive calculation with num_observations=%d", num_observations)
103
113
 
104
- embeddings_np = _validate_inputs(as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations)
114
+ embeddings_np = _validate_inputs(
115
+ as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations, force_unit_interval
116
+ )
105
117
  _logger.debug("Embeddings shape: %s", embeddings_np.shape)
106
118
 
107
119
  critical_value_radii = _calculate_critical_value_radii(embeddings_np, num_observations)
@@ -132,6 +144,7 @@ def coverage_adaptive(
132
144
  embeddings: Array2D[float],
133
145
  num_observations: int,
134
146
  percent: float,
147
+ force_unit_interval: bool = False,
135
148
  ) -> CoverageResult:
136
149
  """
137
150
  Evaluate :term:`coverage<Coverage>` using an adaptive radius calculation method.
@@ -150,6 +163,9 @@ def coverage_adaptive(
150
163
  [1] suggests that a minimum of 20-50 samples is necessary.
151
164
  percent : float
152
165
  Percent of observations to be considered uncovered. Should be between 0 and 1.
166
+ force_unit_interval : bool, default False
167
+ If True, embeddings will be automatically rescaled to the unit interval [0, 1].
168
+ If False, a ValueError is raised if embeddings are outside [0, 1].
153
169
 
154
170
  Returns
155
171
  -------
@@ -163,7 +179,7 @@ def coverage_adaptive(
163
179
  Raises
164
180
  ------
165
181
  ValueError
166
- If embeddings are not unit interval [0-1]
182
+ If embeddings are not within the unit interval [0, 1] and force_unit_interval is False
167
183
  ValueError
168
184
  If length of :term:`embeddings<Embeddings>` is less than or equal to num_observations
169
185
 
@@ -188,7 +204,9 @@ def coverage_adaptive(
188
204
  percent,
189
205
  )
190
206
 
191
- embeddings = _validate_inputs(as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations)
207
+ embeddings = _validate_inputs(
208
+ as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations, force_unit_interval
209
+ )
192
210
  _logger.debug("Embeddings shape: %s", embeddings.shape)
193
211
 
194
212
  critical_value_radii = _calculate_critical_value_radii(embeddings, num_observations)
@@ -33,7 +33,8 @@ class FeatureDistanceResult(TypedDict):
33
33
  location : float
34
34
  The normalized location where the KS statistic was achieved
35
35
  dist : float
36
- The Earth Mover's Distance (Wasserstein distance) between distributions
36
+ The Wasserstein distance between distributions, scaled by the
37
+ Interquartile Range (IQR) of the reference distribution.
37
38
  p_value : float
38
39
  The p-value from the KS test
39
40
  """
@@ -67,7 +68,7 @@ def feature_distance(
67
68
  Measure the feature-wise distance between two continuous distributions.
68
69
 
69
70
  Computes a p-value to evaluate its significance.
70
- Uses the Earth Mover's Distance and the Kolmogorov-Smirnov two-sample test, featurewise.
71
+ Uses the Kolmogorov-Smirnov two-sample test and an IQR-scaled Wasserstein distance, featurewise.
71
72
 
72
73
  Parameters
73
74
  ----------
@@ -83,7 +84,7 @@ def feature_distance(
83
84
 
84
85
  - statistic: float - The Kolmogorov-Smirnov test statistic
85
86
  - location: float - The normalized location where the KS statistic was achieved
86
- - dist: float - The Earth Mover's Distance between distributions
87
+ - dist: float - The IQR-scaled Wasserstein distance between distributions
87
88
  - p_value: float - The p-value from the KS test
88
89
 
89
90
  See Also
@@ -194,9 +194,9 @@ def label_parity(
194
194
  f"Found {len(observed_dist)} unique classes in observed label distribution, "
195
195
  f"but found {len(expected_dist)} unique classes in expected label distribution. "
196
196
  "This can happen when some class ids have zero instances in one dataset but "
197
- "not in the other. When initializing Parity, try setting the num_classes "
198
- "parameter to the known number of unique class ids, so that classes with "
199
- "zero instances are still included in the distributions.",
197
+ "not in the other. Try setting the num_classes parameter to the known number "
198
+ "of unique class ids, so that classes with zero instances are still included "
199
+ "in the distributions.",
200
200
  )
201
201
 
202
202
  cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
@@ -16,12 +16,6 @@ from dataeval.protocols import SequenceLike
16
16
 
17
17
  _logger = logging.getLogger(__name__)
18
18
 
19
- _NATS2BITS = 1.442695
20
- """
21
- _NATS2BITS is the reciprocal of natural log of 2. If you have an information/entropy-type quantity measured in nats,
22
- which is what many library functions return, multiply it by _NATS2BITS to get it in bits.
23
- """
24
-
25
19
 
26
20
  def _calc_median_deviations(reference: NDArray[Any], test: NDArray[Any]) -> NDArray[Any]:
27
21
  """
@@ -189,12 +183,14 @@ def factor_predictors( # noqa: C901
189
183
  discrete_features: list[bool] | None = None,
190
184
  ) -> Mapping[str, float]:
191
185
  """
192
- Compute mutual information between metadata factors and flagged sample indices.
186
+ Compute a measure of mutual information between metadata factors and flagged sample indices.
193
187
 
194
188
  Given a set of metadata factors per sample and indices of flagged samples, this function
195
189
  calculates the mutual information between each factor and the flagged status.
196
190
  In other words, it finds which metadata factors most likely correlate to a
197
- flagged sample (e.g., outliers, OOD samples, or other anomalies).
191
+ flagged sample (e.g., outliers, OOD samples, or other anomalies). The maximum possible MI
192
+ is equal to the entropy of the flagged indices, so we normalize by that entropy in order
193
+ to return a measure of association on a scale from 0 to 1.
198
194
 
199
195
  Parameters
200
196
  ----------
@@ -213,14 +209,15 @@ def factor_predictors( # noqa: C901
213
209
  -------
214
210
  Mapping[str, float]
215
211
  A map with keys corresponding to factor names, and values indicating the strength of association
216
- between each named factor and the flagged status, as mutual information measured in bits.
212
+ between each named factor and the flagged status, as normalized mutual information.
217
213
  Returns a dict with 0.0 values for all factors if no indices are provided or if every sample is flagged.
218
214
 
219
215
  Notes
220
216
  -----
221
217
  A high mutual information between a factor and flagged samples is an indication of correlation,
222
218
  but not causation. Additional analysis should be done to determine how to handle factors
223
- with a high mutual information.
219
+ with a high mutual information. Note that "high" is always relative to the information
220
+ (entropy) represented by the flagged indices, which is why that entropy is used to normalize.
224
221
 
225
222
  Examples
226
223
  --------
@@ -230,7 +227,7 @@ def factor_predictors( # noqa: C901
230
227
  ... }
231
228
  >>> indices = [2, 3, 4] # Flag last three samples
232
229
  >>> factor_predictors(factors, indices)
233
- {'time': 0.8415720833333329, 'altitude': 0.0}
230
+ {'time': 0.866750699769533, 'altitude': 0.0}
234
231
  """
235
232
  if not factors:
236
233
  raise ValueError("factors dictionary cannot be empty")
@@ -266,15 +263,22 @@ def factor_predictors( # noqa: C901
266
263
  f"discrete_features length ({len(discrete_features)}) must match number of factors ({len(factor_names)})",
267
264
  )
268
265
 
269
- mutual_info_values = (
270
- mutual_info_classif(
271
- X=scaled_data,
272
- y=sample_mask,
273
- discrete_features=discrete_features, # type: ignore - sklearn function not typed
274
- random_state=get_seed(),
275
- n_jobs=get_max_processes(), # type: ignore
276
- )
277
- * _NATS2BITS
266
+ mutual_info_values = mutual_info_classif(
267
+ X=scaled_data,
268
+ y=sample_mask,
269
+ discrete_features=discrete_features, # type: ignore - sklearn function not typed
270
+ random_state=get_seed(),
271
+ n_jobs=get_max_processes(), # type: ignore
278
272
  )
279
273
 
274
+ # We normalize the mutual info by the entropy of the flag, i.e. by its maximal
275
+ # information content. This yields a true measure of the strength of
276
+ # association between metadata factors and the flag, from 0 to 1.
277
+ if 0 < (frac_flagged := len(indices) / n_samples) < 1:
278
+ flagged_entropy = -(frac_flagged * np.log(frac_flagged) + (1 - frac_flagged) * np.log(1 - frac_flagged))
279
+ mutual_info_values = np.clip(mutual_info_values / flagged_entropy, 0, 1)
280
+ else:
281
+ # all or none are flagged, no MI possible.
282
+ mutual_info_values = np.zeros_like(mutual_info_values)
283
+
280
284
  return {k: mutual_info_values[i] for i, k in enumerate(factor_names)}
@@ -100,7 +100,7 @@ def _compute_nearest_neighbors(
100
100
  distances, neighbors = nbrs.kneighbors(data_query, return_distance=True)
101
101
 
102
102
  if return_distances:
103
- return neighbors, distances
103
+ return neighbors, distances.astype(np.float32)
104
104
  return neighbors
105
105
 
106
106
 
@@ -19,7 +19,7 @@ _logger = logging.getLogger(__name__)
19
19
 
20
20
  class MutualInfoResult(TypedDict):
21
21
  """
22
- Type definition for mutual information output.
22
+ Type definition for normalized mutual information output.
23
23
 
24
24
  Attributes
25
25
  ----------
@@ -60,18 +60,18 @@ def _merge_labels_and_factors(
60
60
  factor_data: NDArray[np.intp],
61
61
  discrete_features: Iterable[bool] | None,
62
62
  ) -> tuple[NDArray[np.intp], list[bool]]:
63
- discrete_features = [True] + (
63
+ discrete_list = [True] + (
64
64
  [not is_continuous(d) for d in factor_data.T] if discrete_features is None else list(discrete_features)
65
65
  )
66
66
 
67
67
  # Use numeric data for MI
68
68
  data = np.hstack((class_labels[:, np.newaxis], factor_data))
69
69
  # Present discrete features composed of distinct values as continuous for `mutual_info_classif`
70
- for i in range(len(discrete_features)):
70
+ for i in range(len(discrete_list)):
71
71
  if len(data) == len(np.unique(data[:, i])):
72
- discrete_features[i] = False
72
+ discrete_list[i] = False
73
73
 
74
- return data, discrete_features
74
+ return data, discrete_list
75
75
 
76
76
 
77
77
  def mutual_info( # noqa: C901
@@ -81,7 +81,7 @@ def mutual_info( # noqa: C901
81
81
  num_neighbors: int = 5,
82
82
  ) -> MutualInfoResult:
83
83
  """
84
- Compute mutual information between factors, transformed to lie in [0, 1].
84
+ Compute normalized mutual information between factors, transformed to lie in [0, 1].
85
85
 
86
86
  Factors include class label, metadata, and label/image properties.
87
87
 
@@ -101,8 +101,8 @@ def mutual_info( # noqa: C901
101
101
  MutualInfoResult
102
102
  TypedDict containing:
103
103
 
104
- - class_to_factor: NDArray[np.float64] - 1D array of MI between class labels and each factor
105
- - interfactor: NDArray[np.float64] - (num_factors) x (num_factors) matrix of MI between factors only
104
+ - class_to_factor: NDArray[np.float64] - 1D array of normalized MI between class labels and each factor
105
+ - interfactor: NDArray[np.float64] - (num_factors) x (num_factors) matrix of normalized MI between factors only
106
106
 
107
107
  Notes
108
108
  -----
@@ -120,7 +120,7 @@ def mutual_info( # noqa: C901
120
120
 
121
121
  Example
122
122
  -------
123
- Return balance (mutual information) of factors with class_labels
123
+ Return balance (normalized mutual information) of factors with class_labels
124
124
 
125
125
  >>> rng = np.random.default_rng(175)
126
126
  >>> class_labels = rng.choice([0, 1, 2], size=100)
@@ -155,7 +155,7 @@ def mutual_info( # noqa: C901
155
155
  data, discrete_list = _merge_labels_and_factors(class_labels_np, factor_data_np, discrete_feat_np)
156
156
  num_factors = len(discrete_list)
157
157
 
158
- _logger.debug("Computing MI for %d factors (%d discrete)", num_factors, sum(discrete_list))
158
+ _logger.debug("Computing NMI for %d factors (%d discrete)", num_factors, sum(discrete_list))
159
159
 
160
160
  # initialize output matrix
161
161
  mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
@@ -195,7 +195,7 @@ def mutual_info( # noqa: C901
195
195
  full_matrix = 0.5 * (mi + mi.T).astype(np.float64)
196
196
 
197
197
  _logger.info(
198
- "Mutual info calculation complete: %d factors, mean class_to_factor MI=%.4f",
198
+ "Mutual info calculation complete: %d factors, mean class_to_factor NMI=%.4f",
199
199
  num_factors - 1,
200
200
  np.mean(full_matrix[0, 1:]),
201
201
  )
@@ -208,12 +208,12 @@ def mutual_info( # noqa: C901
208
208
 
209
209
  def mutual_info_classwise(
210
210
  class_labels: Array1D[int],
211
- factor_data: Array2D[int],
211
+ factor_data: Array2D[int | float],
212
212
  discrete_features: Array1D[bool] | None = None,
213
213
  num_neighbors: int = 5,
214
214
  ) -> NDArray[np.float64]:
215
215
  """
216
- Compute mutual information (MI) between factors, transformed to lie in [0, 1].
216
+ Compute normalized mutual information (NMI) between factors.
217
217
 
218
218
  Factors include class label, metadata, and label/image properties.
219
219
 
@@ -221,7 +221,7 @@ def mutual_info_classwise(
221
221
  ----------
222
222
  class_labels : Array1D[int]
223
223
  Target class labels as integer indices. Can be a 1D list, or array-like object.
224
- factor_data : Array2D[int]
224
+ factor_data : Array2D[int | float]
225
225
  Factor values after binning or digitization. Can be a 2D list, or array-like object.
226
226
  discrete_features : Array1D[bool] | None = None
227
227
  Boolean array or iterable defining whether or not the feature set is discretized.
@@ -232,19 +232,19 @@ def mutual_info_classwise(
232
232
  Returns
233
233
  -------
234
234
  NDArray[np.float64]
235
- (num_factors+1) x (num_factors+1) estimate of mutual information
235
+ (num_classes) x (num_factors+1) estimate of normalized mutual information
236
236
  between num_factors metadata factors and class label. Symmetry is enforced.
237
237
 
238
238
  Notes
239
239
  -----
240
240
  We use `mutual_info_classif` from sklearn since class label is categorical.
241
241
  `mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
242
- seed. MI is computed differently for categorical and continuous variables. We
243
- return a transformation of MI onto the interval [0, 1].
242
+ seed. MI is computed differently for categorical and continuous variables. In all cases,
243
+ we return either a normalization or transformation of MI onto the interval [0, 1].
244
244
 
245
245
  Example
246
246
  -------
247
- Return classwise balance (mutual information) of factors with individual class_labels
247
+ Return classwise balance (normalized mutual information) of factors with individual class_labels
248
248
 
249
249
  >>> rng = np.random.default_rng(175)
250
250
  >>> class_labels = rng.choice([0, 1, 2], size=100)
@@ -267,7 +267,7 @@ def mutual_info_classwise(
267
267
  _logger.info("Starting mutual_info_classwise calculation with num_neighbors=%d", num_neighbors)
268
268
 
269
269
  class_labels_np = as_numpy(class_labels, dtype=np.intp, required_ndim=1)
270
- factor_data_np = as_numpy(factor_data, dtype=np.intp, required_ndim=2)
270
+ factor_data_np = as_numpy(factor_data, required_ndim=2)
271
271
  discrete_feat_np = opt_as_numpy(discrete_features, dtype=np.bool_, required_ndim=1)
272
272
 
273
273
  num_neighbors = _validate_num_neighbors(num_neighbors)
@@ -276,7 +276,7 @@ def mutual_info_classwise(
276
276
  u_classes = np.unique(class_labels_np)
277
277
  num_classes = len(u_classes)
278
278
 
279
- _logger.debug("Computing classwise MI for %d classes and %d factors", num_classes, num_factors)
279
+ _logger.debug("Computing classwise NMI for %d classes and %d factors", num_classes, num_factors)
280
280
 
281
281
  # classwise targets (binary indicators)
282
282
  tgt_bin = data[:, 0][:, None] == u_classes
@@ -11,14 +11,14 @@ import logging
11
11
  from sklearn.metrics import average_precision_score
12
12
 
13
13
  from dataeval._experimental import experimental
14
- from dataeval.types import Array2D
14
+ from dataeval.protocols import ArrayLike
15
15
  from dataeval.utils._internal import as_numpy
16
16
 
17
17
  _logger = logging.getLogger(__name__)
18
18
 
19
19
 
20
20
  @experimental
21
- def uap(labels: Array2D[int], scores: Array2D[float]) -> float:
21
+ def uap(labels: ArrayLike, scores: ArrayLike) -> float:
22
22
  """
23
23
  Estimate the empirical mean precision for the upperbound average precision.
24
24
 
@@ -112,7 +112,7 @@ class Sufficiency(Evaluator, Generic[T, M]):
112
112
  Raises
113
113
  ------
114
114
  ValueError
115
- If runs or substeps is not greater than 1
115
+ If runs or substeps is not at least 1
116
116
 
117
117
  Examples
118
118
  --------
@@ -988,7 +988,7 @@ class Duplicates(Evaluator):
988
988
 
989
989
  Attributes
990
990
  ----------
991
- flags : ImageStats, default ImageStats.HASH
991
+ flags : ImageStats, default ImageStats.HASH_DUPLICATES_BASIC
992
992
  Statistics to compute for hash-based duplicate detection.
993
993
  cluster_sensitivity : float or None, default None
994
994
  Distance factor for cluster-based near duplicate detection. Scales
@@ -978,10 +978,10 @@ class Outliers(Evaluator):
978
978
 
979
979
  - ``AdaptiveThreshold`` (default): Uses tail-weighted Double-MAD (separate MAD for
980
980
  data below and above the median) with automatic multiplier scaling for heavy
981
- tails to produce asymmetric bounds. Default multiplier: 3.0.
981
+ tails to produce asymmetric bounds. Default multiplier: 3.5.
982
982
  - ``ModifiedZScoreThreshold``: Based on median absolute deviation. Default multiplier: 3.5.
983
983
  Modified z score = :math:`0.6745 * |x_i - x̃| / MAD`
984
- - ``ZScoreThreshold``: Based on standard deviation from mean. Default multiplier: 3.
984
+ - ``ZScoreThreshold``: Based on standard deviation from mean. Default multiplier: 3.0.
985
985
  Z score = :math:`|x_i - \mu| / \sigma`
986
986
  - ``IQRThreshold``: Based on interquartile range. Default multiplier: 1.5.
987
987
  Outliers are outside :math:`[Q_1 - 1.5 \cdot IQR, Q_3 + 1.5 \cdot IQR]`
@@ -364,6 +364,9 @@ class Prioritize(Evaluator):
364
364
  ----------
365
365
  extractor : FeatureExtractor
366
366
  Feature extractor instance to use for extracting embeddings from data.
367
+ batch_size : int or None, default None
368
+ Batch size for embedding computation. When None, uses the global
369
+ batch size from :func:`~dataeval.config.get_batch_size`.
367
370
  method : {"knn", "kmeans_distance", "kmeans_complexity", "hdbscan_distance", \
368
371
  "hdbscan_complexity"}, default "knn"
369
372
  Ranking method to use:
@@ -470,6 +473,9 @@ class Prioritize(Evaluator):
470
473
  extractor : FeatureExtractor or None
471
474
  Feature extractor instance to use for extracting embeddings
472
475
  from data.
476
+ batch_size : int or None, default None
477
+ Batch size for embedding computation. When None, uses the global
478
+ batch size from :func:`~dataeval.config.get_batch_size`.
473
479
  method : {"knn", "kmeans_distance", "kmeans_complexity", "hdbscan_distance", \
474
480
  "hdbscan_complexity"}, default "knn"
475
481
  Ranking method to use.
@@ -481,9 +487,6 @@ class Prioritize(Evaluator):
481
487
  Number of K-means initializations (kmeans methods only).
482
488
  max_cluster_size : int or None, default None
483
489
  Maximum cluster size for HDBSCAN methods.
484
- batch_size : int or None, default None
485
- Batch size for embedding computation. When None, uses the global
486
- batch size from :func:`~dataeval.config.get_batch_size`.
487
490
  order : {"easy_first", "hard_first"}, default "easy_first"
488
491
  Sort direction for output indices.
489
492
  policy : {"difficulty", "stratified", "class_balanced"}, default "difficulty"
@@ -43,7 +43,8 @@ class DriftOutput(DictOutput, Generic[TDetails]):
43
43
  For multivariate methods, this is the corrected threshold after
44
44
  Bonferroni or FDR correction.
45
45
  distance : float
46
- Instance-level test statistic or distance metric, always >= 0.
46
+ Instance-level test statistic or distance metric. Typically >= 0, but can be
47
+ slightly negative for metrics like unbiased MMD².
47
48
  For univariate methods, this is the mean distance across all features.
48
49
  Higher values indicate greater deviation from reference distribution.
49
50
  metric_name : str
@@ -3,7 +3,7 @@
3
3
  __all__ = []
4
4
 
5
5
  from dataclasses import dataclass
6
- from typing import Any
6
+ from typing import Any, Literal
7
7
 
8
8
  import numpy as np
9
9
  from numpy.typing import NDArray
@@ -32,6 +32,11 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
32
32
  reference as class 0, runs repeated k-fold CV, and returns per-point
33
33
  class-1 rates. Points with rates exceeding the threshold are flagged OOD.
34
34
 
35
+ Note: By default, this detector uses the ``n_std``-based threshold for
36
+ predictions. If a value for ``threshold_perc`` is provided (either directly
37
+ or via config), it will use percentile-based thresholding from reference
38
+ scores instead.
39
+
35
40
  Parameters
36
41
  ----------
37
42
  n_folds : int, default 5
@@ -40,11 +45,12 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
40
45
  Number of times to repeat the k-fold split.
41
46
  n_std : float, default 2.0
42
47
  Number of standard deviations above the null mean for threshold.
48
+ Used when threshold_perc is not explicitly set.
43
49
  hyperparameters : dict or None, default None
44
50
  LightGBM hyperparameters.
45
51
  threshold_perc : float or None, default None
46
52
  Percentage of reference data considered normal (0-100).
47
- If None, uses config.threshold_perc (default 95.0).
53
+ If provided, overrides ``n_std`` for percentile-based thresholding.
48
54
  extractor : FeatureExtractor or None, default None
49
55
  Feature extractor for transforming input data before scoring.
50
56
  When provided, raw data is passed through the extractor in both
@@ -59,7 +65,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
59
65
  >>> test = np.random.randn(50, 8).astype(np.float32) + 3
60
66
  >>> detector = OODDomainClassifier(n_folds=3, n_repeats=3)
61
67
  >>> detector.fit(ref)
62
- OODDomainClassifier(n_folds=3, n_repeats=3, n_std=2.0, threshold_perc=95.0, hyperparameters=None, extractor=None, fitted=True)
68
+ OODDomainClassifier(n_folds=3, n_repeats=3, n_std=2.0, threshold_perc=None, hyperparameters=None, extractor=None, fitted=True)
63
69
  >>> predictions = detector.predict(test)
64
70
  """ # noqa: E501
65
71
 
@@ -76,8 +82,9 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
76
82
  Number of k-fold repeats.
77
83
  n_std : float, default 2.0
78
84
  Threshold multiplier for standard deviations above null mean.
79
- threshold_perc : float, default 95.0
80
- Percentile-based threshold (alternative to n_std).
85
+ Used when threshold_perc is None.
86
+ threshold_perc : float or None, default None
87
+ Percentile-based threshold. If provided, overrides n_std.
81
88
  hyperparameters : dict or None, default None
82
89
  LightGBM hyperparameters.
83
90
  extractor : FeatureExtractor or None, default None
@@ -87,7 +94,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
87
94
  n_folds: int = 5
88
95
  n_repeats: int = 5
89
96
  n_std: float = 2.0
90
- threshold_perc: float = 95.0
97
+ threshold_perc: float | None = None
91
98
  hyperparameters: dict[str, Any] | None = None
92
99
  extractor: FeatureExtractor | None = None
93
100
 
@@ -103,8 +110,11 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
103
110
  ) -> None:
104
111
  base_config = config or OODDomainClassifier.Config()
105
112
 
106
- threshold_perc = threshold_perc if threshold_perc is not None else base_config.threshold_perc
107
- super().__init__(threshold_perc)
113
+ self._threshold_perc_set = threshold_perc is not None or (
114
+ config is not None and config.threshold_perc is not None
115
+ )
116
+ perc = threshold_perc if threshold_perc is not None else (base_config.threshold_perc or 95.0)
117
+ super().__init__(perc)
108
118
 
109
119
  self._n_folds = n_folds if n_folds is not None else base_config.n_folds
110
120
  self._n_repeats = n_repeats if n_repeats is not None else base_config.n_repeats
@@ -115,7 +125,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
115
125
  n_folds=self._n_folds,
116
126
  n_repeats=self._n_repeats,
117
127
  n_std=self._n_std,
118
- threshold_perc=threshold_perc,
128
+ threshold_perc=threshold_perc if threshold_perc is not None else base_config.threshold_perc,
119
129
  hyperparameters=self._hyperparameters,
120
130
  extractor=self._extractor,
121
131
  )
@@ -177,6 +187,12 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
177
187
  self._ref_score = self.score(reference_data)
178
188
  return self
179
189
 
190
+ def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
191
+ """Get the threshold score. Prefers n_std threshold unless threshold_perc was explicitly set."""
192
+ if not self._threshold_perc_set and ood_type == "instance":
193
+ return np.float64(self._threshold)
194
+ return super()._threshold_score(ood_type)
195
+
180
196
  def _score(self, x: NDArray[np.float32], batch_size: int | None = None) -> OODScoreOutput: # noqa: ARG002
181
197
  """Compute per-point class-1 rates for test data vs reference."""
182
198
  x_ref = self._reference_data
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes