dataeval 1.0.5__tar.gz → 1.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataeval-1.0.5 → dataeval-1.0.6}/PKG-INFO +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/pyproject.toml +18 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_metadata.py +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_version.py +2 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_balance.py +19 -16
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_diversity.py +4 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/_parity.py +0 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_ber.py +10 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_clusterer.py +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_coverage.py +31 -13
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_feature_distance.py +4 -3
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_parity.py +3 -3
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_metadata_insights.py +24 -20
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_mst.py +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_mutual_info.py +20 -20
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_uap.py +2 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_sufficiency.py +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_duplicates.py +1 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_outliers.py +2 -2
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/scope/_prioritize.py +6 -3
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_base.py +2 -1
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_domain_classifier.py +25 -9
- {dataeval-1.0.5 → dataeval-1.0.6}/.gitignore +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/LICENSE +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/README.md +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_embeddings.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_experimental.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_helpers.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_log.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/_warm_cache.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/bias/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/config.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_bin.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_base.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_cache.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_dimensionstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_hashstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_pixelstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_register.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_registry.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_calculators/_visualstats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_completeness.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_compute_ratios.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_compute_stats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_divergence.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_diversity.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_cluster_trees.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_disjoint_set.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_fast_hdbscan/_mst.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_hash.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_errors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_label_stats.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_nullmodel.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_parity.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/core/_rank.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/exceptions.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_bovw.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_flatten.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_onnx.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_torch.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/extractors/_uncertainty.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/flags.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_aggregator.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/_output.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/performance/schedules.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/protocols.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/py.typed +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/quality/_shared.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/scope/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_classbalance.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_classfilter.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_indices.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_limit.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_reverse.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_select.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/selection/_shuffle.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_chunk.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_domain_classifier.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_kneighbors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_mmd.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_reconstruction.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_drift/_univariate.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_base.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_kneighbors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_ood/_reconstruction.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_domain_classifier.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_kneighbors.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/_shared/_reconstruction.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/shift/update_strategies.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/types.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/__init__.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/_internal.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/data.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/losses.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/models.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/onnx.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/preprocessing.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/thresholds.py +0 -0
- {dataeval-1.0.5 → dataeval-1.0.6}/src/dataeval/utils/training.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dataeval
|
|
3
|
-
Version: 1.0.5
|
|
3
|
+
Version: 1.0.6
|
|
4
4
|
Summary: DataEval provides a simple interface to characterize image data and its impact on model performance across classification and object-detection tasks
|
|
5
5
|
Project-URL: Homepage, https://dataeval.ai/
|
|
6
6
|
Project-URL: Repository, https://github.com/aria-ml/dataeval/
|
|
@@ -114,17 +114,23 @@ docs = [
|
|
|
114
114
|
"sphinx-tabs>=3.4.7",
|
|
115
115
|
"Sphinx>=7.2.6,<9.0.0", # sphinx-immaterial <= 0.13.9 is not compatible with sphinx >=9.0
|
|
116
116
|
"torchmetrics>=1.0.0",
|
|
117
|
-
"torchvision>=0.17.0",
|
|
118
117
|
"markupsafe>=3,<3.0.2",
|
|
119
118
|
"jupytext>=1.19.1",
|
|
120
119
|
]
|
|
121
120
|
security = [ # keep in sync with [tool.uv.constraint-dependencies]
|
|
122
121
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
123
122
|
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
123
|
+
"onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
|
|
124
|
+
# CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
|
|
125
|
+
# CVE-2026-27489: Vulnerable to Path Traversal via Symlink
|
|
126
|
+
# GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_dat
|
|
124
127
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
128
|
+
"poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
|
|
125
129
|
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
126
130
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
127
131
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
132
|
+
"tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
|
|
133
|
+
# CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
|
|
128
134
|
]
|
|
129
135
|
dev = [
|
|
130
136
|
{ include-group = "base" },
|
|
@@ -150,10 +156,17 @@ conflicts = [
|
|
|
150
156
|
constraint-dependencies = [
|
|
151
157
|
"cryptography>=46.0.5", # CVE-2026-26007: Missing Subgroup Validation for SECT Curves
|
|
152
158
|
"filelock>=3.20.3", # GHSA-w853-jp5j-5j7f, GHSA-qmgc-5h2g-mvrw
|
|
159
|
+
"onnx>=1.21.0", # CVE-2026-28500: Untrusted Model Repository Warnings Suppressed by silent=True
|
|
160
|
+
# CVE-2026-34445: Malicious ONNX models can crash servers by exploiting unprotected object settings
|
|
161
|
+
# CVE-2026-27489: Vulnerable to Path Traversal via Symlink
|
|
162
|
+
# GHSA-q56x-g2fj-4rj6: TOCTOU arbitrary file read/write in save_external_dat
|
|
153
163
|
"pillow>=12.1.1", # CVE-2026-25990: OOB write via PSD image
|
|
164
|
+
"poetry>=2.3.3", # CVE-2026-34591: Poetry Has Wheel Path Traversal Which Can Lead to Arbitrary File Write
|
|
154
165
|
"protobuf>=6.33.5", # GHSA-7gcm-g887-7qv7
|
|
155
166
|
"setuptools>=82.0.0", # CVE-2026-23949: (jaraco_context) path traversal in tarball()
|
|
156
167
|
# CVE-2026-24049: (wheel) privilege escalation via unpack
|
|
168
|
+
"tornado>=6.5.5", # CVE-2026-31958: Tornado is vulnerable to DoS due to too many multipart parts
|
|
169
|
+
# CVE-2026-35536: Tornado has cookie attribute injection via .RequestHandler.set_cookie
|
|
157
170
|
]
|
|
158
171
|
|
|
159
172
|
[[tool.uv.index]]
|
|
@@ -211,6 +224,9 @@ version-file = "src/dataeval/_version.py"
|
|
|
211
224
|
[tool.poetry]
|
|
212
225
|
version = "0.0.0" # unused
|
|
213
226
|
|
|
227
|
+
[tool.poetry.dependencies]
|
|
228
|
+
python = ">=3.10,<3.15"
|
|
229
|
+
|
|
214
230
|
[tool.pyproject2conda.dependencies]
|
|
215
231
|
numpy = { skip = true, packages = "numpy>=1.24.2" }
|
|
216
232
|
scikit-learn = { skip = true, packages = "scikit-learn>=1.5.0" }
|
|
@@ -307,6 +323,7 @@ max-complexity = 5
|
|
|
307
323
|
convention = "numpy"
|
|
308
324
|
|
|
309
325
|
[tool.ruff.format]
|
|
326
|
+
preview = true
|
|
310
327
|
quote-style = "double"
|
|
311
328
|
indent-style = "space"
|
|
312
329
|
skip-magic-trailing-comma = false
|
|
@@ -650,7 +650,7 @@ class Metadata(Array, FeatureExtractor):
|
|
|
650
650
|
-------
|
|
651
651
|
Sequence[str]
|
|
652
652
|
List of factor names that passed filtering and preprocessing steps.
|
|
653
|
-
Order matches columns in factor_data
|
|
653
|
+
Order matches columns in factor_data.
|
|
654
654
|
|
|
655
655
|
Notes
|
|
656
656
|
-----
|
|
@@ -18,7 +18,7 @@ version_tuple: tuple[int | str, ...]
|
|
|
18
18
|
commit_id: str | None
|
|
19
19
|
__commit_id__: str | None
|
|
20
20
|
|
|
21
|
-
__version__ = version = '1.0.5'
|
|
22
|
-
__version_tuple__ = version_tuple = (1, 0, 5)
|
|
21
|
+
__version__ = version = '1.0.6'
|
|
22
|
+
__version_tuple__ = version_tuple = (1, 0, 6)
|
|
23
23
|
|
|
24
24
|
__commit_id__ = commit_id = None
|
|
@@ -22,28 +22,30 @@ class BalanceOutput(DictOutput):
|
|
|
22
22
|
"""
|
|
23
23
|
Output class for the :class:`.Balance` :term:`bias<Bias>` evaluator.
|
|
24
24
|
|
|
25
|
-
Contains three polars DataFrames with mutual information scores and threshold flags.
|
|
25
|
+
Contains three polars DataFrames with normalized mutual information scores and threshold flags.
|
|
26
26
|
|
|
27
27
|
Attributes
|
|
28
28
|
----------
|
|
29
29
|
balance : pl.DataFrame
|
|
30
|
-
DataFrame with global class-to-factor mutual information:
|
|
30
|
+
DataFrame with global class-to-factor normalized mutual information:
|
|
31
31
|
|
|
32
|
-
- factor_name: str - Name of the metadata factor
|
|
33
|
-
|
|
32
|
+
- factor_name: str - Name of the metadata factor. Includes "class_label"
|
|
33
|
+
which represents the self-information (always 1.0).
|
|
34
|
+
- mi_value: float - Normalized mutual information value between this
|
|
35
|
+
factor and class labels
|
|
34
36
|
factors : pl.DataFrame
|
|
35
|
-
DataFrame with inter-factor mutual information correlations:
|
|
37
|
+
DataFrame with inter-factor normalized mutual information correlations:
|
|
36
38
|
|
|
37
39
|
- factor1: str - Name of the first factor
|
|
38
40
|
- factor2: str - Name of the second factor
|
|
39
|
-
- mi_value: float -
|
|
41
|
+
- mi_value: float - Normalized mutual information value
|
|
40
42
|
- is_correlated: bool - True if mi_value > factor_correlation_threshold
|
|
41
43
|
classwise : pl.DataFrame
|
|
42
|
-
DataFrame with per-class-to-factor mutual information:
|
|
44
|
+
DataFrame with per-class-to-factor normalized mutual information:
|
|
43
45
|
|
|
44
46
|
- class_name: str - Name of the class
|
|
45
47
|
- factor_name: str - Name of the metadata factor
|
|
46
|
-
- mi_value: float -
|
|
48
|
+
- mi_value: float - Normalized mutual information value
|
|
47
49
|
- is_imbalanced: bool - True if mi_value > class_imbalance_threshold
|
|
48
50
|
"""
|
|
49
51
|
|
|
@@ -58,21 +60,21 @@ class BalanceOutput(DictOutput):
|
|
|
58
60
|
|
|
59
61
|
class Balance(Evaluator):
|
|
60
62
|
"""
|
|
61
|
-
Computes mutual information (
|
|
63
|
+
Computes normalized mutual information (NMI) between factors (class label, metadata, label/image properties).
|
|
62
64
|
|
|
63
65
|
Identifies imbalanced classes and highly correlated metadata factors based on
|
|
64
|
-
|
|
66
|
+
NMI thresholds.
|
|
65
67
|
|
|
66
68
|
Parameters
|
|
67
69
|
----------
|
|
68
70
|
num_neighbors : int, default 5
|
|
69
71
|
Number of points to consider as neighbors
|
|
70
72
|
class_imbalance_threshold : float, default 0.3
|
|
71
|
-
Threshold for identifying imbalanced classes. Classes with
|
|
73
|
+
Threshold for identifying imbalanced classes. Classes with NMI above this
|
|
72
74
|
threshold with any metadata factor are considered imbalanced.
|
|
73
75
|
factor_correlation_threshold : float, default 0.5
|
|
74
76
|
Threshold for identifying highly correlated metadata factors. Factor pairs
|
|
75
|
-
with
|
|
77
|
+
with NMI above this threshold are considered highly correlated.
|
|
76
78
|
|
|
77
79
|
Attributes
|
|
78
80
|
----------
|
|
@@ -89,7 +91,8 @@ class Balance(Evaluator):
|
|
|
89
91
|
-----
|
|
90
92
|
We use `mutual_info_classif` from sklearn since class label is categorical.
|
|
91
93
|
`mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
|
|
92
|
-
seed. MI is computed differently for categorical and continuous variables
|
|
94
|
+
seed. MI is computed differently for categorical and continuous variables, and
|
|
95
|
+
in all cases normalized or transformed to [0, 1] prior to being returned.
|
|
93
96
|
|
|
94
97
|
Examples
|
|
95
98
|
--------
|
|
@@ -149,7 +152,7 @@ class Balance(Evaluator):
|
|
|
149
152
|
@set_metadata(state=["num_neighbors", "class_imbalance_threshold", "factor_correlation_threshold"])
|
|
150
153
|
def evaluate(self, data: AnnotatedDataset[Any] | MetadataLike) -> BalanceOutput: # noqa: C901
|
|
151
154
|
"""
|
|
152
|
-
Compute mutual information between factors and identify imbalanced classes.
|
|
155
|
+
Compute normalized mutual information between factors and identify imbalanced classes.
|
|
153
156
|
|
|
154
157
|
Parameters
|
|
155
158
|
----------
|
|
@@ -160,7 +163,7 @@ class Balance(Evaluator):
|
|
|
160
163
|
Returns
|
|
161
164
|
-------
|
|
162
165
|
BalanceOutput
|
|
163
|
-
Three DataFrames containing
|
|
166
|
+
Three DataFrames containing NMI scores and threshold flags:
|
|
164
167
|
|
|
165
168
|
- balance: Global class-to-factor mutual information
|
|
166
169
|
- factors: Inter-factor mutual information
|
|
@@ -168,7 +171,7 @@ class Balance(Evaluator):
|
|
|
168
171
|
|
|
169
172
|
Example
|
|
170
173
|
-------
|
|
171
|
-
Return balance (
|
|
174
|
+
Return balance (NMI) of factors with class_labels
|
|
172
175
|
|
|
173
176
|
>>> from dataeval import Metadata
|
|
174
177
|
>>> metadata = Metadata(dataset)
|
|
@@ -56,7 +56,7 @@ class Diversity(Evaluator):
|
|
|
56
56
|
Through standard histogram binning, for continuous variables.
|
|
57
57
|
|
|
58
58
|
The method specified defines diversity as the inverse Simpson diversity index linearly rescaled to
|
|
59
|
-
the unit interval, or the normalized form of the Shannon entropy.
|
|
59
|
+
the unit interval [0, 1], or the normalized form of the Shannon entropy.
|
|
60
60
|
|
|
61
61
|
diversity = 1 implies that samples are evenly distributed across a particular factor
|
|
62
62
|
diversity = 0 implies that all samples belong to one category/bin
|
|
@@ -66,7 +66,9 @@ class Diversity(Evaluator):
|
|
|
66
66
|
Parameters
|
|
67
67
|
----------
|
|
68
68
|
method : "simpson" or "shannon", default "simpson"
|
|
69
|
-
The methodology used for defining diversity
|
|
69
|
+
The methodology used for defining diversity. When "simpson" is used,
|
|
70
|
+
the index is linearly rescaled so that 1.0 represents maximum diversity
|
|
71
|
+
(even distribution) and 0.0 represents minimum diversity (all samples in one bin).
|
|
70
72
|
threshold : float, default 0.5
|
|
71
73
|
Threshold for identifying low diversity. Factors with diversity values
|
|
72
74
|
at or below this threshold are flagged as having low diversity.
|
|
@@ -118,8 +118,6 @@ class Parity(Evaluator):
|
|
|
118
118
|
|
|
119
119
|
>>> config = Parity.Config(score_threshold=0.4, p_value_threshold=0.01)
|
|
120
120
|
>>> parity = Parity(config=config)
|
|
121
|
-
|
|
122
|
-
output = parity(metadata.binned_data, metadata.class_labels.tolist())
|
|
123
121
|
"""
|
|
124
122
|
|
|
125
123
|
class Config(EvaluatorConfig):
|
|
@@ -78,6 +78,8 @@ def ber_mst(embeddings: ArrayND[float], class_labels: Array1D[int]) -> BERResult
|
|
|
78
78
|
"""
|
|
79
79
|
Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using a minimum spanning tree.
|
|
80
80
|
|
|
81
|
+
BER bounds the irreducible classification error given the current feature
|
|
82
|
+
representation — the error attributable to class overlap in embedding space.
|
|
81
83
|
Uses FR with a minimum spanning tree (MST) test statistic basis.
|
|
82
84
|
|
|
83
85
|
Parameters
|
|
@@ -137,7 +139,13 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
|
|
|
137
139
|
"""
|
|
138
140
|
Estimate Multi-class :term:`Bayes error rate<Bayes Error Rate (BER)>` using KNN.
|
|
139
141
|
|
|
140
|
-
|
|
142
|
+
BER bounds the irreducible classification error given the current feature
|
|
143
|
+
representation — the error attributable to class overlap in embedding space.
|
|
144
|
+
Uses KNN test statistic basis. The estimator's behavior depends on the value of k:
|
|
145
|
+
- k=1: Uses 1-NN for the lower bound and 2-NN for the upper bound.
|
|
146
|
+
- k=2: Uses 2-NN for the lower bound and 3-NN for the upper bound.
|
|
147
|
+
- 2<k<=5: Uses k-NN for the lower bound and (k+1)-NN for the upper bound.
|
|
148
|
+
- k>5: Only available for binary classification; uses k-NN for both bounds with specialized asymptotic weights.
|
|
141
149
|
|
|
142
150
|
Parameters
|
|
143
151
|
----------
|
|
@@ -146,7 +154,7 @@ def ber_knn(embeddings: ArrayND[float], class_labels: Array1D[int], k: int) -> B
|
|
|
146
154
|
class_labels : Array1D[int]
|
|
147
155
|
Array of class labels for each image. Can be a 1D list, or array-like object.
|
|
148
156
|
k : int
|
|
149
|
-
Number of nearest neighbors for KNN estimator
|
|
157
|
+
Number of nearest neighbors for KNN estimator. Should be between 1 and the number of samples.
|
|
150
158
|
|
|
151
159
|
Returns
|
|
152
160
|
-------
|
|
@@ -241,7 +241,7 @@ class _HDBSCANSorter:
|
|
|
241
241
|
n_samples_per_cluster = np.bincount(labels)
|
|
242
242
|
_logger.debug(
|
|
243
243
|
"HDBSCAN clustering complete: %d clusters, samples per cluster: min=%d, max=%d, mean=%.1f",
|
|
244
|
-
clst.unique_clusters,
|
|
244
|
+
len(clst.unique_clusters),
|
|
245
245
|
np.min(n_samples_per_cluster),
|
|
246
246
|
np.max(n_samples_per_cluster),
|
|
247
247
|
np.mean(n_samples_per_cluster),
|
|
@@ -6,8 +6,8 @@ from typing import TypedDict
|
|
|
6
6
|
|
|
7
7
|
import numpy as np
|
|
8
8
|
from numpy.typing import NDArray
|
|
9
|
-
from scipy.spatial.distance import pdist, squareform
|
|
10
9
|
|
|
10
|
+
from dataeval.core._mst import _compute_nearest_neighbors
|
|
11
11
|
from dataeval.types import Array2D
|
|
12
12
|
from dataeval.utils._internal import as_numpy, ensure_embeddings, flatten_samples
|
|
13
13
|
|
|
@@ -22,19 +22,24 @@ class CoverageResult(TypedDict):
|
|
|
22
22
|
----------
|
|
23
23
|
uncovered_indices : NDArray[np.intp]
|
|
24
24
|
Array of indices for uncovered observations
|
|
25
|
-
critical_value_radii : NDArray[np.float64]
|
|
25
|
+
critical_value_radii : NDArray[np.float32]
|
|
26
26
|
Array of critical value radii for each observation
|
|
27
27
|
coverage_radius : float
|
|
28
28
|
The radius threshold for coverage
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
31
|
uncovered_indices: NDArray[np.intp]
|
|
32
|
-
critical_value_radii: NDArray[np.float64]
|
|
32
|
+
critical_value_radii: NDArray[np.float32]
|
|
33
33
|
coverage_radius: float
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
def _validate_inputs(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float64]:
|
|
37
|
-
embeddings
|
|
36
|
+
def _validate_inputs(
|
|
37
|
+
embeddings: NDArray[np.float64],
|
|
38
|
+
num_observations: int,
|
|
39
|
+
force_unit_interval: bool = False,
|
|
40
|
+
) -> NDArray[np.float64]:
|
|
41
|
+
unit_interval = "force" if force_unit_interval else True
|
|
42
|
+
embeddings = ensure_embeddings(embeddings, dtype=np.float64, unit_interval=unit_interval)
|
|
38
43
|
if len(embeddings) <= num_observations:
|
|
39
44
|
raise ValueError(
|
|
40
45
|
f"Length of embeddings ({len(embeddings)}) is less than or equal to the specified number of \
|
|
@@ -43,15 +48,17 @@ def _validate_inputs(embeddings: NDArray[np.float64], num_observations: int) ->
|
|
|
43
48
|
return embeddings
|
|
44
49
|
|
|
45
50
|
|
|
46
|
-
def _calculate_critical_value_radii(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float64]:
|
|
47
|
-
embeddings_matrix =
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
def _calculate_critical_value_radii(embeddings: NDArray[np.float64], num_observations: int) -> NDArray[np.float32]:
|
|
52
|
+
_, embeddings_matrix = _compute_nearest_neighbors(
|
|
53
|
+
flatten_samples(embeddings), None, num_observations, return_distances=True
|
|
54
|
+
)
|
|
55
|
+
return embeddings_matrix[:, -1]
|
|
50
56
|
|
|
51
57
|
|
|
52
58
|
def coverage_naive(
|
|
53
59
|
embeddings: Array2D[float],
|
|
54
60
|
num_observations: int,
|
|
61
|
+
force_unit_interval: bool = False,
|
|
55
62
|
) -> CoverageResult:
|
|
56
63
|
"""
|
|
57
64
|
Evaluate :term:`coverage<Coverage>` using a naive radius calculation method.
|
|
@@ -68,6 +75,9 @@ def coverage_naive(
|
|
|
68
75
|
num_observations : int
|
|
69
76
|
Number of observations required in order to be covered.
|
|
70
77
|
[1] suggests that a minimum of 20-50 samples is necessary.
|
|
78
|
+
force_unit_interval : bool, default False
|
|
79
|
+
If True, embeddings will be automatically rescaled to the unit interval [0, 1].
|
|
80
|
+
If False, a ValueError is raised if embeddings are outside [0, 1].
|
|
71
81
|
|
|
72
82
|
Returns
|
|
73
83
|
-------
|
|
@@ -81,7 +91,7 @@ def coverage_naive(
|
|
|
81
91
|
Raises
|
|
82
92
|
------
|
|
83
93
|
ValueError
|
|
84
|
-
If embeddings are not unit interval [0-1]
|
|
94
|
+
If embeddings are not unit interval [0-1] and force_unit_interval is False
|
|
85
95
|
ValueError
|
|
86
96
|
If length of :term:`embeddings<Embeddings>` is less than or equal to num_observations
|
|
87
97
|
|
|
@@ -101,7 +111,9 @@ def coverage_naive(
|
|
|
101
111
|
"""
|
|
102
112
|
_logger.info("Starting coverage_naive calculation with num_observations=%d", num_observations)
|
|
103
113
|
|
|
104
|
-
embeddings_np = _validate_inputs(as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations)
|
|
114
|
+
embeddings_np = _validate_inputs(
|
|
115
|
+
as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations, force_unit_interval
|
|
116
|
+
)
|
|
105
117
|
_logger.debug("Embeddings shape: %s", embeddings_np.shape)
|
|
106
118
|
|
|
107
119
|
critical_value_radii = _calculate_critical_value_radii(embeddings_np, num_observations)
|
|
@@ -132,6 +144,7 @@ def coverage_adaptive(
|
|
|
132
144
|
embeddings: Array2D[float],
|
|
133
145
|
num_observations: int,
|
|
134
146
|
percent: float,
|
|
147
|
+
force_unit_interval: bool = False,
|
|
135
148
|
) -> CoverageResult:
|
|
136
149
|
"""
|
|
137
150
|
Evaluate :term:`coverage<Coverage>` using an adaptive radius calculation method.
|
|
@@ -150,6 +163,9 @@ def coverage_adaptive(
|
|
|
150
163
|
[1] suggests that a minimum of 20-50 samples is necessary.
|
|
151
164
|
percent : float
|
|
152
165
|
Percent of observations to be considered uncovered. Should be between 0 and 1.
|
|
166
|
+
force_unit_interval : bool, default False
|
|
167
|
+
If True, embeddings will be automatically rescaled to the unit interval [0, 1].
|
|
168
|
+
If False, a ValueError is raised if embeddings are outside [0, 1].
|
|
153
169
|
|
|
154
170
|
Returns
|
|
155
171
|
-------
|
|
@@ -163,7 +179,7 @@ def coverage_adaptive(
|
|
|
163
179
|
Raises
|
|
164
180
|
------
|
|
165
181
|
ValueError
|
|
166
|
-
If embeddings are not unit interval [0-1]
|
|
182
|
+
If embeddings are not unit interval [0-1] and force_unit_interval is False
|
|
167
183
|
ValueError
|
|
168
184
|
If length of :term:`embeddings<Embeddings>` is less than or equal to num_observations
|
|
169
185
|
|
|
@@ -188,7 +204,9 @@ def coverage_adaptive(
|
|
|
188
204
|
percent,
|
|
189
205
|
)
|
|
190
206
|
|
|
191
|
-
embeddings = _validate_inputs(as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations)
|
|
207
|
+
embeddings = _validate_inputs(
|
|
208
|
+
as_numpy(embeddings, dtype=np.float64, required_ndim=2), num_observations, force_unit_interval
|
|
209
|
+
)
|
|
192
210
|
_logger.debug("Embeddings shape: %s", embeddings.shape)
|
|
193
211
|
|
|
194
212
|
critical_value_radii = _calculate_critical_value_radii(embeddings, num_observations)
|
|
@@ -33,7 +33,8 @@ class FeatureDistanceResult(TypedDict):
|
|
|
33
33
|
location : float
|
|
34
34
|
The normalized location where the KS statistic was achieved
|
|
35
35
|
dist : float
|
|
36
|
-
The
|
|
36
|
+
The Wasserstein distance between distributions, scaled by the
|
|
37
|
+
Interquartile Range (IQR) of the reference distribution.
|
|
37
38
|
p_value : float
|
|
38
39
|
The p-value from the KS test
|
|
39
40
|
"""
|
|
@@ -67,7 +68,7 @@ def feature_distance(
|
|
|
67
68
|
Measure the feature-wise distance between two continuous distributions.
|
|
68
69
|
|
|
69
70
|
Computes a p-value to evaluate its significance.
|
|
70
|
-
Uses the
|
|
71
|
+
Uses the Kolmogorov-Smirnov two-sample test and an IQR-scaled Wasserstein distance, featurewise.
|
|
71
72
|
|
|
72
73
|
Parameters
|
|
73
74
|
----------
|
|
@@ -83,7 +84,7 @@ def feature_distance(
|
|
|
83
84
|
|
|
84
85
|
- statistic: float - The Kolmogorov-Smirnov test statistic
|
|
85
86
|
- location: float - The normalized location where the KS statistic was achieved
|
|
86
|
-
- dist: float - The
|
|
87
|
+
- dist: float - The IQR-scaled Wasserstein distance between distributions
|
|
87
88
|
- p_value: float - The p-value from the KS test
|
|
88
89
|
|
|
89
90
|
See Also
|
|
@@ -194,9 +194,9 @@ def label_parity(
|
|
|
194
194
|
f"Found {len(observed_dist)} unique classes in observed label distribution, "
|
|
195
195
|
f"but found {len(expected_dist)} unique classes in expected label distribution. "
|
|
196
196
|
"This can happen when some class ids have zero instances in one dataset but "
|
|
197
|
-
"not in the other.
|
|
198
|
-
"
|
|
199
|
-
"
|
|
197
|
+
"not in the other. Try setting the num_classes parameter to the known number "
|
|
198
|
+
"of unique class ids, so that classes with zero instances are still included "
|
|
199
|
+
"in the distributions.",
|
|
200
200
|
)
|
|
201
201
|
|
|
202
202
|
cs, p = chisquare(f_obs=observed_dist, f_exp=expected_dist)
|
|
@@ -16,12 +16,6 @@ from dataeval.protocols import SequenceLike
|
|
|
16
16
|
|
|
17
17
|
_logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
|
-
_NATS2BITS = 1.442695
|
|
20
|
-
"""
|
|
21
|
-
_NATS2BITS is the reciprocal of natural log of 2. If you have an information/entropy-type quantity measured in nats,
|
|
22
|
-
which is what many library functions return, multiply it by _NATS2BITS to get it in bits.
|
|
23
|
-
"""
|
|
24
|
-
|
|
25
19
|
|
|
26
20
|
def _calc_median_deviations(reference: NDArray[Any], test: NDArray[Any]) -> NDArray[Any]:
|
|
27
21
|
"""
|
|
@@ -189,12 +183,14 @@ def factor_predictors( # noqa: C901
|
|
|
189
183
|
discrete_features: list[bool] | None = None,
|
|
190
184
|
) -> Mapping[str, float]:
|
|
191
185
|
"""
|
|
192
|
-
Compute mutual information between metadata factors and flagged sample indices.
|
|
186
|
+
Compute a measure of mutual information between metadata factors and flagged sample indices.
|
|
193
187
|
|
|
194
188
|
Given a set of metadata factors per sample and indices of flagged samples, this function
|
|
195
189
|
calculates the mutual information between each factor and the flagged status.
|
|
196
190
|
In other words, it finds which metadata factors most likely correlate to a
|
|
197
|
-
flagged sample (e.g., outliers, OOD samples, or other anomalies).
|
|
191
|
+
flagged sample (e.g., outliers, OOD samples, or other anomalies). The maximum possible MI
|
|
192
|
+
is equal to the entropy of the flagged indices, so we normalize by that entropy in order
|
|
193
|
+
to return a measure of association on a scale from 0 to 1.
|
|
198
194
|
|
|
199
195
|
Parameters
|
|
200
196
|
----------
|
|
@@ -213,14 +209,15 @@ def factor_predictors( # noqa: C901
|
|
|
213
209
|
-------
|
|
214
210
|
Mapping[str, float]
|
|
215
211
|
A map with keys corresponding to factor names, and values indicating the strength of association
|
|
216
|
-
between each named factor and the flagged status, as mutual information
|
|
212
|
+
between each named factor and the flagged status, as normalized mutual information.
|
|
217
213
|
Returns dict with 0.0 values for all factors if no indices are provided.
|
|
218
214
|
|
|
219
215
|
Notes
|
|
220
216
|
-----
|
|
221
217
|
A high mutual information between a factor and flagged samples is an indication of correlation,
|
|
222
218
|
but not causation. Additional analysis should be done to determine how to handle factors
|
|
223
|
-
with a high mutual information.
|
|
219
|
+
with a high mutual information. And note that "high" is always relative to the information
|
|
220
|
+
or entropy represented by the flagged indices, which is why we use that entropy to normalize.
|
|
224
221
|
|
|
225
222
|
Examples
|
|
226
223
|
--------
|
|
@@ -230,7 +227,7 @@ def factor_predictors( # noqa: C901
|
|
|
230
227
|
... }
|
|
231
228
|
>>> indices = [2, 3, 4] # Flag last three samples
|
|
232
229
|
>>> factor_predictors(factors, indices)
|
|
233
|
-
{'time': 0.
|
|
230
|
+
{'time': 0.866750699769533, 'altitude': 0.0}
|
|
234
231
|
"""
|
|
235
232
|
if not factors:
|
|
236
233
|
raise ValueError("factors dictionary cannot be empty")
|
|
@@ -266,15 +263,22 @@ def factor_predictors( # noqa: C901
|
|
|
266
263
|
f"discrete_features length ({len(discrete_features)}) must match number of factors ({len(factor_names)})",
|
|
267
264
|
)
|
|
268
265
|
|
|
269
|
-
mutual_info_values = (
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
n_jobs=get_max_processes(), # type: ignore
|
|
276
|
-
)
|
|
277
|
-
* _NATS2BITS
|
|
266
|
+
mutual_info_values = mutual_info_classif(
|
|
267
|
+
X=scaled_data,
|
|
268
|
+
y=sample_mask,
|
|
269
|
+
discrete_features=discrete_features, # type: ignore - sklearn function not typed
|
|
270
|
+
random_state=get_seed(),
|
|
271
|
+
n_jobs=get_max_processes(), # type: ignore
|
|
278
272
|
)
|
|
279
273
|
|
|
274
|
+
# We normalize the mutual info by the entropy of the flag, i.e. by its maximal
|
|
275
|
+
# information content. This yields a true measure of the strength of
|
|
276
|
+
# association between metadata factors and the flag, from 0 to 1.
|
|
277
|
+
if 0 < (frac_flagged := len(indices) / n_samples) < 1:
|
|
278
|
+
flagged_entropy = -(frac_flagged * np.log(frac_flagged) + (1 - frac_flagged) * np.log(1 - frac_flagged))
|
|
279
|
+
mutual_info_values = np.clip(mutual_info_values / flagged_entropy, 0, 1)
|
|
280
|
+
else:
|
|
281
|
+
# all or none are flagged, no MI possible.
|
|
282
|
+
mutual_info_values = np.zeros_like(mutual_info_values)
|
|
283
|
+
|
|
280
284
|
return {k: mutual_info_values[i] for i, k in enumerate(factor_names)}
|
|
@@ -19,7 +19,7 @@ _logger = logging.getLogger(__name__)
|
|
|
19
19
|
|
|
20
20
|
class MutualInfoResult(TypedDict):
|
|
21
21
|
"""
|
|
22
|
-
Type definition for mutual information output.
|
|
22
|
+
Type definition for normalized mutual information output.
|
|
23
23
|
|
|
24
24
|
Attributes
|
|
25
25
|
----------
|
|
@@ -60,18 +60,18 @@ def _merge_labels_and_factors(
|
|
|
60
60
|
factor_data: NDArray[np.intp],
|
|
61
61
|
discrete_features: Iterable[bool] | None,
|
|
62
62
|
) -> tuple[NDArray[np.intp], list[bool]]:
|
|
63
|
-
|
|
63
|
+
discrete_list = [True] + (
|
|
64
64
|
[not is_continuous(d) for d in factor_data.T] if discrete_features is None else list(discrete_features)
|
|
65
65
|
)
|
|
66
66
|
|
|
67
67
|
# Use numeric data for MI
|
|
68
68
|
data = np.hstack((class_labels[:, np.newaxis], factor_data))
|
|
69
69
|
# Present discrete features composed of distinct values as continuous for `mutual_info_classif`
|
|
70
|
-
for i in range(len(
|
|
70
|
+
for i in range(len(discrete_list)):
|
|
71
71
|
if len(data) == len(np.unique(data[:, i])):
|
|
72
|
-
|
|
72
|
+
discrete_list[i] = False
|
|
73
73
|
|
|
74
|
-
return data,
|
|
74
|
+
return data, discrete_list
|
|
75
75
|
|
|
76
76
|
|
|
77
77
|
def mutual_info( # noqa: C901
|
|
@@ -81,7 +81,7 @@ def mutual_info( # noqa: C901
|
|
|
81
81
|
num_neighbors: int = 5,
|
|
82
82
|
) -> MutualInfoResult:
|
|
83
83
|
"""
|
|
84
|
-
Compute mutual information between factors, transformed to lie in [0, 1].
|
|
84
|
+
Compute normalized mutual information between factors, transformed to lie in [0, 1].
|
|
85
85
|
|
|
86
86
|
Factors include class label, metadata, and label/image properties.
|
|
87
87
|
|
|
@@ -101,8 +101,8 @@ def mutual_info( # noqa: C901
|
|
|
101
101
|
MutualInfoResult
|
|
102
102
|
TypedDict containing:
|
|
103
103
|
|
|
104
|
-
- class_to_factor: NDArray[np.float64] - 1D array of MI between class labels and each factor
|
|
105
|
-
- interfactor: NDArray[np.float64] - (num_factors) x (num_factors) matrix of MI between factors only
|
|
104
|
+
- class_to_factor: NDArray[np.float64] - 1D array of normalized MI between class labels and each factor
|
|
105
|
+
- interfactor: NDArray[np.float64] - (num_factors) x (num_factors) matrix of normalized MI between factors only
|
|
106
106
|
|
|
107
107
|
Notes
|
|
108
108
|
-----
|
|
@@ -120,7 +120,7 @@ def mutual_info( # noqa: C901
|
|
|
120
120
|
|
|
121
121
|
Example
|
|
122
122
|
-------
|
|
123
|
-
Return balance (mutual information) of factors with class_labels
|
|
123
|
+
Return balance (normalized mutual information) of factors with class_labels
|
|
124
124
|
|
|
125
125
|
>>> rng = np.random.default_rng(175)
|
|
126
126
|
>>> class_labels = rng.choice([0, 1, 2], size=100)
|
|
@@ -155,7 +155,7 @@ def mutual_info( # noqa: C901
|
|
|
155
155
|
data, discrete_list = _merge_labels_and_factors(class_labels_np, factor_data_np, discrete_feat_np)
|
|
156
156
|
num_factors = len(discrete_list)
|
|
157
157
|
|
|
158
|
-
_logger.debug("Computing
|
|
158
|
+
_logger.debug("Computing NMI for %d factors (%d discrete)", num_factors, sum(discrete_list))
|
|
159
159
|
|
|
160
160
|
# initialize output matrix
|
|
161
161
|
mi = np.full((num_factors, num_factors), np.nan, dtype=np.float32)
|
|
@@ -195,7 +195,7 @@ def mutual_info( # noqa: C901
|
|
|
195
195
|
full_matrix = 0.5 * (mi + mi.T).astype(np.float64)
|
|
196
196
|
|
|
197
197
|
_logger.info(
|
|
198
|
-
"Mutual info calculation complete: %d factors, mean class_to_factor
|
|
198
|
+
"Mutual info calculation complete: %d factors, mean class_to_factor NMI=%.4f",
|
|
199
199
|
num_factors - 1,
|
|
200
200
|
np.mean(full_matrix[0, 1:]),
|
|
201
201
|
)
|
|
@@ -208,12 +208,12 @@ def mutual_info( # noqa: C901
|
|
|
208
208
|
|
|
209
209
|
def mutual_info_classwise(
|
|
210
210
|
class_labels: Array1D[int],
|
|
211
|
-
factor_data: Array2D[int],
|
|
211
|
+
factor_data: Array2D[int | float],
|
|
212
212
|
discrete_features: Array1D[bool] | None = None,
|
|
213
213
|
num_neighbors: int = 5,
|
|
214
214
|
) -> NDArray[np.float64]:
|
|
215
215
|
"""
|
|
216
|
-
Compute mutual information (
|
|
216
|
+
Compute normalized mutual information (NMI) between factors.
|
|
217
217
|
|
|
218
218
|
Factors include class label, metadata, and label/image properties.
|
|
219
219
|
|
|
@@ -221,7 +221,7 @@ def mutual_info_classwise(
|
|
|
221
221
|
----------
|
|
222
222
|
class_labels : Array1D[int]
|
|
223
223
|
Target class labels as integer indices. Can be a 1D list, or array-like object.
|
|
224
|
-
factor_data : Array2D[int]
|
|
224
|
+
factor_data : Array2D[int | float]
|
|
225
225
|
Factor values after binning or digitization. Can be a 1D list, or array-like object.
|
|
226
226
|
discrete_features : Array1D[bool] | None = None
|
|
227
227
|
Boolean array or iterable defining whether or not the feature set is discretized.
|
|
@@ -232,19 +232,19 @@ def mutual_info_classwise(
|
|
|
232
232
|
Returns
|
|
233
233
|
-------
|
|
234
234
|
NDArray[np.float64]
|
|
235
|
-
(
|
|
235
|
+
(num_classes) x (num_factors+1) estimate of normalized mutual information
|
|
236
236
|
between num_factors metadata factors and class label. Symmetry is enforced.
|
|
237
237
|
|
|
238
238
|
Notes
|
|
239
239
|
-----
|
|
240
240
|
We use `mutual_info_classif` from sklearn since class label is categorical.
|
|
241
241
|
`mutual_info_classif` outputs are consistent up to O(1e-4) and depend on a random
|
|
242
|
-
seed. MI is computed differently for categorical and continuous variables.
|
|
243
|
-
return a transformation of MI onto the interval [0, 1].
|
|
242
|
+
seed. MI is computed differently for categorical and continuous variables. In all cases,
|
|
243
|
+
we return either a normalization or transformation of MI onto the interval [0, 1].
|
|
244
244
|
|
|
245
245
|
Example
|
|
246
246
|
-------
|
|
247
|
-
Return classwise balance (mutual information) of factors with individual class_labels
|
|
247
|
+
Return classwise balance (normalized mutual information) of factors with individual class_labels
|
|
248
248
|
|
|
249
249
|
>>> rng = np.random.default_rng(175)
|
|
250
250
|
>>> class_labels = rng.choice([0, 1, 2], size=100)
|
|
@@ -267,7 +267,7 @@ def mutual_info_classwise(
|
|
|
267
267
|
_logger.info("Starting mutual_info_classwise calculation with num_neighbors=%d", num_neighbors)
|
|
268
268
|
|
|
269
269
|
class_labels_np = as_numpy(class_labels, dtype=np.intp, required_ndim=1)
|
|
270
|
-
factor_data_np = as_numpy(factor_data,
|
|
270
|
+
factor_data_np = as_numpy(factor_data, required_ndim=2)
|
|
271
271
|
discrete_feat_np = opt_as_numpy(discrete_features, dtype=np.bool_, required_ndim=1)
|
|
272
272
|
|
|
273
273
|
num_neighbors = _validate_num_neighbors(num_neighbors)
|
|
@@ -276,7 +276,7 @@ def mutual_info_classwise(
|
|
|
276
276
|
u_classes = np.unique(class_labels_np)
|
|
277
277
|
num_classes = len(u_classes)
|
|
278
278
|
|
|
279
|
-
_logger.debug("Computing classwise
|
|
279
|
+
_logger.debug("Computing classwise NMI for %d classes and %d factors", num_classes, num_factors)
|
|
280
280
|
|
|
281
281
|
# classwise targets (binary indicators)
|
|
282
282
|
tgt_bin = data[:, 0][:, None] == u_classes
|
|
@@ -11,14 +11,14 @@ import logging
|
|
|
11
11
|
from sklearn.metrics import average_precision_score
|
|
12
12
|
|
|
13
13
|
from dataeval._experimental import experimental
|
|
14
|
-
from dataeval.
|
|
14
|
+
from dataeval.protocols import ArrayLike
|
|
15
15
|
from dataeval.utils._internal import as_numpy
|
|
16
16
|
|
|
17
17
|
_logger = logging.getLogger(__name__)
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
@experimental
|
|
21
|
-
def uap(labels:
|
|
21
|
+
def uap(labels: ArrayLike, scores: ArrayLike) -> float:
|
|
22
22
|
"""
|
|
23
23
|
Estimate the empirical mean precision for the upperbound average precision.
|
|
24
24
|
|
|
@@ -988,7 +988,7 @@ class Duplicates(Evaluator):
|
|
|
988
988
|
|
|
989
989
|
Attributes
|
|
990
990
|
----------
|
|
991
|
-
flags : ImageStats, default ImageStats.
|
|
991
|
+
flags : ImageStats, default ImageStats.HASH_DUPLICATES_BASIC
|
|
992
992
|
Statistics to compute for hash-based duplicate detection.
|
|
993
993
|
cluster_sensitivity : float or None, default None
|
|
994
994
|
Distance factor for cluster-based near duplicate detection. Scales
|
|
@@ -978,10 +978,10 @@ class Outliers(Evaluator):
|
|
|
978
978
|
|
|
979
979
|
- ``AdaptiveThreshold`` (default): Uses tail-weighted Double-MAD (separate MAD for
|
|
980
980
|
data below and above the median) with automatic multiplier scaling for heavy
|
|
981
|
-
tails to produce asymmetric bounds. Default multiplier: 3.
|
|
981
|
+
tails to produce asymmetric bounds. Default multiplier: 3.5.
|
|
982
982
|
- ``ModifiedZScoreThreshold``: Based on median absolute deviation. Default multiplier: 3.5.
|
|
983
983
|
Modified z score = :math:`0.6745 * |x_i - x̃| / MAD`
|
|
984
|
-
- ``ZScoreThreshold``: Based on standard deviation from mean. Default multiplier: 3.
|
|
984
|
+
- ``ZScoreThreshold``: Based on standard deviation from mean. Default multiplier: 3.0.
|
|
985
985
|
Z score = :math:`|x_i - \mu| / \sigma`
|
|
986
986
|
- ``IQRThreshold``: Based on interquartile range. Default multiplier: 1.5.
|
|
987
987
|
Outliers are outside :math:`[Q_1 - 1.5 \cdot IQR, Q_3 + 1.5 \cdot IQR]`
|
|
@@ -364,6 +364,9 @@ class Prioritize(Evaluator):
|
|
|
364
364
|
----------
|
|
365
365
|
extractor : FeatureExtractor
|
|
366
366
|
Feature extractor instance to use for extracting embeddings from data.
|
|
367
|
+
batch_size : int or None, default None
|
|
368
|
+
Batch size for embedding computation. When None, uses the global
|
|
369
|
+
batch size from :func:`~dataeval.config.get_batch_size`.
|
|
367
370
|
method : {"knn", "kmeans_distance", "kmeans_complexity", "hdbscan_distance", \
|
|
368
371
|
"hdbscan_complexity"}, default "knn"
|
|
369
372
|
Ranking method to use:
|
|
@@ -470,6 +473,9 @@ class Prioritize(Evaluator):
|
|
|
470
473
|
extractor : FeatureExtractor or None
|
|
471
474
|
Feature extractor instance to use for extracting embeddings
|
|
472
475
|
from data.
|
|
476
|
+
batch_size : int or None, default None
|
|
477
|
+
Batch size for embedding computation. When None, uses the global
|
|
478
|
+
batch size from :func:`~dataeval.config.get_batch_size`.
|
|
473
479
|
method : {"knn", "kmeans_distance", "kmeans_complexity", "hdbscan_distance", \
|
|
474
480
|
"hdbscan_complexity"}, default "knn"
|
|
475
481
|
Ranking method to use.
|
|
@@ -481,9 +487,6 @@ class Prioritize(Evaluator):
|
|
|
481
487
|
Number of K-means initializations (kmeans methods only).
|
|
482
488
|
max_cluster_size : int or None, default None
|
|
483
489
|
Maximum cluster size for HDBSCAN methods.
|
|
484
|
-
batch_size : int or None, default None
|
|
485
|
-
Batch size for embedding computation. When None, uses the global
|
|
486
|
-
batch size from :func:`~dataeval.config.get_batch_size`.
|
|
487
490
|
order : {"easy_first", "hard_first"}, default "easy_first"
|
|
488
491
|
Sort direction for output indices.
|
|
489
492
|
policy : {"difficulty", "stratified", "class_balanced"}, default "difficulty"
|
|
@@ -43,7 +43,8 @@ class DriftOutput(DictOutput, Generic[TDetails]):
|
|
|
43
43
|
For multivariate methods, this is the corrected threshold after
|
|
44
44
|
Bonferroni or FDR correction.
|
|
45
45
|
distance : float
|
|
46
|
-
Instance-level test statistic or distance metric
|
|
46
|
+
Instance-level test statistic or distance metric. Typically >= 0, but can be
|
|
47
|
+
slightly negative for metrics like unbiased MMD².
|
|
47
48
|
For univariate methods, this is the mean distance across all features.
|
|
48
49
|
Higher values indicate greater deviation from reference distribution.
|
|
49
50
|
metric_name : str
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
__all__ = []
|
|
4
4
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, Literal
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
from numpy.typing import NDArray
|
|
@@ -32,6 +32,11 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
32
32
|
reference as class 0, runs repeated k-fold CV, and returns per-point
|
|
33
33
|
class-1 rates. Points with rates exceeding the threshold are flagged OOD.
|
|
34
34
|
|
|
35
|
+
Note: By default, this detector uses the ``n_std`` based threshold for
|
|
36
|
+
predictions. If a value for ``threshold_perc`` is provided (either directly
|
|
37
|
+
or via config), it will use percentile-based thresholding from reference
|
|
38
|
+
scores instead.
|
|
39
|
+
|
|
35
40
|
Parameters
|
|
36
41
|
----------
|
|
37
42
|
n_folds : int, default 5
|
|
@@ -40,11 +45,12 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
40
45
|
Number of times to repeat the k-fold split.
|
|
41
46
|
n_std : float, default 2.0
|
|
42
47
|
Number of standard deviations above the null mean for threshold.
|
|
48
|
+
Used when threshold_perc is not explicitly set.
|
|
43
49
|
hyperparameters : dict or None, default None
|
|
44
50
|
LightGBM hyperparameters.
|
|
45
51
|
threshold_perc : float or None, default None
|
|
46
52
|
Percentage of reference data considered normal (0-100).
|
|
47
|
-
If
|
|
53
|
+
If provided, overrides ``n_std`` for percentile-based thresholding.
|
|
48
54
|
extractor : FeatureExtractor or None, default None
|
|
49
55
|
Feature extractor for transforming input data before scoring.
|
|
50
56
|
When provided, raw data is passed through the extractor in both
|
|
@@ -59,7 +65,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
59
65
|
>>> test = np.random.randn(50, 8).astype(np.float32) + 3
|
|
60
66
|
>>> detector = OODDomainClassifier(n_folds=3, n_repeats=3)
|
|
61
67
|
>>> detector.fit(ref)
|
|
62
|
-
OODDomainClassifier(n_folds=3, n_repeats=3, n_std=2.0, threshold_perc=
|
|
68
|
+
OODDomainClassifier(n_folds=3, n_repeats=3, n_std=2.0, threshold_perc=None, hyperparameters=None, extractor=None, fitted=True)
|
|
63
69
|
>>> predictions = detector.predict(test)
|
|
64
70
|
""" # noqa: E501
|
|
65
71
|
|
|
@@ -76,8 +82,9 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
76
82
|
Number of k-fold repeats.
|
|
77
83
|
n_std : float, default 2.0
|
|
78
84
|
Threshold multiplier for standard deviations above null mean.
|
|
79
|
-
|
|
80
|
-
|
|
85
|
+
Used when threshold_perc is None.
|
|
86
|
+
threshold_perc : float or None, default None
|
|
87
|
+
Percentile-based threshold. If provided, overrides n_std.
|
|
81
88
|
hyperparameters : dict or None, default None
|
|
82
89
|
LightGBM hyperparameters.
|
|
83
90
|
extractor : FeatureExtractor or None, default None
|
|
@@ -87,7 +94,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
87
94
|
n_folds: int = 5
|
|
88
95
|
n_repeats: int = 5
|
|
89
96
|
n_std: float = 2.0
|
|
90
|
-
threshold_perc: float =
|
|
97
|
+
threshold_perc: float | None = None
|
|
91
98
|
hyperparameters: dict[str, Any] | None = None
|
|
92
99
|
extractor: FeatureExtractor | None = None
|
|
93
100
|
|
|
@@ -103,8 +110,11 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
103
110
|
) -> None:
|
|
104
111
|
base_config = config or OODDomainClassifier.Config()
|
|
105
112
|
|
|
106
|
-
|
|
107
|
-
|
|
113
|
+
self._threshold_perc_set = threshold_perc is not None or (
|
|
114
|
+
config is not None and config.threshold_perc is not None
|
|
115
|
+
)
|
|
116
|
+
perc = threshold_perc if threshold_perc is not None else (base_config.threshold_perc or 95.0)
|
|
117
|
+
super().__init__(perc)
|
|
108
118
|
|
|
109
119
|
self._n_folds = n_folds if n_folds is not None else base_config.n_folds
|
|
110
120
|
self._n_repeats = n_repeats if n_repeats is not None else base_config.n_repeats
|
|
@@ -115,7 +125,7 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
115
125
|
n_folds=self._n_folds,
|
|
116
126
|
n_repeats=self._n_repeats,
|
|
117
127
|
n_std=self._n_std,
|
|
118
|
-
threshold_perc=threshold_perc,
|
|
128
|
+
threshold_perc=threshold_perc if threshold_perc is not None else base_config.threshold_perc,
|
|
119
129
|
hyperparameters=self._hyperparameters,
|
|
120
130
|
extractor=self._extractor,
|
|
121
131
|
)
|
|
@@ -177,6 +187,12 @@ class OODDomainClassifier(ExtractorMixin, BaseOOD):
|
|
|
177
187
|
self._ref_score = self.score(reference_data)
|
|
178
188
|
return self
|
|
179
189
|
|
|
190
|
+
def _threshold_score(self, ood_type: Literal["feature", "instance"] = "instance") -> np.floating:
|
|
191
|
+
"""Get the threshold score. Prefers n_std threshold unless threshold_perc was explicitly set."""
|
|
192
|
+
if not self._threshold_perc_set and ood_type == "instance":
|
|
193
|
+
return np.float64(self._threshold)
|
|
194
|
+
return super()._threshold_score(ood_type)
|
|
195
|
+
|
|
180
196
|
def _score(self, x: NDArray[np.float32], batch_size: int | None = None) -> OODScoreOutput: # noqa: ARG002
|
|
181
197
|
"""Compute per-point class-1 rates for test data vs reference."""
|
|
182
198
|
x_ref = self._reference_data
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|