fairscope 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fairscope/__init__.py +55 -0
- fairscope/core/__init__.py +48 -0
- fairscope/core/_utils.py +44 -0
- fairscope/core/bootstrap.py +66 -0
- fairscope/core/calibration.py +168 -0
- fairscope/core/correction.py +49 -0
- fairscope/core/delong.py +151 -0
- fairscope/core/metrics.py +96 -0
- fairscope/federated/__init__.py +9 -0
- fairscope/federated/audit.py +217 -0
- fairscope/healthcare/__init__.py +5 -0
- fairscope/healthcare/audit.py +227 -0
- fairscope/lending/__init__.py +5 -0
- fairscope/lending/audit.py +160 -0
- fairscope/nlp/__init__.py +29 -0
- fairscope/nlp/attribution.py +54 -0
- fairscope/nlp/cross_platform.py +178 -0
- fairscope/nlp/metrics.py +87 -0
- fairscope/nlp/significance.py +51 -0
- fairscope-0.3.0.dist-info/METADATA +202 -0
- fairscope-0.3.0.dist-info/RECORD +23 -0
- fairscope-0.3.0.dist-info/WHEEL +4 -0
- fairscope-0.3.0.dist-info/licenses/LICENSE +21 -0
fairscope/__init__.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""fairscope: subgroup-stratified, calibration-aware fairness auditing for ML models.
|
|
2
|
+
|
|
3
|
+
A peer-reviewed-method-backed Python library that fills documented gaps in mainstream
|
|
4
|
+
fairness toolkits (AIF360, Fairlearn): per-subgroup DeLong confidence intervals,
|
|
5
|
+
per-subgroup Expected Calibration Error, calibration-aware fairness, a five-axis
|
|
6
|
+
Cross-Platform Fairness Evaluation (CPFE) protocol, and per-node federated audits.
|
|
7
|
+
|
|
8
|
+
See ``docs/DESIGN.md`` for the methods, API design, and roadmap.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.3.0"
|
|
12
|
+
|
|
13
|
+
__all__ = ["FairnessAudit", "__version__"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def FairnessAudit(model, domain, **kwargs):
    """Construct the audit object that handles the requested ``domain``.

    Parameters
    ----------
    model : the fitted classifier (or ``None`` with precomputed scores via the domain API).
        Only forwarded to the ``"healthcare"`` audit; the other domains do not receive it.
    domain : str
        Which audit to build: ``"healthcare"`` (receives ``model``), ``"nlp"`` (the CPFE
        protocol on precomputed ``platform_data``), ``"federated"`` (cross-node audit of
        precomputed ``node_data``) or ``"lending"`` (annual approval-gap + subgroup CATE).
    **kwargs : forwarded unchanged to the selected audit class.

    Returns
    -------
    An instance of the audit class registered for ``domain``.

    Raises
    ------
    ValueError
        If ``domain`` is not one of the four implemented domains.

    Examples
    --------
    >>> import fairscope
    >>> callable(fairscope.FairnessAudit)
    True
    """
    # Imports stay inside the branches so only the selected subpackage is loaded.
    positional = ()
    if domain == "healthcare":
        from .healthcare import HealthcareFairnessAudit as audit_cls

        positional = (model,)
    elif domain == "nlp":
        from .nlp import CPFEProtocol as audit_cls
    elif domain == "federated":
        from .federated import FederatedFairnessAudit as audit_cls
    elif domain == "lending":
        from .lending import LendingFairnessAudit as audit_cls
    else:
        raise ValueError(
            f"unknown or unimplemented domain: {domain!r}; "
            "available domains: 'healthcare', 'nlp', 'federated', 'lending'"
        )
    return audit_cls(*positional, **kwargs)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Core statistical primitives for fairscope.
|
|
2
|
+
|
|
3
|
+
Public API for subgroup-stratified, calibration-aware fairness auditing:
|
|
4
|
+
DeLong AUC confidence intervals and tests, a stratified bootstrap AUC test, calibration
|
|
5
|
+
error and recalibration, multiple-comparison corrections, and subgroup fairness metrics.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .bootstrap import bootstrap_auc_test
|
|
9
|
+
from .calibration import (
|
|
10
|
+
ece_by_group,
|
|
11
|
+
expected_calibration_error,
|
|
12
|
+
isotonic_recalibrate,
|
|
13
|
+
maximum_calibration_error,
|
|
14
|
+
reliability_diagram,
|
|
15
|
+
temperature_scale,
|
|
16
|
+
)
|
|
17
|
+
from .correction import benjamini_hochberg, bonferroni
|
|
18
|
+
from .delong import (
|
|
19
|
+
delong_auc_ci,
|
|
20
|
+
delong_by_group,
|
|
21
|
+
delong_paired_test,
|
|
22
|
+
delong_unpaired_test,
|
|
23
|
+
)
|
|
24
|
+
from .metrics import disparate_impact, equalized_odds_difference, subgroup_metrics
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
# AUC confidence intervals and tests (delong)
|
|
28
|
+
"delong_auc_ci",
|
|
29
|
+
"delong_paired_test",
|
|
30
|
+
"delong_unpaired_test",
|
|
31
|
+
"delong_by_group",
|
|
32
|
+
# AUC significance via stratified bootstrap
|
|
33
|
+
"bootstrap_auc_test",
|
|
34
|
+
# calibration error and recalibration
|
|
35
|
+
"expected_calibration_error",
|
|
36
|
+
"maximum_calibration_error",
|
|
37
|
+
"ece_by_group",
|
|
38
|
+
"reliability_diagram",
|
|
39
|
+
"temperature_scale",
|
|
40
|
+
"isotonic_recalibrate",
|
|
41
|
+
# multiple-comparison correction
|
|
42
|
+
"bonferroni",
|
|
43
|
+
"benjamini_hochberg",
|
|
44
|
+
# subgroup fairness metrics
|
|
45
|
+
"disparate_impact",
|
|
46
|
+
"equalized_odds_difference",
|
|
47
|
+
"subgroup_metrics",
|
|
48
|
+
]
|
fairscope/core/_utils.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Shared input validation. Fail loudly and early; never fail silently."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _as_1d_arrays(*arrays):
    """Coerce inputs to 1-D numpy arrays of equal length; raise otherwise.

    Parameters
    ----------
    *arrays : array-like
        Any number of inputs; each is passed through ``np.asarray``.

    Returns
    -------
    list of numpy.ndarray
        The converted arrays, in the order given (an empty list if none were given).

    Raises
    ------
    ValueError
        If any input is not 1-D, or if the inputs do not all share one length.
    """
    out = [np.asarray(a) for a in arrays]
    expected = None
    for a in out:
        # Dimensionality is checked before any len() call: len() of a 0-d array
        # raises TypeError, which would hide the intended ValueError.
        if a.ndim != 1:
            raise ValueError("inputs must be 1-D arrays")
        if expected is None:
            expected = len(a)
        elif len(a) != expected:
            raise ValueError("inputs must all have the same length")
    return out
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _check_binary_values(y):
    """Return ``y`` as an int array after checking it holds only 0/1 and no NaN.

    A single-class input (all 0 or all 1) is accepted.

    Raises
    ------
    ValueError
        If the float view of ``y`` contains NaN, or if any value is not 0 or 1.
    """
    labels = np.asarray(y)
    as_float = labels.astype(float)
    if np.isnan(as_float).any():
        raise ValueError("y_true contains NaN")
    distinct = np.unique(labels).tolist()
    if not set(distinct) <= {0, 1}:
        raise ValueError(f"y_true must be binary 0/1; got values {distinct}")
    return labels.astype(int)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _check_binary_labels(y):
    """Validate 0/1 labels and additionally require that both classes occur.

    Delegates the value checks to ``_check_binary_values`` and then rejects
    single-class input, because AUC needs both classes.

    Raises
    ------
    ValueError
        If the values are not binary / contain NaN, or only one class is present.
    """
    labels = _check_binary_values(y)
    if np.unique(labels).size >= 2:
        return labels
    raise ValueError("y_true must contain both classes (0 and 1)")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _check_scores(s):
    """Convert scores to a float array and reject NaN.

    Returns the converted array. Raises ``ValueError`` if any entry is NaN.
    """
    # NOTE(review): only NaN is rejected here; +/-inf passes through unchanged.
    # Confirm whether infinite scores should also be refused.
    scores = np.asarray(s, dtype=float)
    if np.isnan(scores).any():
        raise ValueError("scores contain NaN")
    return scores
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Stratified bootstrap test for the difference between two AUCs.
|
|
2
|
+
|
|
3
|
+
This is the significance procedure used for the cross-platform macro-AUC comparison in the
|
|
4
|
+
CPFE protocol (stratified bootstrap standard errors, B = 2000 by default). It complements
|
|
5
|
+
``delong`` (analytic per-subgroup CIs).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from scipy import stats
|
|
12
|
+
from sklearn.metrics import roc_auc_score
|
|
13
|
+
|
|
14
|
+
from ._utils import _as_1d_arrays, _check_binary_labels, _check_scores
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def bootstrap_auc_test(y_true, score_a, score_b, n_boot=2000, random_state=None):
    """Stratified bootstrap test of (AUC_a - AUC_b) on the same samples.

    Positives and negatives are resampled separately (with replacement, each at its
    original size), so every replicate keeps the observed class balance.

    Parameters
    ----------
    y_true : array-like of 0/1 labels; both classes must be present.
    score_a, score_b : array-like scores for the two models, aligned with ``y_true``.
    n_boot : int, number of bootstrap replicates (at least 2).
    random_state : seed or generator input passed to ``np.random.default_rng``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta`` (observed AUC_a - AUC_b), ``se`` (standard
        deviation of the bootstrap deltas, ddof=1), ``z`` (``delta / se``, or 0.0 when
        ``se`` is not positive), ``p_value`` (two-sided normal p-value of ``z``),
        ``ci_lower`` / ``ci_upper`` (2.5th / 97.5th percentiles of the bootstrap
        deltas) and ``n_boot``.

    Raises
    ------
    ValueError
        If ``n_boot`` is below 2, or if the inputs fail validation.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 400)
    >>> a = rng.random(400) + 0.8 * y
    >>> b = rng.random(400) + 0.05 * y
    >>> bootstrap_auc_test(y, a, b, n_boot=200, random_state=0)["delta"] > 0
    True
    """
    # With fewer than two replicates the ddof=1 standard deviation is undefined
    # (NaN), which previously degraded silently into z = 0 and p = 1.
    if n_boot < 2:
        raise ValueError("n_boot must be at least 2")

    y_true, score_a, score_b = _as_1d_arrays(y_true, score_a, score_b)
    y = _check_binary_labels(y_true)
    a = _check_scores(score_a)
    b = _check_scores(score_b)
    rng = np.random.default_rng(random_state)

    pos_idx = np.flatnonzero(y == 1)
    neg_idx = np.flatnonzero(y == 0)
    auc_a = float(roc_auc_score(y, a))
    auc_b = float(roc_auc_score(y, b))
    observed = auc_a - auc_b

    deltas = np.empty(n_boot, dtype=float)
    for i in range(n_boot):
        # Draw order (positives, then negatives) is kept fixed so results for a
        # given random_state stay reproducible.
        bp = rng.choice(pos_idx, size=pos_idx.size, replace=True)
        bn = rng.choice(neg_idx, size=neg_idx.size, replace=True)
        idx = np.concatenate([bp, bn])
        yb = y[idx]
        deltas[i] = roc_auc_score(yb, a[idx]) - roc_auc_score(yb, b[idx])

    se = float(deltas.std(ddof=1))
    z = observed / se if se > 0 else 0.0
    lo, hi = np.percentile(deltas, [2.5, 97.5])
    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": observed,
        "se": se,
        "z": float(z),
        "p_value": float(2.0 * stats.norm.sf(abs(z))),
        "ci_lower": float(lo),
        "ci_upper": float(hi),
        "n_boot": n_boot,
    }
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Calibration analysis and a subgroup-stratified interface to standard recalibration.
|
|
2
|
+
|
|
3
|
+
ECE/MCE bin the predicted positive-class probability (equal-width or equal-frequency) and
|
|
4
|
+
compare each bin's mean probability to its observed frequency. ``temperature_scale``
|
|
5
|
+
(Guo et al., 2017) and ``isotonic_recalibrate`` (Zadrozny & Elkan, 2002) are standard
|
|
6
|
+
recalibration methods; the contribution here is the per-subgroup interface and pre/post-ECE
|
|
7
|
+
reporting, not the methods themselves.
|
|
8
|
+
|
|
9
|
+
References
|
|
10
|
+
----------
|
|
11
|
+
Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. (2017). On calibration of modern neural
|
|
12
|
+
networks. ICML.
|
|
13
|
+
Zadrozny, B., & Elkan, C. (2002). Transforming classifier scores into accurate multiclass
|
|
14
|
+
probability estimates. KDD.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
from scipy import optimize
|
|
21
|
+
from sklearn.isotonic import IsotonicRegression
|
|
22
|
+
|
|
23
|
+
from ._utils import _as_1d_arrays, _check_binary_values, _check_scores
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _bin_edges(y_prob, n_bins, strategy):
    """Return ascending bin edges over [0, 1] for calibration binning.

    Parameters
    ----------
    y_prob : array of predicted probabilities (only used by the quantile strategy).
    n_bins : requested number of bins; must be at least 1.
    strategy : ``'uniform'`` for equal-width bins, ``'quantile'`` for equal-frequency
        bins. Duplicate quantile edges are merged, so fewer than ``n_bins`` bins may
        result.

    Returns
    -------
    numpy.ndarray
        At least two edges; the first is 0.0 and the last is 1.0.

    Raises
    ------
    ValueError
        If ``n_bins`` is below 1 or ``strategy`` is not recognised.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")
    if strategy == "uniform":
        return np.linspace(0.0, 1.0, n_bins + 1)
    if strategy == "quantile":
        if np.size(y_prob) == 0:
            # No data: fall back to one bin spanning the whole range.
            return np.array([0.0, 1.0])
        edges = np.unique(np.quantile(y_prob, np.linspace(0, 1, n_bins + 1)))
        if edges.size < 2:
            # Constant predictions collapse to one edge. Overwriting both ends of
            # that array would leave zero bins and make ECE/MCE silently 0.
            return np.array([0.0, 1.0])
        edges[0], edges[-1] = 0.0, 1.0
        return edges
    raise ValueError("strategy must be 'uniform' or 'quantile'")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _binned_gaps(y_true, y_prob, n_bins, strategy):
    """Compute per-bin calibration gaps and bin weights.

    Each sample is assigned to a bin by its predicted probability. For every
    non-empty bin the gap is ``|mean(y_true) - mean(y_prob)|`` and the weight is
    the share of all samples that fall in that bin. Empty bins are omitted.

    Returns
    -------
    (gaps, weights) : two numpy arrays of equal length, one entry per non-empty bin.
    """
    edges = _bin_edges(y_prob, n_bins, strategy)
    # Only interior edges are handed to digitize, so bin ids run 0..len(edges)-2.
    bin_of = np.digitize(y_prob, edges[1:-1], right=False)
    total = len(y_prob)
    per_bin = []
    for b in range(len(edges) - 1):
        members = bin_of == b
        if members.any():
            mean_prob = float(np.mean(y_prob[members]))
            observed = float(np.mean(y_true[members]))
            per_bin.append((abs(observed - mean_prob), members.sum() / total))
    gaps = np.array([gap for gap, _ in per_bin])
    weights = np.array([weight for _, weight in per_bin])
    return gaps, weights
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def expected_calibration_error(y_true, y_prob, n_bins=10, strategy="uniform"):
    """Expected Calibration Error: bin-weighted mean of |observed - predicted|.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (a single class is allowed).
    y_prob : array-like of predicted positive-class probabilities.
    n_bins : number of bins requested.
    strategy : ``'uniform'`` (equal-width) or ``'quantile'`` (equal-frequency) bins.

    Returns
    -------
    float
        Sum over non-empty bins of ``weight * |mean label - mean probability|``;
        0.0 when no bin contains data.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([0, 0, 1, 1])
    >>> p = np.array([0.0, 0.0, 1.0, 1.0])
    >>> expected_calibration_error(y, p)
    0.0
    """
    y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    labels = _check_binary_values(y_true)
    probs = _check_scores(y_prob)
    gaps, weights = _binned_gaps(labels, probs, n_bins, strategy)
    if not gaps.size:
        return 0.0
    return float((weights * gaps).sum())
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def maximum_calibration_error(y_true, y_prob, n_bins=10, strategy="uniform"):
    """Maximum Calibration Error: the largest per-bin |observed - predicted| gap.

    Takes the same inputs and binning options as ``expected_calibration_error``.
    Returns 0.0 when no bin contains data.
    """
    y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    labels = _check_binary_values(y_true)
    probs = _check_scores(y_prob)
    gaps = _binned_gaps(labels, probs, n_bins, strategy)[0]
    if not gaps.size:
        return 0.0
    return float(gaps.max())
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def ece_by_group(y_true, y_prob, groups, n_bins=10, strategy="uniform"):
    """Expected Calibration Error computed separately for every subgroup.

    ``groups`` holds one group label per sample. Returns ``{group: ece}`` with one
    entry per distinct label, each computed only from that group's samples.
    """
    y_true, y_prob, groups = _as_1d_arrays(y_true, y_prob, groups)
    selections = ((label, groups == label) for label in np.unique(groups))
    return {
        label: expected_calibration_error(y_true[sel], y_prob[sel], n_bins, strategy)
        for label, sel in selections
    }
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def reliability_diagram(y_true, y_prob, groups=None, n_bins=10):
    """Plot mean predicted probability against observed frequency per bin.

    Uses ``n_bins`` equal-width bins over [0, 1]; empty bins are skipped. A dashed
    diagonal marks perfect calibration. Without ``groups`` a single curve labelled
    ``"model"`` is drawn; with ``groups`` one curve per distinct group label.

    Returns
    -------
    matplotlib ``Figure`` containing the plot.
    """
    import matplotlib.pyplot as plt

    y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="perfect")

    # One (legend label, sample selector) pair per curve to draw.
    if groups is None:
        series = [("model", slice(None))]
    else:
        groups = np.asarray(groups)
        series = [(f"group={g}", groups == g) for g in np.unique(groups)]

    interior = np.linspace(0, 1, n_bins + 1)[1:-1]
    for label, selector in series:
        labels_sel = y_true[selector]
        probs_sel = y_prob[selector]
        bin_of = np.digitize(probs_sel, interior)
        occupied = [m for m in (bin_of == b for b in range(n_bins)) if np.any(m)]
        xs = [float(np.mean(probs_sel[m])) for m in occupied]
        ys = [float(np.mean(labels_sel[m])) for m in occupied]
        ax.plot(xs, ys, marker="o", label=label)

    ax.set_xlabel("mean predicted probability")
    ax.set_ylabel("observed frequency")
    ax.legend()
    return fig
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def temperature_scale(logits, y_true, max_iter=200):
    """Fit one temperature T > 0 by minimising the negative log-likelihood.

    Temperature scaling as in Guo et al. (2017): logits are divided by T before the
    softmax. T is optimised with Nelder-Mead over ``log T``, starting from T = 1.

    Parameters
    ----------
    logits : 1-D array of positive-class logits, or 2-D array ``(n, n_classes)``.
    y_true : integer class labels, used as column indices into the logits.
    max_iter : iteration cap handed to the optimiser.

    Returns
    -------
    (T, calibrated_probabilities)
        ``T`` is a float. The probabilities are 1-D (positive class) for 1-D input
        and ``(n, n_classes)`` for 2-D input.
    """
    raw = np.asarray(logits, dtype=float)
    labels = np.asarray(y_true).astype(int)
    binary_input = raw.ndim == 1
    # A lone logit l is treated as the two-class pair (0, l), so softmax gives sigmoid.
    if binary_input:
        scores = np.column_stack([np.zeros_like(raw), raw])
    else:
        scores = raw
    rows = np.arange(len(labels))

    def _shifted(temperature):
        # Subtract the row maximum before exponentiating, for numerical stability.
        scaled = scores / temperature
        return scaled - scaled.max(axis=1, keepdims=True)

    def _nll(log_t):
        # Optimising log T keeps T = exp(log T) strictly positive.
        z = _shifted(np.exp(log_t[0]))
        log_norm = np.log(np.exp(z).sum(axis=1))
        log_prob = z[rows, labels] - log_norm
        return -float(np.mean(log_prob))

    fit = optimize.minimize(
        _nll,
        x0=[0.0],
        method="Nelder-Mead",
        options={"maxiter": max_iter, "xatol": 1e-4, "fatol": 1e-6},
    )
    temp = float(np.exp(fit.x[0]))
    probs = np.exp(_shifted(temp))
    probs /= probs.sum(axis=1, keepdims=True)
    if binary_input:
        return temp, probs[:, 1]
    return temp, probs
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def isotonic_recalibrate(probs, y_true):
    """Recalibrate probabilities with isotonic regression (Zadrozny & Elkan, 2002).

    Fits an ``IsotonicRegression`` bounded to [0, 1] (out-of-range inputs clipped)
    that maps ``probs`` to the 0/1 labels ``y_true``.

    Returns
    -------
    (fitted_model, calibrated_probabilities)
        The fitted regressor and its output on the training ``probs``.
    """
    probs, y_true = _as_1d_arrays(probs, y_true)
    scores = _check_scores(probs)
    labels = _check_binary_values(y_true)
    iso = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
    recalibrated = iso.fit_transform(scores, labels)
    return iso, recalibrated
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Multiple-comparison corrections. Outputs match
|
|
2
|
+
``statsmodels.stats.multitest.multipletests`` for methods 'bonferroni' and 'fdr_bh'.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _check_pvals(p):
    """Return ``p`` as a 1-D float array of valid p-values.

    Raises
    ------
    ValueError
        If the input is not 1-D, contains NaN, or has a value outside [0, 1].
    """
    pvals = np.asarray(p, dtype=float)
    if pvals.ndim != 1:
        raise ValueError("p_values must be a 1-D array")
    if np.isnan(pvals).any():
        raise ValueError("p_values contain NaN")
    out_of_range = (pvals < 0) | (pvals > 1)
    if out_of_range.any():
        raise ValueError("p_values must lie in [0, 1]")
    return pvals
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def bonferroni(p_values, alpha=0.05):
    """Bonferroni correction for ``m`` simultaneous tests.

    Parameters
    ----------
    p_values : 1-D array-like of p-values in [0, 1].
    alpha : family-wise error rate.

    Returns
    -------
    dict
        ``adjusted`` (``min(p * m, 1)``), ``reject`` (``p <= alpha / m``) and
        ``threshold`` (``alpha / m``). For empty input both arrays are empty and
        ``threshold`` is ``alpha`` itself.

    Raises
    ------
    ValueError
        If ``p_values`` fails validation.

    Examples
    --------
    >>> bonferroni([0.01, 0.5], alpha=0.05)["adjusted"].tolist()
    [0.02, 1.0]
    """
    p = _check_pvals(p_values)
    m = len(p)
    # Zero hypotheses need no correction; this also avoids ZeroDivisionError and
    # matches benjamini_hochberg, which accepts empty input.
    threshold = alpha / m if m else alpha
    return {
        "adjusted": np.minimum(p * m, 1.0),
        "reject": p <= threshold,
        "threshold": threshold,
    }
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def benjamini_hochberg(p_values, alpha=0.05):
    """Benjamini-Hochberg step-up FDR correction.

    Returns a dict with ``adjusted`` (BH-adjusted p-values, in the input order and
    capped at 1) and ``reject`` (``adjusted <= alpha``).
    """
    pvals = _check_pvals(p_values)
    count = len(pvals)
    order = np.argsort(pvals, kind="mergesort")
    # Raw step-up values p_(i) * m / i for the ascending p-values.
    scaled = pvals[order] * count / np.arange(1, count + 1)
    # A running minimum from the largest rank downward enforces monotonicity.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    capped = np.minimum(monotone, 1.0)
    # argsort of a permutation is its inverse: restore the caller's ordering.
    adjusted = capped[np.argsort(order)]
    return {"adjusted": adjusted, "reject": adjusted <= alpha}
|
fairscope/core/delong.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""DeLong (1988) confidence intervals and tests for AUC, via the fast midrank
|
|
2
|
+
algorithm of Sun & Xu (2014).
|
|
3
|
+
|
|
4
|
+
References
|
|
5
|
+
----------
|
|
6
|
+
DeLong, E. R., DeLong, D. M., & Clarke-Pearson, D. L. (1988). Comparing the areas under
|
|
7
|
+
two or more correlated ROC curves. Biometrics, 44(3), 837-845.
|
|
8
|
+
Sun, X., & Xu, W. (2014). Fast implementation of DeLong's algorithm for comparing the
|
|
9
|
+
areas under correlated ROC curves. IEEE Signal Processing Letters, 21(11), 1389-1393.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
from scipy import stats
|
|
16
|
+
|
|
17
|
+
from ._utils import _as_1d_arrays, _check_binary_labels, _check_scores
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _compute_midrank(x: np.ndarray) -> np.ndarray:
    """Midranks (1-based ranks, ties share their average rank), O(n log n).

    Parameters
    ----------
    x : 1-D array of values to rank.

    Returns
    -------
    numpy.ndarray
        Float array with the midrank of each element, in the input order.

    Notes
    -----
    Runs of equal values are found by comparing neighbours in sorted order. A NaN
    never compares equal to anything, so each NaN gets its own rank. The earlier
    element-by-element scan did not terminate on NaN input.
    """
    x = np.asarray(x)
    n = len(x)
    if n == 0:
        return np.empty(0, dtype=float)
    order = np.argsort(x, kind="mergesort")
    ranked = x[order]
    # A run starts at index 0 and wherever a sorted value differs from its
    # predecessor. Direct comparison (not subtraction) keeps equal infinities tied.
    starts = np.flatnonzero(np.concatenate(([True], ranked[1:] != ranked[:-1])))
    ends = np.append(starts[1:], n)
    # A run over sorted positions [i, j) has 1-based midrank (i + j - 1) / 2 + 1.
    run_rank = 0.5 * (starts + ends - 1) + 1.0
    out = np.empty(n, dtype=float)
    out[order] = np.repeat(run_rank, ends - starts)
    return out
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _fast_delong(sorted_scores: np.ndarray, n_pos: int):
    """Fast DeLong AUCs and covariance (Sun & Xu, 2014).

    Parameters
    ----------
    sorted_scores : array of shape (k, N), one row per scorer, with the ``n_pos``
        positive samples in the leading columns and the negatives after them.
    n_pos : number of positive samples.

    Returns
    -------
    (aucs, covariance) : ``aucs`` has shape (k,), ``covariance`` has shape (k, k).
    """
    n_neg = sorted_scores.shape[1] - n_pos
    pos_block = sorted_scores[:, :n_pos]
    neg_block = sorted_scores[:, n_pos:]

    # Midranks within positives, within negatives, and over the pooled sample.
    rank_pos = np.array([_compute_midrank(row) for row in pos_block])
    rank_neg = np.array([_compute_midrank(row) for row in neg_block])
    rank_all = np.array([_compute_midrank(row) for row in sorted_scores])
    pooled_pos = rank_all[:, :n_pos]
    pooled_neg = rank_all[:, n_pos:]

    aucs = pooled_pos.sum(axis=1) / n_pos / n_neg - (n_pos + 1.0) / (2.0 * n_neg)
    # Structural components: one per positive (v01) and one per negative (v10).
    v01 = (pooled_pos - rank_pos) / n_neg
    v10 = 1.0 - (pooled_neg - rank_neg) / n_pos
    covariance = np.cov(v01) / n_pos + np.cov(v10) / n_neg
    # np.cov yields a 0-d result when k == 1; callers index it as a matrix.
    return aucs, np.atleast_2d(covariance)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _prepare(y: np.ndarray, score_list):
    """Arrange scores in the layout expected by ``_fast_delong``.

    Samples are reordered with a stable sort on ``-y`` so label-1 samples come
    first. Each entry of ``score_list`` becomes one row of the result.

    Returns
    -------
    (sorted_scores, n_pos) : the stacked, reordered scores with shape
        ``(len(score_list), len(y))`` and the number of samples with ``y == 1``.
    """
    positives_first = np.argsort(-y, kind="mergesort")
    rows = [np.asarray(scores)[positives_first] for scores in score_list]
    n_pos = int(np.count_nonzero(y == 1))
    return np.vstack(rows), n_pos
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def delong_auc_ci(y_true, y_score, alpha=0.05):
    """AUC with a DeLong (1 - alpha) normal-approximation confidence interval.

    Parameters
    ----------
    y_true : array-like of 0/1 labels; both classes must be present.
    y_score : array-like of scores aligned with ``y_true``.
    alpha : significance level; the interval has nominal coverage ``1 - alpha``.

    Returns
    -------
    dict
        ``auc``, ``ci_lower``, ``ci_upper`` (clipped to [0, 1]), ``se``, ``n_pos``,
        ``n_neg``. If the DeLong variance is undefined (NaN), ``se``, ``ci_lower``
        and ``ci_upper`` are NaN rather than a misleading zero-width interval.

    Raises
    ------
    ValueError
        If the inputs fail validation (wrong shape, NaN, non-binary or single-class
        labels).

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([1, 1, 1, 1, 0, 0, 0, 0])
    >>> s = np.array([0.9, 0.8, 0.7, 0.4, 0.6, 0.5, 0.3, 0.2])
    >>> round(delong_auc_ci(y, s)["auc"], 3)
    0.875
    """
    y_true, y_score = _as_1d_arrays(y_true, y_score)
    y = _check_binary_labels(y_true)
    s = _check_scores(y_score)
    sorted_scores, n_pos = _prepare(y, [s])
    aucs, cov = _fast_delong(sorted_scores, n_pos)
    auc = float(aucs[0])
    var = float(cov[0, 0])
    if np.isnan(var):
        # A class with a single sample has no sample covariance (np.cov gives NaN).
        # `var > 0` is False for NaN, which used to turn this into se = 0.0.
        se = float("nan")
        ci_lower = ci_upper = float("nan")
    else:
        se = float(np.sqrt(var)) if var > 0 else 0.0
        z = float(stats.norm.ppf(1 - alpha / 2.0))
        ci_lower = max(0.0, auc - z * se)
        ci_upper = min(1.0, auc + z * se)
    return {
        "auc": auc,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper,
        "se": se,
        "n_pos": n_pos,
        "n_neg": len(y) - n_pos,
    }
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def delong_paired_test(y_true, score_a, score_b):
    """Paired DeLong test comparing two scorers evaluated on the SAME samples.

    The variance of the AUC difference uses the full DeLong covariance,
    ``var_a + var_b - 2 * cov_ab``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta`` (``auc_a - auc_b``), ``z`` (``delta / se``,
        or 0.0 when the standard error is not positive) and ``p_value`` (two-sided
        normal p-value of ``z``).
    """
    y_true, score_a, score_b = _as_1d_arrays(y_true, score_a, score_b)
    labels = _check_binary_labels(y_true)
    first = _check_scores(score_a)
    second = _check_scores(score_b)
    stacked, n_pos = _prepare(labels, [first, second])
    aucs, cov = _fast_delong(stacked, n_pos)

    auc_a = float(aucs[0])
    auc_b = float(aucs[1])
    delta = auc_a - auc_b
    variance = float(cov[0, 0] + cov[1, 1] - 2.0 * cov[0, 1])
    std_err = float(np.sqrt(variance)) if variance > 0 else 0.0
    if std_err > 0:
        z = delta / std_err
    else:
        z = 0.0
    p_value = float(2.0 * stats.norm.sf(abs(z)))
    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": delta,
        "z": float(z),
        "p_value": p_value,
    }
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def delong_unpaired_test(y_true_a, score_a, y_true_b, score_b):
    """Unpaired DeLong test for two INDEPENDENT samples (the cross-platform case).

    Each sample's AUC and standard error come from ``delong_auc_ci``. The standard
    error of the difference is ``sqrt(se_a**2 + se_b**2)``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta``, ``z`` (0.0 when the combined standard error
        is not positive) and the two-sided normal ``p_value``.
    """
    first = delong_auc_ci(y_true_a, score_a)
    second = delong_auc_ci(y_true_b, score_b)
    auc_a, auc_b = first["auc"], second["auc"]
    delta = auc_a - auc_b
    combined_se = float(np.sqrt(first["se"] ** 2 + second["se"] ** 2))
    if combined_se > 0:
        z = delta / combined_se
    else:
        z = 0.0
    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": delta,
        "z": float(z),
        "p_value": float(2.0 * stats.norm.sf(abs(z))),
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def delong_by_group(y_true, y_score, groups, alpha=0.05):
    """DeLong AUC confidence interval for every subgroup.

    Returns ``{group: ci_dict}`` where each ``ci_dict`` is the ``delong_auc_ci``
    result for that group's samples. A ``ValueError`` raised for a subgroup is
    re-raised with the group label prefixed to the message.
    """
    y_true, y_score, groups = _as_1d_arrays(y_true, y_score, groups)
    results = {}
    for label in np.unique(groups):
        members = groups == label
        try:
            interval = delong_auc_ci(y_true[members], y_score[members], alpha=alpha)
        except ValueError as exc:
            raise ValueError(f"subgroup {label!r}: {exc}") from exc
        results[label] = interval
    return results
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Subgroup performance and fairness metrics.
|
|
2
|
+
|
|
3
|
+
Disparate impact is the symmetric ratio of positive-prediction rates, and equalized odds
|
|
4
|
+
difference is the absolute true-positive-rate gap, both as defined in the CPFE paper
|
|
5
|
+
(Pall & Yadav). Note: this EOD follows the paper's definition |TPR_a - TPR_b|; some
|
|
6
|
+
toolkits (e.g. Fairlearn) instead report max(|TPR diff|, |FPR diff|).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from sklearn.metrics import brier_score_loss, f1_score, roc_auc_score
|
|
13
|
+
|
|
14
|
+
from ._utils import _as_1d_arrays, _check_binary_labels, _check_binary_values
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _group_mask(groups, g):
    """Return the boolean mask ``groups == g``; raise if it selects nothing.

    Raises
    ------
    ValueError
        If no element of ``groups`` equals ``g``.
    """
    selected = groups == g
    if np.any(selected):
        return selected
    raise ValueError(f"group {g!r} not present in `groups`")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def disparate_impact(y_pred, groups, group_a, group_b):
    """Symmetric disparate impact between two groups: min(rate_a/rate_b, rate_b/rate_a).

    ``rate = P(y_pred == 1 | group)`` over hard 0/1 predictions. The result lies in
    [0, 1]: 1.0 means identical positive-prediction rates. Values < 0.80 violate
    the four-fifths rule and < 0.50 is a severe disparity (interpretation per the
    CPFE paper). Two zero rates count as equal (1.0); exactly one zero rate is the
    maximal disparity (0.0).

    Raises
    ------
    ValueError
        If the inputs fail validation or either group does not occur in ``groups``.

    Examples
    --------
    >>> import numpy as np
    >>> y_pred = np.array([1, 1, 0, 0, 1, 0, 0, 0])
    >>> g = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
    >>> round(disparate_impact(y_pred, g, "A", "B"), 3)
    0.5
    """
    y_pred, groups = _as_1d_arrays(y_pred, groups)
    preds = _check_binary_values(y_pred)
    rate_a, rate_b = (
        float(np.mean(preds[_group_mask(groups, label)] == 1))
        for label in (group_a, group_b)
    )
    if rate_a == 0.0 or rate_b == 0.0:
        # Both zero -> equal treatment; only one zero -> maximal disparity.
        return 1.0 if rate_a == rate_b else 0.0
    return float(min(rate_a / rate_b, rate_b / rate_a))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def equalized_odds_difference(y_true, y_pred, groups, group_a, group_b):
    """Absolute true-positive-rate gap ``|TPR_a - TPR_b|`` (CPFE paper definition).

    ``TPR = P(y_pred == 1 | y_true == 1, group)``.

    Raises
    ------
    ValueError
        If the inputs fail validation, a group does not occur in ``groups``, or a
        group has no positive labels (its TPR is undefined).
    """
    y_true, y_pred, groups = _as_1d_arrays(y_true, y_pred, groups)
    truth = _check_binary_values(y_true)
    preds = _check_binary_values(y_pred)

    rates = []
    for label in (group_a, group_b):
        in_group = _group_mask(groups, label)
        positives = (truth == 1) & in_group
        if not positives.any():
            raise ValueError(f"group {label!r} has no positive labels; TPR is undefined")
        rates.append(float(np.mean(preds[positives] == 1)))
    tpr_a, tpr_b = rates
    return float(abs(tpr_a - tpr_b))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def subgroup_metrics(y_true, y_score, groups, threshold=0.5):
    """Per-subgroup discrimination metrics: ``{group: {auc, brier, f1, n}}``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels; both classes must be present.
    y_score : predicted probabilities in [0, 1] (the Brier score assumes a probability).
    groups : one group label per sample.
    threshold : scores ``>= threshold`` count as positive predictions for F1.

    Raises
    ------
    ValueError
        If ``y_score`` contains NaN, the labels fail validation, or any subgroup
        contains a single class (AUC undefined).
    """
    y_true, y_score, groups = _as_1d_arrays(y_true, y_score, groups)
    labels = _check_binary_labels(y_true)
    scores = np.asarray(y_score, dtype=float)
    if np.isnan(scores).any():
        raise ValueError("y_score contains NaN")
    hard_preds = (scores >= threshold).astype(int)

    report = {}
    for g in np.unique(groups):
        members = groups == g
        group_labels = labels[members]
        group_scores = scores[members]
        if np.unique(group_labels).size < 2:
            raise ValueError(f"subgroup {g!r} has a single class; AUC undefined")
        report[g] = dict(
            auc=float(roc_auc_score(group_labels, group_scores)),
            brier=float(brier_score_loss(group_labels, group_scores)),
            f1=float(f1_score(group_labels, hard_preds[members], zero_division=0)),
            n=int(members.sum()),
        )
    return report
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Cross-node (federated / multi-site) fairness auditing built on fairscope.core.
|
|
2
|
+
|
|
3
|
+
AUDITS per-node predictions of an already-trained model. It does NOT perform federated
|
|
4
|
+
training and provides NO privacy guarantees.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .audit import FederatedFairnessAudit, FederatedReport
|
|
8
|
+
|
|
9
|
+
__all__ = ["FederatedFairnessAudit", "FederatedReport"]
|