fairscope 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fairscope/__init__.py ADDED
@@ -0,0 +1,55 @@
1
+ """fairscope: subgroup-stratified, calibration-aware fairness auditing for ML models.
2
+
3
+ A peer-reviewed-method-backed Python library that fills documented gaps in mainstream
4
+ fairness toolkits (AIF360, Fairlearn): per-subgroup DeLong confidence intervals,
5
+ per-subgroup Expected Calibration Error, calibration-aware fairness, a five-axis
6
+ Cross-Platform Fairness Evaluation (CPFE) protocol, and per-node federated audits.
7
+
8
+ See ``docs/DESIGN.md`` for the methods, API design, and roadmap.
9
+ """
10
+
11
+ __version__ = "0.3.0"
12
+
13
+ __all__ = ["FairnessAudit", "__version__"]
14
+
15
+
16
def FairnessAudit(model, domain, **kwargs):
    """Construct the audit object for the requested ``domain``.

    Parameters
    ----------
    model : the fitted classifier, or ``None``
        Forwarded only to the ``"healthcare"`` audit; the other domains are built
        from ``**kwargs`` alone and never receive ``model``.
    domain : str
        ``"healthcare"``, ``"nlp"`` (the CPFE protocol over precomputed
        ``platform_data``), ``"federated"`` (cross-node audit of precomputed
        ``node_data``) or ``"lending"`` (annual approval-gap + subgroup CATE).
    **kwargs : passed through unchanged to the selected audit class.

    Returns
    -------
    The instantiated domain audit object.

    Raises
    ------
    ValueError
        If ``domain`` is not one of the four names above.

    Examples
    --------
    >>> import fairscope
    >>> callable(fairscope.FairnessAudit)
    True
    """
    # Each domain subpackage is imported only inside the branch that needs it.
    if domain == "healthcare":
        from .healthcare import HealthcareFairnessAudit

        audit = HealthcareFairnessAudit(model, **kwargs)
    elif domain == "nlp":
        from .nlp import CPFEProtocol

        audit = CPFEProtocol(**kwargs)
    elif domain == "federated":
        from .federated import FederatedFairnessAudit

        audit = FederatedFairnessAudit(**kwargs)
    elif domain == "lending":
        from .lending import LendingFairnessAudit

        audit = LendingFairnessAudit(**kwargs)
    else:
        raise ValueError(
            f"unknown or unimplemented domain: {domain!r}; "
            "available domains: 'healthcare', 'nlp', 'federated', 'lending'"
        )
    return audit
@@ -0,0 +1,48 @@
1
+ """Core statistical primitives for fairscope.
2
+
3
+ Public API for subgroup-stratified, calibration-aware fairness auditing:
4
+ DeLong AUC confidence intervals and tests, a stratified bootstrap AUC test, calibration
5
+ error and recalibration, multiple-comparison corrections, and subgroup fairness metrics.
6
+ """
7
+
8
+ from .bootstrap import bootstrap_auc_test
9
+ from .calibration import (
10
+ ece_by_group,
11
+ expected_calibration_error,
12
+ isotonic_recalibrate,
13
+ maximum_calibration_error,
14
+ reliability_diagram,
15
+ temperature_scale,
16
+ )
17
+ from .correction import benjamini_hochberg, bonferroni
18
+ from .delong import (
19
+ delong_auc_ci,
20
+ delong_by_group,
21
+ delong_paired_test,
22
+ delong_unpaired_test,
23
+ )
24
+ from .metrics import disparate_impact, equalized_odds_difference, subgroup_metrics
25
+
26
+ __all__ = [
27
+ # AUC confidence intervals and tests (delong)
28
+ "delong_auc_ci",
29
+ "delong_paired_test",
30
+ "delong_unpaired_test",
31
+ "delong_by_group",
32
+ # AUC significance via stratified bootstrap
33
+ "bootstrap_auc_test",
34
+ # calibration error and recalibration
35
+ "expected_calibration_error",
36
+ "maximum_calibration_error",
37
+ "ece_by_group",
38
+ "reliability_diagram",
39
+ "temperature_scale",
40
+ "isotonic_recalibrate",
41
+ # multiple-comparison correction
42
+ "bonferroni",
43
+ "benjamini_hochberg",
44
+ # subgroup fairness metrics
45
+ "disparate_impact",
46
+ "equalized_odds_difference",
47
+ "subgroup_metrics",
48
+ ]
@@ -0,0 +1,44 @@
1
+ """Shared input validation. Fail loudly and early; never fail silently."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import numpy as np
6
+
7
+
8
def _as_1d_arrays(*arrays):
    """Coerce inputs to 1-D numpy arrays of equal length; raise otherwise.

    Parameters
    ----------
    *arrays : array-like
        Any number of inputs; each is converted with ``np.asarray``.

    Returns
    -------
    list of np.ndarray
        The converted arrays, in the order given (empty list for no inputs).

    Raises
    ------
    ValueError
        If any input is not 1-D (including scalars / 0-d arrays) or if the
        lengths differ.
    """
    out = [np.asarray(a) for a in arrays]
    expected = None
    for a in out:
        # Check dimensionality BEFORE touching the length: len() of a 0-d array
        # raises TypeError, which would bypass the intended ValueError.
        if a.ndim != 1:
            raise ValueError("inputs must be 1-D arrays")
        if expected is None:
            expected = a.shape[0]
        elif a.shape[0] != expected:
            raise ValueError("inputs must all have the same length")
    return out
18
+
19
+
20
def _check_binary_values(y):
    """Return ``y`` as an int array after checking it holds only 0/1 and no NaN.

    A single-class input (all 0 or all 1) is accepted.

    Raises
    ------
    ValueError
        If a value is NaN after float conversion, or any value is outside {0, 1}.
    """
    labels = np.asarray(y)
    as_float = labels.astype(float)
    if np.isnan(as_float).any():
        raise ValueError("y_true contains NaN")
    uniq = np.unique(labels)
    unexpected = set(uniq.tolist()) - {0, 1}
    if unexpected:
        raise ValueError(f"y_true must be binary 0/1; got values {uniq.tolist()}")
    return labels.astype(int)
29
+
30
+
31
def _check_binary_labels(y):
    """Validate via ``_check_binary_values`` and also require both classes.

    AUC is undefined for single-class labels, hence the stricter check.

    Raises
    ------
    ValueError
        If the labels are not binary 0/1, or only one class is present.
    """
    labels = _check_binary_values(y)
    n_classes = len(np.unique(labels))
    if n_classes < 2:
        raise ValueError("y_true must contain both classes (0 and 1)")
    return labels
37
+
38
+
39
def _check_scores(s):
    """Validate scores are finite floats and return them as a float array.

    Raises
    ------
    ValueError
        If any score is NaN or +/-infinity. (Conversion errors raised by
        ``np.asarray(..., dtype=float)`` propagate unchanged.)
    """
    scores = np.asarray(s, dtype=float)
    if np.isnan(scores).any():
        raise ValueError("scores contain NaN")
    # Infinite scores are not "finite floats": in calibration binning they would
    # turn a bin's mean confidence (and the reported error) into inf silently.
    if np.isinf(scores).any():
        raise ValueError("scores contain infinite values")
    return scores
@@ -0,0 +1,66 @@
1
+ """Stratified bootstrap test for the difference between two AUCs.
2
+
3
+ This is the significance procedure used for the cross-platform macro-AUC comparison in the
4
+ CPFE protocol (stratified bootstrap standard errors, B = 2000 by default). It complements
5
+ ``delong`` (analytic per-subgroup CIs).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import numpy as np
11
+ from scipy import stats
12
+ from sklearn.metrics import roc_auc_score
13
+
14
+ from ._utils import _as_1d_arrays, _check_binary_labels, _check_scores
15
+
16
+
17
def bootstrap_auc_test(y_true, score_a, score_b, n_boot=2000, random_state=None):
    """Stratified bootstrap test of (AUC_a - AUC_b) on the same samples.

    Positives and negatives are resampled separately (with replacement, each to
    its original size), so every replicate keeps the observed class balance.

    Parameters
    ----------
    y_true : array-like of 0/1 labels; both classes must be present.
    score_a, score_b : array-like scores for the two models, aligned with ``y_true``.
    n_boot : int, number of bootstrap replicates (must be >= 2 so that the
        standard error is defined).
    random_state : seed or generator accepted by ``np.random.default_rng``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta`` (observed AUC_a - AUC_b), ``se`` (sample
        standard deviation of the bootstrap deltas), ``z``, ``p_value``
        (two-sided, normal approximation), ``ci_lower`` / ``ci_upper`` (2.5 and
        97.5 percentiles of the bootstrap deltas) and ``n_boot``.

    Raises
    ------
    ValueError
        If ``n_boot`` is not an integer >= 2, or the inputs fail validation.

    Examples
    --------
    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> y = rng.integers(0, 2, 400)
    >>> a = rng.random(400) + 0.8 * y
    >>> b = rng.random(400) + 0.05 * y
    >>> bootstrap_auc_test(y, a, b, n_boot=200, random_state=0)["delta"] > 0
    True
    """
    # With fewer than 2 replicates std(ddof=1) is NaN, which the z computation
    # below would silently turn into z=0 / p=1; reject it up front instead.
    if isinstance(n_boot, bool) or not isinstance(n_boot, (int, np.integer)) or n_boot < 2:
        raise ValueError("n_boot must be an integer >= 2")

    y_true, score_a, score_b = _as_1d_arrays(y_true, score_a, score_b)
    y = _check_binary_labels(y_true)
    a = _check_scores(score_a)
    b = _check_scores(score_b)
    rng = np.random.default_rng(random_state)

    pos_idx = np.flatnonzero(y == 1)
    neg_idx = np.flatnonzero(y == 0)
    auc_a = float(roc_auc_score(y, a))
    auc_b = float(roc_auc_score(y, b))
    observed = auc_a - auc_b

    deltas = np.empty(n_boot, dtype=float)
    for i in range(n_boot):
        # Draw order (positives, then negatives) is kept fixed so results for a
        # given random_state stay reproducible.
        bp = rng.choice(pos_idx, size=pos_idx.size, replace=True)
        bn = rng.choice(neg_idx, size=neg_idx.size, replace=True)
        idx = np.concatenate([bp, bn])
        yb = y[idx]
        deltas[i] = roc_auc_score(yb, a[idx]) - roc_auc_score(yb, b[idx])

    se = float(deltas.std(ddof=1))
    # A zero SE (all replicates identical) makes z undefined; report 0.0.
    z = observed / se if se > 0 else 0.0
    lo, hi = np.percentile(deltas, [2.5, 97.5])
    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": observed,
        "se": se,
        "z": float(z),
        "p_value": float(2.0 * stats.norm.sf(abs(z))),
        "ci_lower": float(lo),
        "ci_upper": float(hi),
        "n_boot": n_boot,
    }
@@ -0,0 +1,168 @@
1
+ """Calibration analysis and a subgroup-stratified interface to standard recalibration.
2
+
3
+ ECE/MCE bin the predicted positive-class probability (equal-width or equal-frequency) and
4
+ compare each bin's mean probability to its observed frequency. ``temperature_scale``
5
+ (Guo et al., 2017) and ``isotonic_recalibrate`` (Zadrozny & Elkan, 2002) are standard
6
+ recalibration methods; the contribution here is the per-subgroup interface and pre/post-ECE
7
+ reporting, not the methods themselves.
8
+
9
+ References
10
+ ----------
11
+ Guo, C., Pleiss, G., Sun, Y., & Weinberger, K. Q. (2017). On calibration of modern neural
12
+ networks. ICML.
13
+ Zadrozny, B., & Elkan, C. (2002). Transforming classifier scores into accurate multiclass
14
+ probability estimates. KDD.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import numpy as np
20
+ from scipy import optimize
21
+ from sklearn.isotonic import IsotonicRegression
22
+
23
+ from ._utils import _as_1d_arrays, _check_binary_values, _check_scores
24
+
25
+
26
def _bin_edges(y_prob, n_bins, strategy):
    """Return the bin edges over [0, 1] used for calibration binning.

    Parameters
    ----------
    y_prob : array of predicted probabilities (only read for ``"quantile"``).
    n_bins : int, requested number of bins (>= 1).
    strategy : ``"uniform"`` for equal-width bins, ``"quantile"`` for
        equal-frequency bins (duplicate quantiles are merged, so fewer than
        ``n_bins`` bins may result).

    Returns
    -------
    np.ndarray
        Increasing edges whose first and last entries are 0.0 and 1.0.

    Raises
    ------
    ValueError
        If ``n_bins < 1`` or ``strategy`` is not recognised.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")
    if strategy == "uniform":
        return np.linspace(0.0, 1.0, n_bins + 1)
    if strategy == "quantile":
        if np.size(y_prob) == 0:
            # No data to take quantiles of: fall back to a single [0, 1] bin.
            return np.array([0.0, 1.0])
        edges = np.unique(np.quantile(y_prob, np.linspace(0, 1, n_bins + 1)))
        if edges.size < 2:
            # Constant predictions collapse to ONE unique quantile; without this
            # guard there would be zero bins and callers would report an error
            # of 0.0 for data that is actually present.
            return np.array([0.0, 1.0])
        edges[0], edges[-1] = 0.0, 1.0
        return edges
    raise ValueError("strategy must be 'uniform' or 'quantile'")
34
+
35
+
36
def _binned_gaps(y_true, y_prob, n_bins, strategy):
    """Per-bin calibration gaps and bin weights, skipping empty bins.

    Returns ``(gaps, weights)`` as arrays: ``gaps[i]`` is
    ``|mean(y_true) - mean(y_prob)|`` inside the i-th non-empty bin and
    ``weights[i]`` is that bin's share of all samples.
    """
    edges = _bin_edges(y_prob, n_bins, strategy)
    # Digitizing against the interior edges only means values at or beyond the
    # outer edges land in the first / last bin.
    bin_of = np.digitize(y_prob, edges[1:-1], right=False)
    total = len(y_prob)
    gap_list = []
    weight_list = []
    for bin_id in range(len(edges) - 1):
        members = bin_of == bin_id
        if np.any(members):
            mean_conf = float(np.mean(y_prob[members]))
            obs_freq = float(np.mean(y_true[members]))
            gap_list.append(abs(obs_freq - mean_conf))
            weight_list.append(members.sum() / total)
    return np.array(gap_list), np.array(weight_list)
50
+
51
+
52
def expected_calibration_error(y_true, y_prob, n_bins=10, strategy="uniform"):
    """Expected Calibration Error: bin-weighted mean calibration gap.

    Each bin contributes ``|observed frequency - mean predicted probability|``
    weighted by the fraction of samples it holds. ``strategy`` selects
    'uniform' (equal-width) or 'quantile' (equal-frequency) bins. When no bin
    contains data the result is 0.0.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([0, 0, 1, 1])
    >>> p = np.array([0.0, 0.0, 1.0, 1.0])
    >>> expected_calibration_error(y, p)
    0.0
    """
    y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    labels = _check_binary_values(y_true)
    probs = _check_scores(y_prob)
    gaps, weights = _binned_gaps(labels, probs, n_bins, strategy)
    if not gaps.size:
        return 0.0
    return float(np.sum(weights * gaps))
71
+
72
+
73
def maximum_calibration_error(y_true, y_prob, n_bins=10, strategy="uniform"):
    """Maximum Calibration Error: the largest per-bin calibration gap.

    The gap of a bin is ``|observed frequency - mean predicted probability|``.
    Returns 0.0 when no bin contains data.
    """
    y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    labels = _check_binary_values(y_true)
    probs = _check_scores(y_prob)
    gaps = _binned_gaps(labels, probs, n_bins, strategy)[0]
    if not gaps.size:
        return 0.0
    return float(np.max(gaps))
80
+
81
+
82
def ece_by_group(y_true, y_prob, groups, n_bins=10, strategy="uniform"):
    """Expected Calibration Error computed separately for every subgroup.

    Returns ``{group: ece}`` with one entry per distinct value in ``groups``
    (keys are the values produced by ``np.unique``).
    """
    y_true, y_prob, groups = _as_1d_arrays(y_true, y_prob, groups)
    return {
        label: expected_calibration_error(y_true[sel], y_prob[sel], n_bins, strategy)
        for label, sel in ((g, groups == g) for g in np.unique(groups))
    }
90
+
91
+
92
def reliability_diagram(y_true, y_prob, groups=None, n_bins=10):
    """Reliability diagram (mean predicted probability vs observed frequency).

    Parameters
    ----------
    y_true, y_prob : 1-D array-likes of equal length (labels and predicted
        probabilities).
    groups : optional 1-D array-like aligned with ``y_true``; when given, one
        curve is drawn per distinct group value, otherwise a single "model" curve.
    n_bins : number of equal-width bins over [0, 1]; empty bins are skipped.

    Returns
    -------
    matplotlib ``Figure`` containing the curves and a dashed diagonal marking
    perfect calibration.

    Raises
    ------
    ValueError
        If the inputs (including ``groups``) are not 1-D or differ in length.
    """
    import matplotlib.pyplot as plt

    # Validate everything, groups included, BEFORE creating the figure so a bad
    # ``groups`` fails with a clear ValueError instead of an indexing error.
    if groups is None:
        y_true, y_prob = _as_1d_arrays(y_true, y_prob)
    else:
        y_true, y_prob, groups = _as_1d_arrays(y_true, y_prob, groups)

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], linestyle="--", color="gray", label="perfect")

    def _curve(yt, yp, label):
        # One point per non-empty equal-width bin.
        edges = np.linspace(0, 1, n_bins + 1)
        idx = np.digitize(yp, edges[1:-1])
        xs, ys = [], []
        for b in range(n_bins):
            m = idx == b
            if np.any(m):
                xs.append(float(np.mean(yp[m])))
                ys.append(float(np.mean(yt[m])))
        ax.plot(xs, ys, marker="o", label=label)

    if groups is None:
        _curve(y_true, y_prob, "model")
    else:
        for g in np.unique(groups):
            m = groups == g
            _curve(y_true[m], y_prob[m], f"group={g}")
    ax.set_xlabel("mean predicted probability")
    ax.set_ylabel("observed frequency")
    ax.legend()
    return fig
125
+
126
+
127
def temperature_scale(logits, y_true, max_iter=200):
    """Fit a single temperature T>0 by minimizing NLL (Guo et al., 2017).

    Parameters
    ----------
    logits : array-like, 1-D (binary positive-class logit, paired with an
        implicit zero logit for class 0) or 2-D of shape (n, n_classes).
    y_true : 1-D array-like of integer class indices, one per row of ``logits``.
    max_iter : iteration cap handed to the Nelder-Mead optimizer.

    Returns
    -------
    (T, calibrated_probabilities)
        ``T`` is a float; the probabilities are 1-D (positive class) for 1-D
        input and shape (n, n_classes) for 2-D input.

    Raises
    ------
    ValueError
        If ``logits`` is not 1-D/2-D, ``y_true`` does not have exactly one label
        per row, or a label is outside ``[0, n_classes)``.
    """
    logits = np.asarray(logits, dtype=float)
    if logits.ndim not in (1, 2):
        raise ValueError("logits must be a 1-D or 2-D array")
    y = np.asarray(y_true)
    # A shorter y_true would otherwise be accepted silently (only the first
    # len(y) rows would be scored), so require an exact match.
    if y.ndim != 1 or len(y) != logits.shape[0]:
        raise ValueError("y_true must be 1-D with one label per row of logits")
    y = y.astype(int)
    binary_input = logits.ndim == 1
    logits2 = np.column_stack([np.zeros_like(logits), logits]) if binary_input else logits
    n_classes = logits2.shape[1]
    # Negative labels would wrap around under NumPy indexing and silently pick
    # the wrong class column.
    if np.any((y < 0) | (y >= n_classes)):
        raise ValueError(f"y_true labels must be integers in [0, {n_classes - 1}]")

    rows = np.arange(len(y))

    def _nll(log_t):
        t = np.exp(log_t[0])  # parameterize as exp() to keep T > 0
        z = logits2 / t
        z = z - z.max(axis=1, keepdims=True)  # shift for numerical stability
        logsumexp = np.log(np.exp(z).sum(axis=1))
        logp = z[rows, y] - logsumexp
        return -float(np.mean(logp))

    res = optimize.minimize(
        _nll,
        x0=[0.0],
        method="Nelder-Mead",
        options={"maxiter": max_iter, "xatol": 1e-4, "fatol": 1e-6},
    )
    temp = float(np.exp(res.x[0]))
    z = logits2 / temp
    z = z - z.max(axis=1, keepdims=True)
    probs = np.exp(z)
    probs /= probs.sum(axis=1, keepdims=True)
    calibrated = probs[:, 1] if binary_input else probs
    return temp, calibrated
158
+
159
+
160
def isotonic_recalibrate(probs, y_true):
    """Recalibrate probabilities with isotonic regression (Zadrozny & Elkan, 2002).

    Fits an ``IsotonicRegression`` (output bounded to [0, 1], out-of-range inputs
    clipped) mapping predicted probability to label, on the given data.

    Returns ``(fitted_model, calibrated_probabilities)`` where the calibrated
    values are the fitted model applied to the same ``probs``.
    """
    probs, y_true = _as_1d_arrays(probs, y_true)
    scores = _check_scores(probs)
    labels = _check_binary_values(y_true)
    iso = IsotonicRegression(out_of_bounds="clip", y_min=0.0, y_max=1.0)
    recalibrated = iso.fit_transform(scores, labels)
    return iso, recalibrated
@@ -0,0 +1,49 @@
1
+ """Multiple-comparison corrections. Outputs match
2
+ ``statsmodels.stats.multitest.multipletests`` for methods 'bonferroni' and 'fdr_bh'.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import numpy as np
8
+
9
+
10
def _check_pvals(p):
    """Return ``p`` as a 1-D float array of valid p-values.

    Raises
    ------
    ValueError
        If the input is not 1-D, contains NaN, or has a value outside [0, 1].
    """
    pvals = np.asarray(p, dtype=float)
    if pvals.ndim != 1:
        raise ValueError("p_values must be a 1-D array")
    if np.isnan(pvals).any():
        raise ValueError("p_values contain NaN")
    out_of_range = (pvals < 0) | (pvals > 1)
    if out_of_range.any():
        raise ValueError("p_values must lie in [0, 1]")
    return pvals
19
+
20
+
21
def bonferroni(p_values, alpha=0.05):
    """Bonferroni correction.

    Parameters
    ----------
    p_values : 1-D array-like of p-values in [0, 1].
    alpha : family-wise error rate.

    Returns
    -------
    dict
        ``adjusted`` (``min(p * m, 1)``), ``reject`` (``p <= alpha / m``) and
        ``threshold`` (``alpha / m``), with ``m`` the number of p-values. For
        empty input the arrays are empty and ``threshold`` is ``alpha`` (no
        hypotheses, hence nothing to correct for).

    Examples
    --------
    >>> bonferroni([0.01, 0.5], alpha=0.05)["adjusted"].tolist()
    [0.02, 1.0]
    """
    p = _check_pvals(p_values)
    m = len(p)
    # Guard the division: an empty family would otherwise raise ZeroDivisionError.
    threshold = alpha / m if m else alpha
    return {
        "adjusted": np.minimum(p * m, 1.0),
        "reject": p <= threshold,
        "threshold": threshold,
    }
36
+
37
+
38
def benjamini_hochberg(p_values, alpha=0.05):
    """Benjamini-Hochberg FDR correction (step-up).

    Returns a dict with ``adjusted`` (BH-adjusted p-values, in the input order)
    and ``reject`` (``adjusted <= alpha``).
    """
    pvals = _check_pvals(p_values)
    n_tests = len(pvals)
    ascending = np.argsort(pvals, kind="mergesort")
    ranks = np.arange(1, n_tests + 1)
    scaled = pvals[ascending] * n_tests / ranks
    # Running minimum taken from the largest p-value downwards makes the
    # adjusted values monotone in rank.
    monotone = np.minimum.accumulate(scaled[::-1])[::-1]
    capped = np.minimum(monotone, 1.0)
    adjusted = np.empty(n_tests, dtype=float)
    adjusted[ascending] = capped  # scatter back to the caller's ordering
    return {"adjusted": adjusted, "reject": adjusted <= alpha}
@@ -0,0 +1,151 @@
1
+ """DeLong (1988) confidence intervals and tests for AUC, via the fast midrank
2
+ algorithm of Sun & Xu (2014).
3
+
4
+ References
5
+ ----------
6
+ DeLong, E. R., DeLong, D. M., & Clarke-Pearson, D. L. (1988). Comparing the areas under
7
+ two or more correlated ROC curves. Biometrics, 44(3), 837-845.
8
+ Sun, X., & Xu, W. (2014). Fast implementation of DeLong's algorithm for comparing the
9
+ areas under correlated ROC curves. IEEE Signal Processing Letters, 21(11), 1389-1393.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import numpy as np
15
+ from scipy import stats
16
+
17
+ from ._utils import _as_1d_arrays, _check_binary_labels, _check_scores
18
+
19
+
20
def _compute_midrank(x: np.ndarray) -> np.ndarray:
    """Midranks (1-based ranks, ties share the average rank) of ``x``.

    Sorting dominates the cost; the tie scan afterwards is a single pass.
    NaN never compares equal to anything, so each NaN forms its own tie group
    (ranked wherever the sort places it) rather than stalling the scan.
    """
    order = np.argsort(x, kind="mergesort")
    sorted_x = x[order]
    n = len(x)
    midrank = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # Begin the scan one past ``start``: the element at ``start`` always
        # belongs to its own group, even when it is NaN (NaN != NaN), which
        # guarantees the outer loop advances.
        stop = start + 1
        while stop < n and sorted_x[stop] == sorted_x[start]:
            stop += 1
        # Average of the 1-based positions start+1 .. stop.
        midrank[start:stop] = 0.5 * (start + stop - 1) + 1.0
        start = stop
    out = np.empty(n, dtype=float)
    out[order] = midrank  # back to the original element order
    return out
36
+
37
+
38
def _fast_delong(sorted_scores: np.ndarray, n_pos: int):
    """Fast DeLong AUCs and covariance (Sun & Xu, 2014).

    ``sorted_scores`` has shape (k, N): one row per score vector, with the
    ``n_pos`` positive samples occupying the first columns.

    Returns ``(aucs, covariance)`` with shapes (k,) and (k, k).
    """
    n_curves, n_total = sorted_scores.shape
    n_neg = n_total - n_pos
    rank_pos = np.empty((n_curves, n_pos))
    rank_neg = np.empty((n_curves, n_neg))
    rank_all = np.empty((n_curves, n_total))
    for row, scores in enumerate(sorted_scores):
        rank_pos[row] = _compute_midrank(scores[:n_pos])
        rank_neg[row] = _compute_midrank(scores[n_pos:])
        rank_all[row] = _compute_midrank(scores)
    pooled_pos = rank_all[:, :n_pos]
    pooled_neg = rank_all[:, n_pos:]
    aucs = pooled_pos.sum(axis=1) / n_pos / n_neg - (n_pos + 1.0) / (2.0 * n_neg)
    # Structural components: one value per positive (v01) and per negative (v10).
    v01 = (pooled_pos - rank_pos) / n_neg
    v10 = 1.0 - (pooled_neg - rank_neg) / n_pos
    cov = np.cov(v01) / n_pos + np.cov(v10) / n_neg
    # np.cov yields a 0-d result for a single row; callers index it as 2-D.
    return aucs, np.atleast_2d(cov)
59
+
60
+
61
def _prepare(y: np.ndarray, score_list):
    """Reorder each score vector so positives come first.

    Returns ``(sorted_scores, n_pos)``: the reordered vectors stacked row-wise
    and the number of samples with label 1.
    """
    # Stable sort on -y puts label 1 (positives) first and keeps the original
    # relative order inside each class.
    positives_first = np.argsort(-y, kind="mergesort")
    rows = []
    for scores in score_list:
        rows.append(np.asarray(scores)[positives_first])
    n_pos = int(np.count_nonzero(y == 1))
    return np.vstack(rows), n_pos
66
+
67
+
68
def delong_auc_ci(y_true, y_score, alpha=0.05):
    """AUC with a DeLong (1 - alpha) normal-approximation confidence interval.

    Parameters
    ----------
    y_true : 1-D array-like of 0/1 labels; both classes must be present.
    y_score : 1-D array-like of scores aligned with ``y_true``.
    alpha : significance level of the two-sided interval.

    Returns
    -------
    dict
        ``auc``, ``ci_lower``, ``ci_upper`` (clipped to [0, 1]), ``se``,
        ``n_pos``, ``n_neg``. When the DeLong variance is undefined (it needs at
        least two positives and two negatives), ``se``, ``ci_lower`` and
        ``ci_upper`` are NaN rather than a misleading zero-width interval.

    Examples
    --------
    >>> import numpy as np
    >>> y = np.array([1, 1, 1, 1, 0, 0, 0, 0])
    >>> s = np.array([0.9, 0.8, 0.7, 0.4, 0.6, 0.5, 0.3, 0.2])
    >>> round(delong_auc_ci(y, s)["auc"], 3)
    0.875
    """
    y_true, y_score = _as_1d_arrays(y_true, y_score)
    y = _check_binary_labels(y_true)
    s = _check_scores(y_score)
    sorted_scores, n_pos = _prepare(y, [s])
    aucs, cov = _fast_delong(sorted_scores, n_pos)
    auc = float(aucs[0])
    var = float(cov[0, 0])
    if np.isfinite(var):
        # A variance of exactly zero is legitimate and gives a degenerate interval.
        se = float(np.sqrt(var)) if var > 0 else 0.0
        z = float(stats.norm.ppf(1 - alpha / 2.0))
        ci_lower = max(0.0, auc - z * se)
        ci_upper = min(1.0, auc + z * se)
    else:
        # A sample covariance over a single observation is NaN; report the
        # uncertainty as unknown instead of pretending it is zero.
        se = ci_lower = ci_upper = float("nan")
    return {
        "auc": auc,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper,
        "se": se,
        "n_pos": n_pos,
        "n_neg": len(y) - n_pos,
    }
98
+
99
+
100
def delong_paired_test(y_true, score_a, score_b):
    """Covariance-aware paired DeLong test for two scores on the SAME samples.

    Parameters
    ----------
    y_true : 1-D array-like of 0/1 labels; both classes must be present.
    score_a, score_b : 1-D array-likes of scores aligned with ``y_true``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta`` (auc_a - auc_b), ``z`` and the two-sided
        ``p_value``. If the variance of the difference is exactly zero, ``z`` is
        0.0 (p = 1). If it is undefined (fewer than two positives or two
        negatives), ``z`` and ``p_value`` are NaN rather than a fabricated
        "no difference" result.
    """
    y_true, score_a, score_b = _as_1d_arrays(y_true, score_a, score_b)
    y = _check_binary_labels(y_true)
    a = _check_scores(score_a)
    b = _check_scores(score_b)
    sorted_scores, n_pos = _prepare(y, [a, b])
    aucs, cov = _fast_delong(sorted_scores, n_pos)
    auc_a, auc_b = float(aucs[0]), float(aucs[1])
    delta = auc_a - auc_b
    # Var(A - B) = Var(A) + Var(B) - 2 Cov(A, B)
    var = float(cov[0, 0] + cov[1, 1] - 2.0 * cov[0, 1])
    if np.isfinite(var):
        se = float(np.sqrt(var)) if var > 0 else 0.0
        z = delta / se if se > 0 else 0.0
    else:
        z = float("nan")
    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": delta,
        "z": float(z),
        "p_value": float(2.0 * stats.norm.sf(abs(z))),
    }
123
+
124
+
125
def delong_unpaired_test(y_true_a, score_a, y_true_b, score_b):
    """Unpaired DeLong test for two INDEPENDENT samples (the cross-platform case).

    Each sample's AUC and standard error come from ``delong_auc_ci``; the
    standard errors are combined in quadrature.

    Returns: auc_a, auc_b, delta, z, p_value (two-sided). ``z`` is 0.0 when the
    combined standard error is not positive.
    """
    first = delong_auc_ci(y_true_a, score_a)
    second = delong_auc_ci(y_true_b, score_b)
    diff = first["auc"] - second["auc"]
    combined_se = float(np.sqrt(first["se"] ** 2 + second["se"] ** 2))
    if combined_se > 0:
        z_stat = diff / combined_se
    else:
        z_stat = 0.0
    p_two_sided = float(2.0 * stats.norm.sf(abs(z_stat)))
    return {
        "auc_a": first["auc"],
        "auc_b": second["auc"],
        "delta": diff,
        "z": float(z_stat),
        "p_value": p_two_sided,
    }
139
+
140
+
141
def delong_by_group(y_true, y_score, groups, alpha=0.05):
    """DeLong AUC confidence interval for every subgroup.

    Returns ``{group: ci_dict}`` where each value is the dict produced by
    ``delong_auc_ci`` on that subgroup's rows.

    Raises
    ------
    ValueError
        If a subgroup fails validation; the message is prefixed with the
        offending subgroup.
    """
    y_true, y_score, groups = _as_1d_arrays(y_true, y_score, groups)
    results = {}
    for label in np.unique(groups):
        in_group = groups == label
        try:
            ci = delong_auc_ci(y_true[in_group], y_score[in_group], alpha=alpha)
        except ValueError as exc:
            # Re-raise with the subgroup named so the caller can locate the problem.
            raise ValueError(f"subgroup {label!r}: {exc}") from exc
        results[label] = ci
    return results
@@ -0,0 +1,96 @@
1
+ """Subgroup performance and fairness metrics.
2
+
3
+ Disparate impact is the symmetric ratio of positive-prediction rates, and equalized odds
4
+ difference is the absolute true-positive-rate gap, both as defined in the CPFE paper
5
+ (Pall & Yadav). Note: this EOD follows the paper's definition |TPR_a - TPR_b|; some
6
+ toolkits (e.g. Fairlearn) instead report max(|TPR diff|, |FPR diff|).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import numpy as np
12
+ from sklearn.metrics import brier_score_loss, f1_score, roc_auc_score
13
+
14
+ from ._utils import _as_1d_arrays, _check_binary_labels, _check_binary_values
15
+
16
+
17
def _group_mask(groups, g):
    """Boolean mask selecting the entries of ``groups`` equal to ``g``.

    Raises ``ValueError`` when no entry matches.
    """
    selected = groups == g
    if np.any(selected):
        return selected
    raise ValueError(f"group {g!r} not present in `groups`")
22
+
23
+
24
def disparate_impact(y_pred, groups, group_a, group_b):
    """Symmetric disparate impact between two groups: min(rate_a/rate_b, rate_b/rate_a).

    ``rate = P(y_pred == 1 | group)`` over hard 0/1 predictions. The result lies in
    [0, 1]; < 0.80 violates the four-fifths rule and < 0.50 is a severe disparity
    (interpretation per the CPFE paper). Two zero rates count as equal treatment
    (1.0); exactly one zero rate is the maximal disparity (0.0).

    Raises
    ------
    ValueError
        If the inputs fail validation or either group is absent from ``groups``.

    Examples
    --------
    >>> import numpy as np
    >>> y_pred = np.array([1, 1, 0, 0, 1, 0, 0, 0])
    >>> g = np.array(["A", "A", "A", "A", "B", "B", "B", "B"])
    >>> round(disparate_impact(y_pred, g, "A", "B"), 3)
    0.5
    """
    y_pred, groups = _as_1d_arrays(y_pred, groups)
    preds = _check_binary_values(y_pred)
    rate_a = float(np.mean(preds[_group_mask(groups, group_a)] == 1))
    rate_b = float(np.mean(preds[_group_mask(groups, group_b)] == 1))
    if rate_a == 0.0 or rate_b == 0.0:
        # Both zero -> equal (1.0); only one zero -> maximal disparity (0.0).
        return 1.0 if rate_a == rate_b else 0.0
    forward = rate_a / rate_b
    backward = rate_b / rate_a
    return float(min(forward, backward))
49
+
50
+
51
def equalized_odds_difference(y_true, y_pred, groups, group_a, group_b):
    """|TPR_a - TPR_b|, the equalized odds difference as defined in the CPFE paper.

    ``TPR = P(y_pred == 1 | y_true == 1, group)``.

    Raises
    ------
    ValueError
        If the inputs fail validation, a group is absent from ``groups``, or a
        group has no positive labels (its TPR is undefined).
    """
    y_true, y_pred, groups = _as_1d_arrays(y_true, y_pred, groups)
    truth = _check_binary_values(y_true)
    preds = _check_binary_values(y_pred)

    rates = []
    for g in (group_a, group_b):
        in_group = _group_mask(groups, g)
        positives = (truth == 1) & in_group
        if positives.sum() == 0:
            raise ValueError(f"group {g!r} has no positive labels; TPR is undefined")
        rates.append(float(np.mean(preds[positives] == 1)))

    tpr_a, tpr_b = rates
    return float(abs(tpr_a - tpr_b))
69
+
70
+
71
def subgroup_metrics(y_true, y_score, groups, threshold=0.5):
    """Per-subgroup discrimination metrics: ``{group: {auc, brier, f1, n}}``.

    ``y_score`` holds predicted probabilities in [0, 1] (the Brier score assumes a
    probability). Scores ``>= threshold`` count as positive predictions for F1, and
    ``n`` is the subgroup size.

    Raises
    ------
    ValueError
        If the labels are not binary with both classes present, ``y_score``
        contains NaN, or any subgroup is single-class (AUC undefined).
    """
    y_true, y_score, groups = _as_1d_arrays(y_true, y_score, groups)
    labels = _check_binary_labels(y_true)
    scores = np.asarray(y_score, dtype=float)
    if np.isnan(scores).any():
        raise ValueError("y_score contains NaN")
    hard_preds = (scores >= threshold).astype(int)

    report = {}
    for g in np.unique(groups):
        sel = groups == g
        y_g = labels[sel]
        s_g = scores[sel]
        if len(np.unique(y_g)) < 2:
            raise ValueError(f"subgroup {g!r} has a single class; AUC undefined")
        report[g] = {
            "auc": float(roc_auc_score(y_g, s_g)),
            "brier": float(brier_score_loss(y_g, s_g)),
            "f1": float(f1_score(y_g, hard_preds[sel], zero_division=0)),
            "n": int(sel.sum()),
        }
    return report
@@ -0,0 +1,9 @@
1
+ """Cross-node (federated / multi-site) fairness auditing built on fairscope.core.
2
+
3
+ AUDITS per-node predictions of an already-trained model. It does NOT perform federated
4
+ training and provides NO privacy guarantees.
5
+ """
6
+
7
+ from .audit import FederatedFairnessAudit, FederatedReport
8
+
9
+ __all__ = ["FederatedFairnessAudit", "FederatedReport"]