fairscope 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fairscope/__init__.py +55 -0
- fairscope/core/__init__.py +48 -0
- fairscope/core/_utils.py +44 -0
- fairscope/core/bootstrap.py +66 -0
- fairscope/core/calibration.py +168 -0
- fairscope/core/correction.py +49 -0
- fairscope/core/delong.py +151 -0
- fairscope/core/metrics.py +96 -0
- fairscope/federated/__init__.py +9 -0
- fairscope/federated/audit.py +217 -0
- fairscope/healthcare/__init__.py +5 -0
- fairscope/healthcare/audit.py +227 -0
- fairscope/lending/__init__.py +5 -0
- fairscope/lending/audit.py +160 -0
- fairscope/nlp/__init__.py +29 -0
- fairscope/nlp/attribution.py +54 -0
- fairscope/nlp/cross_platform.py +178 -0
- fairscope/nlp/metrics.py +87 -0
- fairscope/nlp/significance.py +51 -0
- fairscope-0.3.0.dist-info/METADATA +202 -0
- fairscope-0.3.0.dist-info/RECORD +23 -0
- fairscope-0.3.0.dist-info/WHEEL +4 -0
- fairscope-0.3.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
"""One-call cross-node fairness audit. Composes fairscope.core; invents no statistics.
|
|
2
|
+
|
|
3
|
+
Per node: DeLong AUC confidence interval -> Expected Calibration Error -> Brier/F1.
|
|
4
|
+
Across nodes: max-min AUC gap and Bonferroni-corrected pairwise (unpaired) DeLong tests.
|
|
5
|
+
An optional per-node recalibration step reports pre/post ECE. Mirrors the cross-node
|
|
6
|
+
evaluation in the privacy-preserving federated-learning study.
|
|
7
|
+
|
|
8
|
+
IMPORTANT: this module audits per-node PREDICTIONS only. It does not train models, perform
|
|
9
|
+
secure aggregation, or provide any privacy guarantee.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from itertools import combinations
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
import pandas as pd
|
|
18
|
+
from sklearn.metrics import brier_score_loss, f1_score
|
|
19
|
+
|
|
20
|
+
from ..core import (
|
|
21
|
+
bonferroni,
|
|
22
|
+
delong_auc_ci,
|
|
23
|
+
delong_unpaired_test,
|
|
24
|
+
expected_calibration_error,
|
|
25
|
+
isotonic_recalibrate,
|
|
26
|
+
temperature_scale,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FederatedFairnessAudit:
    """Cross-node fairness audit over per-node predictions.

    Parameters
    ----------
    node_data : dict ``{node_name: (y_true, y_score)}`` where ``y_score`` is the
        positive-class probability for that node's evaluation samples.
    n_bins : int, bin count forwarded to the ECE computation.
    alpha : float, level forwarded to the DeLong CIs and the Bonferroni correction.
    """

    def __init__(self, node_data, *, n_bins=10, alpha=0.05):
        # Normalise once up front: labels become an array, scores a float array.
        normalised = {}
        for name, (labels, scores) in node_data.items():
            normalised[name] = (np.asarray(labels), np.asarray(scores, dtype=float))
        self.node_data = normalised
        self.n_bins = n_bins
        self.alpha = alpha

    def run(self) -> FederatedReport:
        """Compute per-node metrics and pairwise tests; return a ``FederatedReport``.

        Raises ``ValueError`` naming the node when a node's labels contain fewer than
        two distinct values, because AUC is undefined in that case.
        """
        per_node = {}
        for name, (labels, scores) in self.node_data.items():
            if np.unique(labels).size < 2:
                raise ValueError(f"node {name!r} has a single class; AUC undefined")
            interval = delong_auc_ci(labels, scores, alpha=self.alpha)
            calibration_error = expected_calibration_error(labels, scores, n_bins=self.n_bins)
            per_node[name] = {
                "ci": interval,
                "ece": float(calibration_error),
                "n": int(len(labels)),
                "brier": float(brier_score_loss(labels, scores)),
                # F1 is computed on hard predictions at a 0.5 threshold.
                "f1": float(f1_score(labels, (scores >= 0.5).astype(int), zero_division=0)),
            }

        # Every unordered pair of nodes, in sorted-name order, gets one unpaired test.
        pairs = list(combinations(sorted(per_node), 2))
        pvals = [
            delong_unpaired_test(*self.node_data[left], *self.node_data[right])["p_value"]
            for left, right in pairs
        ]

        # With fewer than two nodes there is nothing to correct.
        corrected = None
        if pvals:
            corrected = bonferroni(np.array(pvals), alpha=self.alpha)

        return FederatedReport(
            per_node,
            pairs,
            pvals,
            corrected,
            node_data=self.node_data,
            alpha=self.alpha,
            n_bins=self.n_bins,
        )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class FederatedReport:
    """Hold cross-node audit results and render them as tables, plots and a PDF.

    Parameters
    ----------
    per_node : dict ``{node: {"ci", "ece", "n", "brier", "f1"}}`` where ``ci`` is a
        mapping carrying ``auc``, ``ci_lower`` and ``ci_upper``.
    pairs : list of ``(node_a, node_b)`` tuples, one per pairwise test.
    pvals : list of raw p-values aligned with ``pairs``.
    corrected : Bonferroni result indexable by ``"adjusted"`` and ``"reject"``
        (aligned with ``pairs``), or ``None`` when no pairwise test was run.
    node_data : dict ``{node: (y_true, y_score)}`` kept for recalibration and
        reliability curves.
    alpha : float, significance level the audit was run at.
    n_bins : int, bin count used for ECE and reliability curves.
    """

    def __init__(self, per_node, pairs, pvals, corrected, *, node_data, alpha, n_bins):
        self.per_node = per_node
        self.pairs = pairs
        self.pvals = pvals
        self.corrected = corrected
        self._node_data = node_data
        self.alpha = alpha
        self.n_bins = n_bins

    def _ci_label(self) -> str:
        """Return the confidence-level label implied by ``alpha`` (e.g. '95% DeLong CI')."""
        return f"{100 * (1 - self.alpha):g}% DeLong CI"

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per node with n, AUC, CI bounds, ECE, Brier and F1."""
        rows = [
            {
                "node": node,
                "n": r["n"],
                "auc": r["ci"]["auc"],
                "ci_lower": r["ci"]["ci_lower"],
                "ci_upper": r["ci"]["ci_upper"],
                "ece": r["ece"],
                "brier": r["brier"],
                "f1": r["f1"],
            }
            for node, r in self.per_node.items()
        ]
        return pd.DataFrame(rows)

    def disparity(self) -> dict:
        """Cross-node AUC disparity summary.

        Returns a dict with ``max_auc_gap`` (best AUC minus worst AUC), ``best``,
        ``worst`` and ``worst_pair`` (``(worst, best)``).

        Raises ``ValueError`` when the report holds no nodes.
        """
        if not self.per_node:
            # Same exception type max()/min() would raise, but with a usable message.
            raise ValueError("cannot compute AUC disparity: the report contains no nodes")
        aucs = {n: r["ci"]["auc"] for n, r in self.per_node.items()}
        hi = max(aucs, key=aucs.get)
        lo = min(aucs, key=aucs.get)
        return {
            "max_auc_gap": aucs[hi] - aucs[lo],
            "best": hi,
            "worst": lo,
            "worst_pair": (lo, hi),
        }

    def recalibrate(self, method="temperature") -> dict:
        """Recalibrate each node on its own (y, score) and report pre/post ECE.

        ``method`` is 'temperature' (Guo et al., 2017) or 'isotonic' (Zadrozny &
        Elkan, 2002) — both standard methods from ``fairscope.core``. Temperature
        scaling operates on logits, so the per-node probabilities are converted to
        logits first. Returns ``{node: {"ece_pre": float, "ece_post": float}}``.

        NOTE: this fits and evaluates on the same per-node data (an in-sample
        diagnostic). For a deployment estimate, recalibrate on a held-out split.

        Raises ``ValueError`` for any other ``method``.
        """
        if method not in ("temperature", "isotonic"):
            raise ValueError(f"unknown method: {method!r}; use 'temperature' or 'isotonic'")
        out = {}
        for node, (y, s) in self._node_data.items():
            pre = expected_calibration_error(y, s, n_bins=self.n_bins)
            if method == "temperature":
                # Clip away exact 0/1 so the logit transform stays finite.
                p = np.clip(s, 1e-7, 1 - 1e-7)
                logits = np.log(p / (1 - p))
                _, s_cal = temperature_scale(logits, y)
            else:  # isotonic
                _, s_cal = isotonic_recalibrate(s, y)
            post = expected_calibration_error(y, s_cal, n_bins=self.n_bins)
            out[node] = {"ece_pre": float(pre), "ece_post": float(post)}
        return out

    def plot_auc_forest(self):
        """Forest plot of per-node AUC with DeLong CIs at level ``1 - alpha``.

        Returns a Figure.
        """
        import matplotlib.pyplot as plt

        df = self.to_dataframe()
        ys = np.arange(len(df))
        # Asymmetric error bars: distance from the AUC down to / up to each bound.
        xerr = np.vstack([df.auc - df.ci_lower, df.ci_upper - df.auc])
        fig, ax = plt.subplots(figsize=(6, 0.5 * len(df) + 1.5))
        ax.errorbar(df.auc, ys, xerr=xerr, fmt="o", capsize=3)
        ax.set_yticks(ys)
        ax.set_yticklabels(df.node)
        # The label follows alpha rather than always claiming 95%.
        ax.set_xlabel(f"AUC ({self._ci_label()})")
        ax.axvline(0.5, color="gray", linestyle="--", linewidth=1)
        ax.set_title("Per-node discrimination")
        fig.tight_layout()
        return fig

    def plot_calibration(self):
        """Per-node reliability curves drawn with ``core.reliability_diagram``
        (federated retains each node's (y, score), so these are true curves).
        Returns a Figure."""
        from ..core import reliability_diagram

        ys, ss, labels = [], [], []
        for node, (y, s) in self._node_data.items():
            ys.append(np.asarray(y))
            ss.append(np.asarray(s))
            # One group label per sample so the diagram can split by node.
            labels.append(np.full(len(y), node))
        return reliability_diagram(
            np.concatenate(ys),
            np.concatenate(ss),
            groups=np.concatenate(labels),
            n_bins=self.n_bins,
        )

    def to_pdf(self, path):
        """Write a multi-page PDF: summary, per-node AUC forest, per-node calibration.
        Uses matplotlib only (no extra dependency)."""
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_pdf import PdfPages

        def _emit(pdf, fig):
            # Close the figure even if saving fails so it does not linger in pyplot.
            try:
                pdf.savefig(fig)
            finally:
                plt.close(fig)

        with PdfPages(path) as pdf:
            text = self.summary()
            fig0, ax = plt.subplots(figsize=(8.5, 11))
            ax.axis("off")
            ax.text(0.02, 0.98, text, family="monospace", va="top", fontsize=8)
            _emit(pdf, fig0)

            _emit(pdf, self.plot_auc_forest())

            cal = self.plot_calibration()
            cal.axes[0].set_title("Per-node calibration")
            _emit(pdf, cal)

    def summary(self) -> str:
        """Return the per-node table, the AUC gap line and any significant pairs."""
        lines = [self.to_dataframe().to_string(index=False)]
        d = self.disparity()
        lines.append(f"cross-node AUC gap: {d['best']} vs {d['worst']} = {d['max_auc_gap']:.3f}")
        if self.corrected is not None:
            for (a, b), padj, rej in zip(
                self.pairs, self.corrected["adjusted"], self.corrected["reject"]
            ):
                # Only pairs that survive the Bonferroni correction are listed.
                if rej:
                    lines.append(f" {a} vs {b}: Bonferroni-adjusted p={padj:.4g} (significant)")
        return "\n".join(lines)
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""One-call clinical fairness audit. Composes fairscope.core; invents no statistics.
|
|
2
|
+
|
|
3
|
+
Pipeline per protected attribute: per-subgroup DeLong AUC CIs -> per-subgroup ECE ->
|
|
4
|
+
per-subgroup Brier/F1 -> Bonferroni-corrected pairwise (unpaired) DeLong tests across the
|
|
5
|
+
attribute's subgroups. Mirrors the analysis in the diabetes paper (IEEE CIPHER 2026).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from itertools import combinations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from ..core import (
|
|
16
|
+
bonferroni,
|
|
17
|
+
delong_by_group,
|
|
18
|
+
delong_unpaired_test,
|
|
19
|
+
ece_by_group,
|
|
20
|
+
subgroup_metrics,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HealthcareFairnessAudit:
    """Audit a fitted classifier (or precomputed scores) for subgroup fairness.

    Parameters
    ----------
    model : object with ``predict_proba`` (positive-class probability in column 1), or
        ``None`` when using :meth:`from_scores`.
    X_test, y_test : test features and binary labels.
    protected_attr : dict ``{attribute_name: 1-D array of subgroup labels per sample}``.
    n_bins : int, ECE bin count.
    alpha : float, level for the DeLong CIs and the Bonferroni correction.
    """

    def __init__(self, model, X_test, y_test, protected_attr, *, n_bins=10, alpha=0.05):
        self.model = model
        self.X_test = X_test
        self.y_test = np.asarray(y_test)
        self.protected_attr = protected_attr
        self.n_bins = n_bins
        self.alpha = alpha
        # Set only by from_scores(); when present it bypasses the model entirely.
        self._scores = None

    @classmethod
    def from_scores(cls, y_true, y_score, protected_attr, *, n_bins=10, alpha=0.05):
        """Build an audit from precomputed positive-class probabilities (no model)."""
        obj = cls(None, None, y_true, protected_attr, n_bins=n_bins, alpha=alpha)
        obj._scores = np.asarray(y_score, dtype=float)
        return obj

    def _get_scores(self):
        """Return precomputed scores if any, else column 1 of ``model.predict_proba``.

        Raises ``ValueError`` when there are no precomputed scores and the model is
        missing or lacks ``predict_proba``.
        """
        if self._scores is not None:
            return self._scores
        if self.model is None or not hasattr(self.model, "predict_proba"):
            raise ValueError(
                "model must implement predict_proba(); or use "
                "HealthcareFairnessAudit.from_scores(...)"
            )
        return np.asarray(self.model.predict_proba(self.X_test))[:, 1]

    def run(self) -> HealthcareReport:
        """Run the audit and return a :class:`HealthcareReport`.

        Raises a clear, attribute-named ``ValueError`` if a subgroup is single-class
        (AUC undefined) rather than letting a low-level error surface.
        """
        y = self.y_test
        s = self._get_scores()
        results = {}
        for name, groups in self.protected_attr.items():
            groups = np.asarray(groups)
            try:
                cis = delong_by_group(y, s, groups, alpha=self.alpha)
                eces = ece_by_group(y, s, groups, n_bins=self.n_bins)
                mets = subgroup_metrics(y, s, groups)
            except ValueError as exc:
                # Prefix the attribute name so the caller knows which one failed.
                raise ValueError(f"protected attribute {name!r}: {exc}") from exc
            vals = sorted(np.unique(groups).tolist())
            pairs, pvals, deltas = [], [], []
            for a, b in combinations(vals, 2):
                ma, mb = groups == a, groups == b
                res = delong_unpaired_test(y[ma], s[ma], y[mb], s[mb])
                pairs.append((a, b))
                pvals.append(res["p_value"])
                deltas.append(res["delta"])
            # None means "no pairs to correct" (an attribute with a single subgroup).
            corrected = bonferroni(np.array(pvals), alpha=self.alpha) if pvals else None
            has_correction = corrected is not None
            results[name] = {
                "groups": vals,
                "ci": cis,
                "ece": eces,
                "metrics": mets,
                "pairs": pairs,
                "delta": deltas,
                "p_value": pvals,
                "p_adjusted": corrected["adjusted"].tolist() if has_correction else [],
                "reject": corrected["reject"].tolist() if has_correction else [],
            }
        return HealthcareReport(
            results, y, s, self.protected_attr, alpha=self.alpha, n_bins=self.n_bins
        )

    def shap_summary(self, max_samples=200):
        """Mean absolute SHAP value per feature (optional). Requires
        ``pip install fairscope[shap]`` and a model (not ``from_scores``). Returns a dict
        ``{feature_index: mean_abs_shap}``.

        Raises ``ValueError`` when the audit has no model and ``ImportError`` when
        the optional ``shap`` dependency is missing.
        """
        # Check the model first: installing shap cannot fix a model-less audit, so
        # the missing-dependency message would send the user the wrong way.
        if self.model is None:
            raise ValueError("shap_summary requires a model (not from_scores).")
        try:
            import shap  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "SHAP summary requires the optional dependency: pip install fairscope[shap]"
            ) from exc
        return _shap_mean_abs(self.model, self.X_test, max_samples)  # pragma: no cover
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class HealthcareReport:
    """Holds audit results and the raw (y, score, groups) needed to render reliability
    curves; provides tables, plots and a PDF export.

    Parameters
    ----------
    results : dict ``{attribute: {...}}`` with, per attribute, the keys ``groups``,
        ``ci``, ``ece``, ``metrics``, ``pairs``, ``p_adjusted`` and ``reject`` read by
        this class.
    y_true, y_score : labels and positive-class scores for all samples.
    protected_attr : dict ``{attribute: per-sample subgroup labels}``.
    alpha : float, significance level the audit was run at.
    n_bins : int, bin count used for reliability curves.
    """

    def __init__(self, results, y_true, y_score, protected_attr, *, alpha=0.05, n_bins=10):
        self.results = results
        self.y_true = np.asarray(y_true)
        self.y_score = np.asarray(y_score)
        self.protected_attr = protected_attr
        self.alpha = alpha
        self.n_bins = n_bins

    def _ci_label(self) -> str:
        """Return the confidence-level label implied by ``alpha`` (e.g. '95% DeLong CI')."""
        return f"{100 * (1 - self.alpha):g}% DeLong CI"

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per (attribute, subgroup) with n, AUC, CI, ECE, Brier, F1."""
        rows = []
        for attr, r in self.results.items():
            for g in r["groups"]:
                ci = r["ci"][g]
                m = r["metrics"][g]
                rows.append(
                    {
                        "attribute": attr,
                        "group": g,
                        "n": m["n"],
                        "auc": ci["auc"],
                        "ci_lower": ci["ci_lower"],
                        "ci_upper": ci["ci_upper"],
                        "ece": r["ece"][g],
                        "brier": m["brier"],
                        "f1": m["f1"],
                    }
                )
        return pd.DataFrame(rows)

    def summary(self) -> str:
        """Return a human-readable summary string (no print side effect)."""
        df = self.to_dataframe()
        lines = [df.to_string(index=False)]
        for attr, r in self.results.items():
            aucs = {g: r["ci"][g]["auc"] for g in r["groups"]}
            hi, lo = max(aucs, key=aucs.get), min(aucs, key=aucs.get)
            lines.append(
                f"[{attr}] largest AUC gap: {hi} {aucs[hi]:.3f} vs "
                f"{lo} {aucs[lo]:.3f} (delta={aucs[hi] - aucs[lo]:.3f})"
            )
            # Only pairs that survive the Bonferroni correction are listed.
            for (a, b), padj, rej in zip(r["pairs"], r["p_adjusted"], r["reject"]):
                if rej:
                    lines.append(f" {a} vs {b}: Bonferroni-adjusted p={padj:.4g} (significant)")
        return "\n".join(lines)

    def plot_auc_forest(self, attribute=None):
        """Forest plot of per-subgroup AUC with DeLong CIs at level ``1 - alpha``.

        ``attribute`` restricts the plot to one protected attribute; ``None`` plots
        all of them. Returns a Figure.

        Raises ``KeyError`` when ``attribute`` is not part of the audit, matching
        :meth:`plot_calibration` instead of silently drawing an empty plot.
        """
        # Validate before importing matplotlib so a bad name fails fast and cheaply.
        if attribute is not None and attribute not in self.results:
            raise KeyError(f"unknown protected attribute: {attribute!r}")

        import matplotlib.pyplot as plt

        df = self.to_dataframe()
        if attribute is not None:
            df = df[df.attribute == attribute]
        labels = [f"{a}:{g}" for a, g in zip(df.attribute, df.group)]
        ys = np.arange(len(df))
        # Asymmetric error bars: distance from the AUC down to / up to each bound.
        xerr = np.vstack([df.auc - df.ci_lower, df.ci_upper - df.auc])
        fig, ax = plt.subplots(figsize=(6, 0.5 * len(df) + 1.5))
        ax.errorbar(df.auc, ys, xerr=xerr, fmt="o", capsize=3)
        ax.set_yticks(ys)
        ax.set_yticklabels(labels)
        # The label follows alpha rather than always claiming 95%.
        ax.set_xlabel(f"AUC ({self._ci_label()})")
        ax.axvline(0.5, color="gray", linestyle="--", linewidth=1)
        ax.set_title("Per-subgroup discrimination")
        fig.tight_layout()
        return fig

    def plot_calibration(self, attribute=None):
        """Reliability curves per subgroup for one attribute, drawn with
        ``core.reliability_diagram``. Defaults to the first protected attribute.
        Returns a Figure."""
        from ..core import reliability_diagram

        attr = attribute if attribute is not None else next(iter(self.protected_attr))
        groups = np.asarray(self.protected_attr[attr])
        return reliability_diagram(self.y_true, self.y_score, groups=groups, n_bins=self.n_bins)

    def to_pdf(self, path):
        """Write a multi-page PDF: summary, AUC forest, and one calibration page per
        attribute. Uses matplotlib only (no extra dependency)."""
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_pdf import PdfPages

        def _emit(pdf, fig):
            # Close the figure even if saving fails so it does not linger in pyplot.
            try:
                pdf.savefig(fig)
            finally:
                plt.close(fig)

        with PdfPages(path) as pdf:
            text = self.summary()
            fig0, ax = plt.subplots(figsize=(8.5, 11))
            ax.axis("off")
            ax.text(0.02, 0.98, text, family="monospace", va="top", fontsize=8)
            _emit(pdf, fig0)

            _emit(pdf, self.plot_auc_forest())

            for attr in self.protected_attr:
                cal = self.plot_calibration(attr)
                cal.axes[0].set_title(f"Calibration: {attr}")
                _emit(pdf, cal)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _shap_mean_abs(model, X_test, max_samples): # pragma: no cover - needs optional shap
|
|
218
|
+
import shap
|
|
219
|
+
|
|
220
|
+
if model is None:
|
|
221
|
+
raise ValueError("shap_summary requires a model (not from_scores).")
|
|
222
|
+
X = np.asarray(X_test)[:max_samples]
|
|
223
|
+
explainer = shap.Explainer(model.predict_proba, X)
|
|
224
|
+
values = explainer(X).values
|
|
225
|
+
vals = np.abs(values).mean(axis=0)
|
|
226
|
+
vals = vals[:, 1] if vals.ndim == 2 else vals
|
|
227
|
+
return {i: float(v) for i, v in enumerate(np.ravel(vals))}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""One-call lending fairness audit.
|
|
2
|
+
|
|
3
|
+
Annual approval-gap analysis composes fairscope.core and is purely descriptive (no causal
|
|
4
|
+
claim). Subgroup CATE estimation (Task 2) wraps econml under stated DML assumptions.
|
|
5
|
+
Mirrors the analyses in P1 (Causal Forest DML, HMDA) and P2 (descriptive disparities).
|
|
6
|
+
Ships no HMDA data and no model.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from ..core import disparate_impact
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LendingFairnessAudit:
    """Annual subgroup-disparity audit of mortgage-approval outcomes.

    Parameters
    ----------
    approved : 1-D binary array (1 = approved).
    group : 1-D array of protected-group labels, aligned with ``approved``.
    year : 1-D array of years, aligned with ``approved``.
    reference : the group label to compare every other group against. Must be present
        in each year.
    alpha : significance level (reserved for the CATE step).
    """

    def __init__(self, *, approved, group, year, reference, alpha=0.05):
        self.approved, self.group, self.year = (
            np.asarray(approved),
            np.asarray(group),
            np.asarray(year),
        )
        self.reference = reference
        self.alpha = alpha

    @classmethod
    def from_outcomes(cls, approved, group, year, *, reference, alpha=0.05):
        """Positional-friendly constructor for precomputed approval outcomes."""
        return cls(approved=approved, group=group, year=year, reference=reference, alpha=alpha)

    def estimate_cate(
        self,
        X,
        *,
        treatment=None,
        outcome=None,
        model_y=None,
        model_t=None,
        n_estimators=500,
        random_state=0,
    ):
        """Estimate the per-sample conditional average treatment effect (CATE) of the
        protected attribute on approval with ``econml.dml.CausalForestDML`` (as in P1).

        The CAUSAL CLAIM IS CONDITIONAL on the DML assumptions (unconfoundedness given
        the supplied features ``X``, and overlap). This estimates an effect under those
        assumptions; it does not, on its own, prove discrimination.

        Parameters
        ----------
        X : array (n, k) of heterogeneity features.
        treatment : binary array; defaults to ``group != reference``.
        outcome : binary array; defaults to the ``approved`` outcomes.
        model_y, model_t : nuisance estimators; default to random forests.

        Returns
        -------
        dict
            Keys ``ate``, ``effect``, and ``effect_interval``.

        Notes
        -----
        Requires the optional dependency ``econml`` (``pip install fairscope[lending]``).
        """
        try:
            import econml.dml  # noqa: F401
        except ImportError as exc:  # optional dependency
            hint = "Subgroup CATE requires the optional dependency: pip install fairscope[lending]"
            raise ImportError(hint) from exc
        return _causal_forest_cate(  # pragma: no cover - exercised only with econml installed
            self, X, treatment, outcome, model_y, model_t, n_estimators, random_state
        )

    def run(self) -> LendingReport:
        """Build the (year, group) approval-rate / disparate-impact table.

        Each year is processed on its own; within a year every group present gets a
        row with its sample count, approval rate and disparate impact relative to
        ``reference``.
        """
        records = []
        for current_year in sorted(np.unique(self.year).tolist()):
            in_year = self.year == current_year
            outcomes, labels = self.approved[in_year], self.group[in_year]
            for label in sorted(np.unique(labels).tolist()):
                members = labels == label
                approval_rate = float(outcomes[members].mean())
                impact = float(disparate_impact(outcomes, labels, label, self.reference))
                records.append(
                    {
                        "year": current_year,
                        "group": label,
                        "n": int(members.sum()),
                        "approval_rate": approval_rate,
                        "disparate_impact": impact,
                    }
                )
        table = pd.DataFrame(records)
        return LendingReport(table, reference=self.reference, alpha=self.alpha)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class LendingReport:
    """Holds the annual approval-gap table (CATE results are returned separately).

    Parameters
    ----------
    df : DataFrame with the columns ``year``, ``group``, ``approval_rate`` and
        ``disparate_impact`` read by :meth:`summary`.
    reference : the reference group label; its rows are excluded from the
        "largest gap" line.
    alpha : significance level carried along with the report.
    """

    def __init__(self, df, *, reference, alpha=0.05):
        self._df = df
        self.reference = reference
        self.alpha = alpha

    def to_dataframe(self) -> pd.DataFrame:
        """Return a copy of the table so callers cannot mutate the report."""
        return self._df.copy()

    def summary(self) -> str:
        """Return the table as text plus a line naming the lowest disparate impact.

        An empty table (for example from an audit with no samples) yields only the
        table text instead of failing on the missing columns.
        """
        lines = [self._df.to_string(index=False)]
        # An empty frame may have no columns at all, so stop before touching them.
        if self._df.empty:
            return "\n".join(lines)
        nonref = self._df[self._df["group"] != self.reference]
        if not nonref.empty:
            # "Largest gap" is the non-reference row with the smallest DI ratio.
            worst = nonref.nsmallest(1, "disparate_impact").iloc[0]
            lines.append(
                f"largest gap: {worst['group']} in {int(worst['year'])} "
                f"DI={worst['disparate_impact']:.3f} (approval {worst['approval_rate']:.3f})"
            )
        return "\n".join(lines)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _causal_forest_cate(  # pragma: no cover - exercised only with econml installed
    audit, X, treatment, outcome, model_y, model_t, n_estimators, random_state
):
    """Causal Forest DML CATE. Isolated so the optional-econml path is excluded from
    coverage in CI (econml is not installed there); the importorskip tests exercise it
    locally and for any contributor who installs ``fairscope[lending]``.

    Parameters
    ----------
    audit : object exposing ``group``, ``reference``, ``approved`` and ``alpha``.
    X : array-like of heterogeneity features.
    treatment : binary array, or ``None`` to use ``audit.group != audit.reference``.
    outcome : array, or ``None`` to use ``audit.approved``.
    model_y, model_t : nuisance estimators, or ``None`` for random-forest defaults.
    n_estimators, random_state : forwarded to ``CausalForestDML``.

    Returns
    -------
    dict
        ``ate`` (float), ``effect`` (per-row effects for ``X``) and
        ``effect_interval`` (``(lower, upper)`` at level ``audit.alpha``).
    """
    from econml.dml import CausalForestDML
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    if treatment is None:
        t = (audit.group != audit.reference).astype(int)
    else:
        t = np.asarray(treatment)
    y = audit.approved if outcome is None else np.asarray(outcome)
    x = np.asarray(X)

    # Compare against None instead of using ``or``: truth-testing an sklearn ensemble
    # calls its ``__len__``, which needs the fitted ``estimators_`` attribute and so
    # raises on a user-supplied, not-yet-fitted forest.
    if model_y is None:
        model_y = RandomForestRegressor(random_state=random_state)
    if model_t is None:
        model_t = RandomForestClassifier(random_state=random_state)

    est = CausalForestDML(
        model_y=model_y,
        model_t=model_t,
        discrete_treatment=True,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    est.fit(y, t, X=x)
    lo, hi = est.effect_interval(x, alpha=audit.alpha)
    return {
        "ate": float(est.ate(x)),
        "effect": est.effect(x),
        "effect_interval": (lo, hi),
    }
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Cross-Platform Fairness Evaluation (CPFE) for NLP, built on fairscope.core.
|
|
2
|
+
|
|
3
|
+
Public API: the five-axis ``CPFEProtocol``/``CPFEReport``, the multiclass metric and
|
|
4
|
+
significance primitives (axes 1-4), and the attribution-stability functions (axis 5).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .attribution import jaccard_topk, token_saliency
|
|
8
|
+
from .cross_platform import CPFEProtocol, CPFEReport
|
|
9
|
+
from .metrics import (
|
|
10
|
+
macro_auc,
|
|
11
|
+
macro_f1,
|
|
12
|
+
multiclass_ece,
|
|
13
|
+
per_class_disparate_impact,
|
|
14
|
+
per_class_equalized_odds,
|
|
15
|
+
)
|
|
16
|
+
from .significance import bootstrap_macro_auc_test
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"CPFEProtocol",
|
|
20
|
+
"CPFEReport",
|
|
21
|
+
"macro_auc",
|
|
22
|
+
"macro_f1",
|
|
23
|
+
"multiclass_ece",
|
|
24
|
+
"per_class_disparate_impact",
|
|
25
|
+
"per_class_equalized_odds",
|
|
26
|
+
"bootstrap_macro_auc_test",
|
|
27
|
+
"jaccard_topk",
|
|
28
|
+
"token_saliency",
|
|
29
|
+
]
|