fairscope 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fairscope/__init__.py +55 -0
- fairscope/core/__init__.py +48 -0
- fairscope/core/_utils.py +44 -0
- fairscope/core/bootstrap.py +66 -0
- fairscope/core/calibration.py +168 -0
- fairscope/core/correction.py +49 -0
- fairscope/core/delong.py +151 -0
- fairscope/core/metrics.py +96 -0
- fairscope/federated/__init__.py +9 -0
- fairscope/federated/audit.py +217 -0
- fairscope/healthcare/__init__.py +5 -0
- fairscope/healthcare/audit.py +227 -0
- fairscope/lending/__init__.py +5 -0
- fairscope/lending/audit.py +160 -0
- fairscope/nlp/__init__.py +29 -0
- fairscope/nlp/attribution.py +54 -0
- fairscope/nlp/cross_platform.py +178 -0
- fairscope/nlp/metrics.py +87 -0
- fairscope/nlp/significance.py +51 -0
- fairscope-0.3.0.dist-info/METADATA +202 -0
- fairscope-0.3.0.dist-info/RECORD +23 -0
- fairscope-0.3.0.dist-info/WHEEL +4 -0
- fairscope-0.3.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Axis 5 of CPFE: attribution stability via Jaccard overlap of top-K gradient-saliency
|
|
2
|
+
token sets across platforms (CPFE paper). The Jaccard computation is dependency-free; the
|
|
3
|
+
gradient-saliency extraction uses Captum and requires ``pip install fairscope[nlp]``.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def jaccard_topk(saliency_a, saliency_b, k):
    """Jaccard overlap of the top-k tokens by saliency: ``|topK(A) ∩ topK(B)| / |union|``.

    Parameters
    ----------
    saliency_a, saliency_b : mapping of token -> saliency score.
    k : number of highest-scoring tokens kept from each mapping. Must be non-negative;
        ``None`` keeps every token. Ties are broken by each mapping's iteration order
        (the sort is stable).

    Returns
    -------
    float
        The overlap in ``[0, 1]``; 0.0 if both top-k sets are empty (empty inputs or
        ``k == 0``).

    Raises
    ------
    ValueError
        If ``k`` is negative. A negative slice bound would silently drop the *lowest*
        ``|k|`` tokens instead of selecting a top-k set.

    Examples
    --------
    >>> jaccard_topk({"a": 0.9, "b": 0.8}, {"a": 0.7, "c": 0.6}, k=2)
    0.3333333333333333
    """
    if k is not None and k < 0:
        raise ValueError(f"k must be non-negative, got {k!r}")

    def _top_tokens(saliency):
        # Highest score first, then keep the first k tokens.
        return set(sorted(saliency, key=saliency.get, reverse=True)[:k])

    top_a = _top_tokens(saliency_a)
    top_b = _top_tokens(saliency_b)
    union = top_a | top_b
    if not union:
        return 0.0
    return len(top_a & top_b) / len(union)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def token_saliency(model, tokenizer, text, target=None):
    """Per-token gradient saliency ``s_i = ‖∂P(y|x)/∂E_i‖₂`` via Captum (optional).

    Thin gate in front of ``_captum_token_saliency``: it only checks that Captum can be
    imported and then forwards ``model``, ``tokenizer``, ``text`` and ``target`` unchanged.

    Requires ``pip install fairscope[nlp]``. Returns ``{token: saliency}``.

    Raises
    ------
    ImportError
        If ``captum`` cannot be imported; the original import error is chained as the cause.
    """
    try:
        import captum  # noqa: F401  (captum depends on torch; one import gates the extra)
    except ImportError as exc:
        message = "token_saliency requires the optional dependency: pip install fairscope[nlp]"
        raise ImportError(message) from exc
    else:
        return _captum_token_saliency(model, tokenizer, text, target)  # pragma: no cover
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _captum_token_saliency(model, tokenizer, text, target):  # pragma: no cover - needs nlp extra
    """Compute ``{token: saliency}`` for ``text`` with Captum's ``Saliency``.

    The text is tokenized, looked up in the model's input-embedding layer, and the
    gradient of the softmax probability of ``target`` with respect to each token
    embedding is reduced to its L2 norm.

    Parameters
    ----------
    model : object exposing ``get_input_embeddings()`` and callable with
        ``inputs_embeds=`` / ``attention_mask=``, returning an object with ``.logits``.
    tokenizer : callable as ``tokenizer(text, return_tensors="pt")`` and exposing
        ``convert_ids_to_tokens``.
    text : the input passed to the tokenizer.
    target : class index to explain; when ``None`` the model's argmax prediction is used.

    Returns
    -------
    dict
        Token string -> saliency. A token that occurs more than once keeps the largest
        of its scores.
    """
    from captum.attr import Saliency

    embedding_layer = model.get_input_embeddings()
    # Tokenizer output is created on the CPU; move it to wherever the embedding weights
    # live so a model placed on an accelerator does not fail on a device mismatch.
    device = next(embedding_layer.parameters()).device
    enc = tokenizer(text, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    # Detach from the embedding layer's graph so gradients are taken with respect to a
    # leaf tensor (the embeddings themselves), not the embedding weights.
    embeddings = embedding_layer(input_ids).detach()
    embeddings.requires_grad_(True)

    def forward(emb):
        return model(inputs_embeds=emb, attention_mask=attention_mask).logits.softmax(-1)

    tgt = target if target is not None else int(forward(embeddings).argmax())
    grads = Saliency(forward).attribute(embeddings, target=tgt, abs=False)
    # ``.cpu()`` is required before ``.numpy()`` when the tensor is not in host memory.
    scores = grads.norm(dim=-1).squeeze(0).detach().cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))

    agg = {}
    for tok, score in zip(tokens, scores):
        # Repeated tokens collapse onto one key; keep their maximum saliency.
        agg[tok] = max(agg.get(tok, 0.0), float(score))
    return agg
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""The five-axis Cross-Platform Fairness Evaluation (CPFE) protocol (Pall & Yadav).
|
|
2
|
+
|
|
3
|
+
Axes 1-4 run on precomputed per-platform outputs (no torch); attribution stability
|
|
4
|
+
(axis 5) is provided separately via ``fairscope.nlp.attribution`` behind the optional
|
|
5
|
+
``fairscope[nlp]`` extra.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
|
|
13
|
+
from ..core import bonferroni
|
|
14
|
+
from .metrics import (
|
|
15
|
+
macro_auc,
|
|
16
|
+
macro_f1,
|
|
17
|
+
multiclass_ece,
|
|
18
|
+
per_class_disparate_impact,
|
|
19
|
+
per_class_equalized_odds,
|
|
20
|
+
)
|
|
21
|
+
from .significance import bootstrap_macro_auc_test
|
|
22
|
+
|
|
23
|
+
# Reference bands EXPLICITLY STATED in the CPFE paper, cited below as "P4" (descriptive diagnostic, NOT
|
|
24
|
+
# regulatory standards). P4 declines to set a delta-AUC decision threshold (Section 6.6),
|
|
25
|
+
# so that one is a constructor argument with an illustrative default instead.
|
|
26
|
+
DI_FOUR_FIFTHS = 0.80 # P4 Sec 4.4: DI < 0.80 violates the four-fifths rule
|
|
27
|
+
DI_SEVERE = 0.50 # P4 Sec 4.4: DI < 0.50 is a severe disparity
|
|
28
|
+
ECE_WELL_CALIBRATED = 0.10 # P4 Suppl. Fig. S2: ECE < 0.10 well-calibrated
|
|
29
|
+
ECE_MODERATE = 0.20 # P4 Suppl. Fig. S2: ECE > 0.20 moderate miscalibration
|
|
30
|
+
JACCARD_INSTABILITY = 0.20 # P4 Suppl. Fig. S7: J < 0.20 attribution instability
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CPFEProtocol:
    """Run the CPFE five-axis evaluation over precomputed per-platform outputs.

    Parameters
    ----------
    platform_data : dict ``{name: {"y_true": array, "probs": (n, n_classes) array}}``.
    reference : the within-platform name (e.g. the training platform).
    n_classes : number of classes.
    n_bins : number of confidence bins passed to ``multiclass_ece``.
    alpha : significance level passed to the Bonferroni correction.
    n_boot : number of bootstrap replicates passed to ``bootstrap_macro_auc_test``.
    delta_auc_pct_max : ILLUSTRATIVE macro-AUC-drop screening limit (percent) used by
        ``CPFEReport.deployment_readiness``. NOT a published cutoff: P4 Section 6.6 declines
        to set one (observed drops were 28.6-39.5%); the default echoes that ">30%" magnitude
        and is labelled illustrative everywhere it surfaces.
    random_state : seed forwarded to every ``bootstrap_macro_auc_test`` call. The default
        ``0`` reproduces the previously hard-coded seed; pass ``None`` for a fresh,
        non-reproducible seed.

    Raises
    ------
    ValueError
        If ``reference`` is not a key of ``platform_data``.
    """

    def __init__(
        self,
        platform_data,
        reference,
        n_classes,
        *,
        n_bins=10,
        alpha=0.05,
        n_boot=2000,
        delta_auc_pct_max=30.0,
        random_state=0,
    ):
        if reference not in platform_data:
            raise ValueError(f"reference platform {reference!r} not in platform_data")
        self.platform_data = platform_data
        self.reference = reference
        self.n_classes = n_classes
        self.n_bins = n_bins
        self.alpha = alpha
        self.n_boot = n_boot
        self.delta_auc_pct_max = delta_auc_pct_max
        self.random_state = random_state

    def run(self) -> "CPFEReport":
        """Compute per-platform performance, significance and equity results.

        Returns a ``CPFEReport`` built from three dicts keyed by platform name:
        performance (every platform), and significance / equity (every platform except
        the reference, each compared against the reference).
        """
        ref = self.platform_data[self.reference]
        ref_auc = macro_auc(ref["y_true"], ref["probs"])
        others = [p for p in self.platform_data if p != self.reference]

        performance = {}
        for name, d in self.platform_data.items():
            # The reference AUC is already known; avoid computing it a second time.
            a = ref_auc if name == self.reference else macro_auc(d["y_true"], d["probs"])
            performance[name] = {
                "macro_auc": a,
                "macro_f1": macro_f1(d["y_true"], d["probs"]),
                "ece": multiclass_ece(d["y_true"], d["probs"], self.n_bins),
                # Signed relative change versus the reference (negative = degradation).
                "delta_auc_pct": 100.0 * (a - ref_auc) / ref_auc,
            }

        significance, equity, raw_p = {}, {}, []
        for name in others:
            d = self.platform_data[name]
            sig = bootstrap_macro_auc_test(
                ref["y_true"],
                ref["probs"],
                d["y_true"],
                d["probs"],
                n_boot=self.n_boot,
                random_state=self.random_state,
            )
            significance[name] = sig
            raw_p.append(sig["p_value"])
            equity[name] = {
                "disparate_impact": per_class_disparate_impact(
                    ref["probs"], d["probs"], self.n_classes
                ),
                "equalized_odds": per_class_equalized_odds(
                    ref["y_true"], ref["probs"], d["y_true"], d["probs"], self.n_classes
                ),
            }

        if raw_p:
            # One test per non-reference platform; correct across that family.
            adj = bonferroni(np.array(raw_p), alpha=self.alpha)
            for name, padj, rej in zip(others, adj["adjusted"], adj["reject"]):
                significance[name]["p_adjusted"] = float(padj)
                significance[name]["reject"] = bool(rej)

        return CPFEReport(performance, significance, equity, self.reference, self.delta_auc_pct_max)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class CPFEReport:
    """Container for CPFE results (performance, significance, equity) that renders a
    per-platform table and a deployment-readiness diagnostic."""

    def __init__(self, performance, significance, equity, reference, delta_auc_pct_max):
        self.performance = performance
        self.significance = significance
        self.equity = equity
        self.reference = reference
        self.delta_auc_pct_max = delta_auc_pct_max

    def to_dataframe(self) -> pd.DataFrame:
        """Return one row per platform: a ``platform`` column plus its performance metrics."""
        rows = []
        for platform, metrics in self.performance.items():
            rows.append({"platform": platform, **metrics})
        return pd.DataFrame(rows)

    def deployment_readiness(self):
        """Structured per-axis, per-platform screening DIAGNOSTIC -- NOT a deployment
        decision. Following the CPFE paper (Sections 6.5-6.6), cross-platform degradation
        is an informative diagnostic, not definitive evidence of bias.

        Thresholds: calibration uses P4's stated ECE bands (Suppl. Fig. S2); equity uses
        P4's four-fifths rule; discrimination uses an ILLUSTRATIVE ``delta_auc_pct_max``
        (P4 Section 6.6 declines to set a published cutoff). The reference platform is
        skipped. Returns
        ``{platform: {"ready": bool, "axes": {axis: {pass, value, threshold, source, reason}}}}``.
        """
        return {
            platform: self._screen_platform(platform, metrics)
            for platform, metrics in self.performance.items()
            if platform != self.reference
        }

    def _screen_platform(self, name, perf):
        """Build the ``{"ready", "axes"}`` entry for one non-reference platform."""
        # ``delta_auc_pct`` is signed (platform minus reference); flip it into a drop.
        drop = -perf["delta_auc_pct"]
        ece = perf["ece"]

        di = self.equity.get(name, {}).get("disparate_impact", {})
        # NOTE(review): a NaN DI compares False here and so is not counted as a
        # violation -- confirm what ``core.disparate_impact`` returns when undefined.
        violations = sorted([cls for cls, ratio in di.items() if ratio < DI_FOUR_FIFTHS])
        severe = sorted([cls for cls, ratio in di.items() if ratio < DI_SEVERE])

        equity_reason = "no four-fifths violations"
        if violations:
            equity_reason = (
                f"four-fifths violations (DI<{DI_FOUR_FIFTHS}) for classes "
                f"{violations}; severe (<{DI_SEVERE}) {severe}"
            )

        discrimination = {
            "pass": drop <= self.delta_auc_pct_max,
            "value": drop,
            "threshold": self.delta_auc_pct_max,
            "source": "illustrative (not a published cutoff; P4 Section 6.6)",
            "reason": f"macro-AUC drop {drop:.1f}% vs reference "
            f"(illustrative limit {self.delta_auc_pct_max:.0f}%)",
        }
        calibration = {
            "pass": ece < ECE_WELL_CALIBRATED,
            "value": ece,
            "threshold": ECE_WELL_CALIBRATED,
            "source": "P4 Suppl. Fig. S2",
            "reason": f"ECE {ece:.3f} (well-calibrated < {ECE_WELL_CALIBRATED}; "
            f"moderate miscalibration > {ECE_MODERATE})",
        }
        equity_axis = {
            "pass": not violations,
            "value": {"violations": violations, "severe": severe},
            "threshold": DI_FOUR_FIFTHS,
            "source": "P4 four-fifths rule (Sec 4.4)",
            "reason": equity_reason,
        }

        axes = {
            "discrimination": discrimination,
            "calibration": calibration,
            "equity": equity_axis,
        }
        # A platform is "ready" only when every axis passes.
        ready = all(axis["pass"] for axis in axes.values())
        return {"ready": ready, "axes": axes}
|
fairscope/nlp/metrics.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Multiclass metrics for the CPFE protocol (axes 1, 2, 4 primitives).
|
|
2
|
+
|
|
3
|
+
Pure functions that reuse ``fairscope.core`` where the definition is shared. The
|
|
4
|
+
confidence-accuracy ECE follows the formula in the CPFE paper (Pall & Yadav), and is
|
|
5
|
+
distinct from ``core.expected_calibration_error`` (binary prob-vs-frequency calibration).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from sklearn.metrics import f1_score, roc_auc_score
|
|
12
|
+
|
|
13
|
+
from ..core import disparate_impact, equalized_odds_difference
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _check_probs(y_true, probs):
|
|
17
|
+
y = np.asarray(y_true)
|
|
18
|
+
p = np.asarray(probs, dtype=float)
|
|
19
|
+
if p.ndim != 2:
|
|
20
|
+
raise ValueError("probs must be a 2-D array of shape (n_samples, n_classes)")
|
|
21
|
+
if len(y) != p.shape[0]:
|
|
22
|
+
raise ValueError("y_true and probs must have the same number of rows")
|
|
23
|
+
if np.any(np.isnan(p)):
|
|
24
|
+
raise ValueError("probs contain NaN")
|
|
25
|
+
return y, p
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def macro_auc(y_true, probs):
    """Macro one-vs-rest AUC. Requires every class present in ``y_true``.

    ``probs`` is an ``(n_samples, n_classes)`` array validated by ``_check_probs``.
    With two columns the problem is binary, and scikit-learn's binary path only accepts
    a 1-D score for the class with the greater label, so the second column is passed.
    For probabilities that sum to one, both one-vs-rest AUCs equal that value, so it is
    also the macro average.
    """
    y, p = _check_probs(y_true, probs)
    if p.shape[1] == 2:
        # Binary case: a 2-D score array is rejected by ``roc_auc_score``.
        return float(roc_auc_score(y, p[:, 1]))
    return float(roc_auc_score(y, p, multi_class="ovr", average="macro"))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def macro_f1(y_true, probs):
    """Macro-averaged F1 of the argmax predictions.

    Predictions are column indices of ``probs``, compared directly with ``y_true``.
    """
    # NOTE(review): presumably labels are the integers 0..n_classes-1 so that a column
    # index is a valid label -- confirm against callers.
    labels, matrix = _check_probs(y_true, probs)
    predicted = matrix.argmax(axis=1)
    score = f1_score(labels, predicted, average="macro")
    return float(score)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def multiclass_ece(y_true, probs, n_bins=10):
    """Confidence-accuracy Expected Calibration Error (Guo et al. 2017; CPFE paper):
    ``ECE = sum_m (|B_m|/n) * |acc(B_m) - conf(B_m)|`` with ``conf = max prob`` and
    ``acc = top-1 correct``.

    Confidences are assigned to ``n_bins`` equal-width bins on ``[0, 1]``; empty bins
    contribute nothing. Returns a Python ``float``.
    """
    labels, matrix = _check_probs(y_true, probs)
    confidence = matrix.max(axis=1)
    hit = (matrix.argmax(axis=1) == labels).astype(float)

    # Only the interior edges are needed: digitize then yields indices 0..n_bins-1.
    interior_edges = np.linspace(0.0, 1.0, n_bins + 1)[1:-1]
    bin_of = np.digitize(confidence, interior_edges)

    total = len(labels)
    ece = 0.0
    for b in range(n_bins):
        members = bin_of == b
        if not np.any(members):
            continue
        gap = abs(hit[members].mean() - confidence[members].mean())
        ece += (members.sum() / total) * gap
    return float(ece)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def per_class_disparate_impact(probs_a, probs_b, n_classes):
    """Symmetric DI per class between two platforms, reusing ``core.disparate_impact`` with
    the class binarized (``pred == c``) and platform as the two-group label.

    Returns ``{c: DI}`` for ``c`` in ``range(n_classes)``, where predictions are the
    argmax columns of ``probs_a`` / ``probs_b``.
    """
    pred_a = np.asarray(probs_a).argmax(axis=1)
    pred_b = np.asarray(probs_b).argmax(axis=1)
    # Platform membership: all of A's rows first, then all of B's.
    groups = np.array(["a"] * len(pred_a) + ["b"] * len(pred_b))

    def _binarized(c):
        return np.concatenate([(pred_a == c).astype(int), (pred_b == c).astype(int)])

    return {c: disparate_impact(_binarized(c), groups, "a", "b") for c in range(n_classes)}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def per_class_equalized_odds(y_a, probs_a, y_b, probs_b, n_classes):
    """EOD per class between two platforms (``|TPR_c(A) - TPR_c(B)|``), reusing
    ``core.equalized_odds_difference``. A class with no positive labels in a platform is
    returned as ``None`` (TPR undefined).

    Returns ``{c: EOD or None}`` for ``c`` in ``range(n_classes)``.
    """
    true_a, true_b = np.asarray(y_a), np.asarray(y_b)
    pred_a = np.asarray(probs_a).argmax(axis=1)
    pred_b = np.asarray(probs_b).argmax(axis=1)
    groups = np.array(["a"] * len(true_a) + ["b"] * len(true_b))

    def _one_vs_rest(values_a, values_b, c):
        # Binarize each platform separately, then stack A's rows before B's.
        return np.concatenate([(values_a == c).astype(int), (values_b == c).astype(int)])

    result = {}
    for c in range(n_classes):
        truth = _one_vs_rest(true_a, true_b, c)
        predicted = _one_vs_rest(pred_a, pred_b, c)
        try:
            value = equalized_odds_difference(truth, predicted, groups, "a", "b")
        except ValueError:
            value = None  # a platform has no examples of class c
        result[c] = value
    return result
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Axis 3 of CPFE: unpaired bootstrap comparison of macro one-vs-rest AUC across two
|
|
2
|
+
platforms (independent test sets), as in the CPFE paper (stratified bootstrap standard
|
|
3
|
+
errors, B = 2000 by default, combined for a normal-approximation z-test).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from scipy import stats
|
|
10
|
+
|
|
11
|
+
from .metrics import macro_auc
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _bootstrap_se(y, probs, n_boot, rng):
    """Stratified (by class) bootstrap standard error of the macro AUC.

    Each replicate resamples, with replacement, as many rows from every class as that
    class has, so class counts are preserved. Returns the sample standard deviation
    (``ddof=1``) of the ``n_boot`` replicate AUCs.
    """
    labels = np.asarray(y)
    matrix = np.asarray(probs, dtype=float)
    # Row positions of each class, in sorted label order.
    strata = [np.flatnonzero(labels == cls) for cls in np.unique(labels)]

    replicate_aucs = np.empty(n_boot)
    for rep in range(n_boot):
        draws = [rng.choice(rows, size=rows.size, replace=True) for rows in strata]
        picked = np.concatenate(draws)
        replicate_aucs[rep] = macro_auc(labels[picked], matrix[picked])

    spread = replicate_aucs.std(ddof=1)
    return float(spread)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def bootstrap_macro_auc_test(y_a, probs_a, y_b, probs_b, n_boot=2000, random_state=None):
    """Compare macro AUC across two platforms (independent test sets).

    Each platform's macro-AUC standard error is estimated by a class-stratified bootstrap;
    the errors are combined for an unpaired z-test.

    Parameters
    ----------
    y_a, probs_a : labels and ``(n, n_classes)`` probabilities for platform A.
    y_b, probs_b : labels and ``(n, n_classes)`` probabilities for platform B.
    n_boot : number of bootstrap replicates per platform (at least 2).
    random_state : seed passed to ``numpy.random.default_rng``.

    Returns
    -------
    dict
        ``auc_a``, ``auc_b``, ``delta`` (A minus B), ``se``, ``z``, ``p_value``
        (two-sided), ``n_boot``. When the combined standard error is exactly zero, a
        non-zero ``delta`` yields ``z = ±inf`` and ``p_value = 0.0``; a zero ``delta``
        yields ``z = 0.0`` and ``p_value = 1.0``.

    Raises
    ------
    ValueError
        If ``n_boot < 2``: a sample standard deviation needs at least two replicates.
    """
    if n_boot < 2:
        raise ValueError("n_boot must be at least 2 to estimate a bootstrap standard error")

    rng = np.random.default_rng(random_state)
    probs_a = np.asarray(probs_a, dtype=float)
    probs_b = np.asarray(probs_b, dtype=float)
    auc_a = macro_auc(y_a, probs_a)
    auc_b = macro_auc(y_b, probs_b)
    # Platform A is resampled first, then B, from the same generator.
    se_a = _bootstrap_se(y_a, probs_a, n_boot, rng)
    se_b = _bootstrap_se(y_b, probs_b, n_boot, rng)
    delta = auc_a - auc_b
    se = float(np.sqrt(se_a**2 + se_b**2))

    if se > 0:
        z = delta / se
    elif se == 0 and delta != 0:
        # No sampling variability but the AUCs differ: the statistic is unbounded.
        # Reporting z = 0 here would give p = 1 and hide the difference.
        z = float(np.copysign(np.inf, delta))
    else:
        z = 0.0

    return {
        "auc_a": auc_a,
        "auc_b": auc_b,
        "delta": delta,
        "se": se,
        "z": float(z),
        "p_value": float(2.0 * stats.norm.sf(abs(z))),
        "n_boot": n_boot,
    }
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fairscope
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Subgroup-stratified, calibration-aware fairness auditing for ML models, grounded in peer-reviewed methods.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Rajveer-code/fairscope
|
|
6
|
+
Project-URL: Repository, https://github.com/Rajveer-code/fairscope
|
|
7
|
+
Project-URL: Issues, https://github.com/Rajveer-code/fairscope/issues
|
|
8
|
+
Author: Rajveer Singh Pall
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: auc,calibration,delong,fairness,machine-learning,model-auditing,subgroup-analysis
|
|
12
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Requires-Dist: matplotlib>=3.6
|
|
22
|
+
Requires-Dist: numpy>=1.23
|
|
23
|
+
Requires-Dist: pandas>=1.5
|
|
24
|
+
Requires-Dist: scikit-learn>=1.1
|
|
25
|
+
Requires-Dist: scipy>=1.9
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: captum>=0.6; extra == 'all'
|
|
28
|
+
Requires-Dist: econml>=0.15; extra == 'all'
|
|
29
|
+
Requires-Dist: shap>=0.42; extra == 'all'
|
|
30
|
+
Requires-Dist: torch>=2.0; extra == 'all'
|
|
31
|
+
Requires-Dist: transformers>=4.30; extra == 'all'
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: black>=24.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: nbmake>=1.5; extra == 'dev'
|
|
35
|
+
Requires-Dist: pre-commit>=3.5; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest>=7.4; extra == 'dev'
|
|
38
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
39
|
+
Requires-Dist: statsmodels>=0.14; extra == 'dev'
|
|
40
|
+
Provides-Extra: docs
|
|
41
|
+
Requires-Dist: mkdocs-material>=9.4; extra == 'docs'
|
|
42
|
+
Requires-Dist: mkdocs>=1.5; extra == 'docs'
|
|
43
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
|
|
44
|
+
Provides-Extra: lending
|
|
45
|
+
Requires-Dist: econml>=0.15; extra == 'lending'
|
|
46
|
+
Provides-Extra: nlp
|
|
47
|
+
Requires-Dist: captum>=0.6; extra == 'nlp'
|
|
48
|
+
Requires-Dist: torch>=2.0; extra == 'nlp'
|
|
49
|
+
Requires-Dist: transformers>=4.30; extra == 'nlp'
|
|
50
|
+
Provides-Extra: shap
|
|
51
|
+
Requires-Dist: shap>=0.42; extra == 'shap'
|
|
52
|
+
Description-Content-Type: text/markdown
|
|
53
|
+
|
|
54
|
+
# fairscope
|
|
55
|
+
|
|
56
|
+
[CI status](https://github.com/Rajveer-code/fairscope/actions/workflows/ci.yml)
|
|
57
|
+
[Python 3.9–3.12](https://www.python.org/)
|
|
58
|
+
[License: MIT](LICENSE)
|
|
59
|
+
[Documentation](https://rajveer-code.github.io/fairscope/)
|
|
60
|
+
|
|
61
|
+
**Subgroup-stratified, calibration-aware fairness auditing for machine-learning models — grounded in peer-reviewed methods.**
|
|
62
|
+
|
|
63
|
+
📖 **Documentation:** <https://rajveer-code.github.io/fairscope/>
|
|
64
|
+
|
|
65
|
+
`fairscope` packages statistical machinery that mainstream fairness toolkits do not expose as
|
|
66
|
+
first-class, subgroup-stratified functions, and adds one novel protocol on top:
|
|
67
|
+
|
|
68
|
+
- **DeLong confidence intervals** for per-subgroup AUC (fast midrank algorithm).
|
|
69
|
+
- **Per-subgroup Expected/Maximum Calibration Error** with reliability diagrams.
|
|
70
|
+
- **Significance testing** of subgroup performance gaps (paired/unpaired DeLong, stratified
|
|
71
|
+
bootstrap) with **Bonferroni / Benjamini–Hochberg** correction.
|
|
72
|
+
- A subgroup-stratified **interface to standard recalibration** — temperature scaling
|
|
73
|
+
(Guo et al. 2017) and isotonic regression (Zadrozny & Elkan 2002), with pre/post-ECE.
|
|
74
|
+
- A novel five-axis **Cross-Platform Fairness Evaluation (CPFE)** protocol.
|
|
75
|
+
- One-call **domain audits**: `healthcare`, `lending`, `federated`.
|
|
76
|
+
|
|
77
|
+
Only the CPFE protocol is presented as novel. Every other function ports a documented method
|
|
78
|
+
and cites its source; the recalibration methods are standard, and the contribution there is the
|
|
79
|
+
per-subgroup interface and pre/post-ECE reporting.
|
|
80
|
+
|
|
81
|
+
> **Status — v0.3.0.** All five modules (`core`, `healthcare`, `nlp`/CPFE, `federated`,
|
|
82
|
+
> `lending`) are implemented, tested, and released. 100% line coverage on the statistical core;
|
|
83
|
+
> CI green across Python 3.9–3.12. See [`docs/DESIGN.md`](docs/DESIGN.md) for methods and design.
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install fairscope
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Releases are uploaded to PyPI by the maintainer; if a version isn't available there yet,
|
|
92
|
+
install from source or from the [release assets](https://github.com/Rajveer-code/fairscope/releases):
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
git clone https://github.com/Rajveer-code/fairscope
|
|
96
|
+
cd fairscope
|
|
97
|
+
pip install -e ".[dev]"
|
|
98
|
+
pytest
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
The base install is light (NumPy, SciPy, scikit-learn, pandas, matplotlib). Optional extras:
|
|
102
|
+
`fairscope[nlp]` (torch, transformers, captum), `fairscope[lending]` (econml),
|
|
103
|
+
`fairscope[shap]`, `fairscope[docs]`.
|
|
104
|
+
|
|
105
|
+
## Quickstart
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from fairscope.healthcare import HealthcareFairnessAudit
|
|
109
|
+
|
|
110
|
+
# y_true : binary outcomes
|
|
111
|
+
# y_score: the model's positive-class probabilities
|
|
112
|
+
# age_group: a protected attribute, aligned row-for-row
|
|
113
|
+
report = HealthcareFairnessAudit.from_scores(
|
|
114
|
+
y_true, y_score, {"age_group": age_group}
|
|
115
|
+
).run()
|
|
116
|
+
|
|
117
|
+
print(report.summary()) # per-subgroup AUC (DeLong CI), ECE, Brier, F1; flags the largest gap
|
|
118
|
+
report.to_dataframe() # tidy per-subgroup table
|
|
119
|
+
report.plot_auc_forest() # forest plot of per-subgroup AUC with DeLong intervals
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
Every domain is also reachable through one dispatcher,
|
|
123
|
+
`FairnessAudit(model, domain=...)`, with `domain` in `{"healthcare", "nlp", "federated",
|
|
124
|
+
"lending"}`. A runnable end-to-end example on a committed synthetic fixture is in the
|
|
125
|
+
[getting-started guide](https://rajveer-code.github.io/fairscope/getting-started/) and in
|
|
126
|
+
[`notebooks/`](notebooks/).
|
|
127
|
+
|
|
128
|
+
## Modules
|
|
129
|
+
|
|
130
|
+
| Module | Purpose | Status |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| `core/` | DeLong CI, bootstrap-AUC test, ECE/MCE + reliability, multiple-testing correction, subgroup metrics | ✅ shipped |
|
|
133
|
+
| `healthcare/` | one-call clinical fairness audit + report (tables, forest & reliability plots, PDF, optional SHAP) | ✅ shipped |
|
|
134
|
+
| `nlp/` | CPFE five-axis cross-platform protocol (centerpiece) + Captum attribution stability | ✅ shipped |
|
|
135
|
+
| `federated/` | per-node DeLong + cross-node disparity + per-node recalibration | ✅ shipped |
|
|
136
|
+
| `lending/` | annual approval-gap + subgroup CATE (Causal Forest DML) | ✅ shipped |
|
|
137
|
+
|
|
138
|
+
Plotting (forest plots, reliability diagrams) currently lives in the domain reports.
|
|
139
|
+
`lending`'s CATE estimation needs the optional `fairscope[lending]` extra (`econml`). The
|
|
140
|
+
`federated` module audits per-node predictions only — it performs no training and provides no
|
|
141
|
+
privacy guarantee.
|
|
142
|
+
|
|
143
|
+
## How it differs from AIF360 / Fairlearn
|
|
144
|
+
|
|
145
|
+
`fairscope` is complementary to AIF360 and Fairlearn, not a replacement: those toolkits do bias
|
|
146
|
+
*mitigation*; `fairscope` does uncertainty-aware *measurement*. The table below was verified by
|
|
147
|
+
inspecting the installed public APIs of **AIF360 0.6.1** and **Fairlearn 0.14.0** (checked
|
|
148
|
+
2026-06; re-confirm if versions change).
|
|
149
|
+
|
|
150
|
+
| Capability | AIF360 | Fairlearn | fairscope |
|
|
151
|
+
|---|:---:|:---:|:---:|
|
|
152
|
+
| Per-subgroup AUC confidence interval (DeLong) | no | no\* | yes |
|
|
153
|
+
| Per-subgroup Expected Calibration Error | no | no | yes |
|
|
154
|
+
| Subgroup significance test + multiple-comparison correction | no | no | yes |
|
|
155
|
+
| Subgroup-stratified recalibration (temperature / isotonic) | partial† | no | yes |
|
|
156
|
+
| Cross-platform five-axis protocol (CPFE) | no | no | yes (novel) |
|
|
157
|
+
| Per-node / federated audit | no | no | yes |
|
|
158
|
+
| Bias-mitigation algorithms | yes | yes | out of scope |
|
|
159
|
+
|
|
160
|
+
\* Fairlearn's `MetricFrame` computes per-subgroup AUC *point estimates* (e.g.
|
|
161
|
+
`roc_auc_score_group_min`), but provides no analytic (DeLong) confidence interval.
|
|
162
|
+
† AIF360 ships `CalibratedEqOddsPostprocessing` (calibration-aware equalized-odds
|
|
163
|
+
postprocessing), not a general per-subgroup temperature/isotonic recalibration interface.
|
|
164
|
+
|
|
165
|
+
**Closest related work — `meval`** (Sutariya & Petersen, 2025,
|
|
166
|
+
[arXiv:2512.17409](https://arxiv.org/abs/2512.17409)): a statistical toolbox for stratified,
|
|
167
|
+
fine-grained model-performance analysis that *also* provides subgroup metric uncertainty and
|
|
168
|
+
multiple-comparison corrections (with a medical-imaging focus). `fairscope` overlaps with it on
|
|
169
|
+
uncertainty + significance; what `fairscope` adds is the specific DeLong AUC intervals, the
|
|
170
|
+
per-subgroup calibration **and recalibration** interface, the five-axis cross-platform CPFE
|
|
171
|
+
protocol, and one-call domain audits (healthcare / lending / federated).
|
|
172
|
+
|
|
173
|
+
## Engineering
|
|
174
|
+
|
|
175
|
+
- **Test-driven**, with regression tests anchored to authoritative reference values where they
|
|
176
|
+
exist (DeLong's worked example; `statsmodels` multiple-testing routines).
|
|
177
|
+
- **100% line coverage** on the statistical core; CI runs pytest + coverage, ruff, and black
|
|
178
|
+
across Python 3.9–3.12, and executes the replication notebooks via `nbmake`.
|
|
179
|
+
- Full type hints, NumPy-style docstrings with runnable examples, and explicit input validation
|
|
180
|
+
(an AUC on a single-class subgroup raises rather than returning a meaningless value).
|
|
181
|
+
- Committed fixtures are **small, synthetic, and labelled as such**; no datasets or trained
|
|
182
|
+
models are bundled.
|
|
183
|
+
|
|
184
|
+
## Grounded in published research
|
|
185
|
+
|
|
186
|
+
`fairscope` ports methods from the author's peer-reviewed and under-review papers; it invents no
|
|
187
|
+
new mathematics. Each function cites its source. Full venues and identifiers are in
|
|
188
|
+
[`CITATION.cff`](CITATION.cff).
|
|
189
|
+
|
|
190
|
+
- Diabetes risk prediction with external validation + fairness analysis (XGBoost, NHANES→BRFSS) — IEEE CIPHER, 2026.
|
|
191
|
+
- A five-axis Cross-Platform Fairness Evaluation for mental-health NLP — under review.
|
|
192
|
+
- Privacy-preserving federated learning for diabetes risk across heterogeneous nodes — under review.
|
|
193
|
+
- Heterogeneous racial effects in mortgage approval (Causal Forest Double Machine Learning, HMDA) — under review.
|
|
194
|
+
- Racial disparities in mortgage lending (RDD / DiD / decomposition, HMDA) — under review.
|
|
195
|
+
|
|
196
|
+
## Citation
|
|
197
|
+
|
|
198
|
+
If you use `fairscope`, please cite it via [`CITATION.cff`](CITATION.cff).
|
|
199
|
+
|
|
200
|
+
## License
|
|
201
|
+
|
|
202
|
+
[MIT](LICENSE) © 2026 Rajveer Singh Pall · ORCID [0009-0001-6762-6134](https://orcid.org/0009-0001-6762-6134)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
fairscope/__init__.py,sha256=aZb7kP8vyFmMZqXe7MdhXI5TuszHSXJNK01-m9KFJSI,1979
|
|
2
|
+
fairscope/core/__init__.py,sha256=BqMc2kIC0Kjm2qvm7lqKJ7cLKNLDcdXeEQ53hN0t9Dw,1420
|
|
3
|
+
fairscope/core/_utils.py,sha256=K7NaqaWXfoZRtIc0xvDH5q9zRVTSVi1Q1SWq4yLAbTE,1368
|
|
4
|
+
fairscope/core/bootstrap.py,sha256=tETKVrGRNlkktGza0ITnVLh6vJIBDAh95OqwZ5fK2eE,2244
|
|
5
|
+
fairscope/core/calibration.py,sha256=FyFyVjGZ0EzjKDvcO_B9Y3aSXGwA9bG6nHn4s-Touw4,6188
|
|
6
|
+
fairscope/core/correction.py,sha256=3t3lIfThiP9nTIRiFgPJh2tNS3AAHMxMVMvcrPCHxGo,1443
|
|
7
|
+
fairscope/core/delong.py,sha256=9ZNh_98Zre4oHYYQaUxT5ydIrrg-2SIE0wJkNH81Ed0,5044
|
|
8
|
+
fairscope/core/metrics.py,sha256=NOIlTpzsTuVYbCPPOsfG2NdmtNn1DAaO9ZyqVlNKS_4,3655
|
|
9
|
+
fairscope/federated/__init__.py,sha256=3ieUZDgobc2OKjBPvnKittDuxRc2ZSDwL7SR1hSkFxM,336
|
|
10
|
+
fairscope/federated/audit.py,sha256=_MYnn_I9dBvVqZMHjxaJhHMktNWUS69XK0pVozeeCd0,8235
|
|
11
|
+
fairscope/healthcare/__init__.py,sha256=rmFIfUt_Qao0kZ0b2i7pNSCSD0RKIkxT3n1oN3CnwKc,179
|
|
12
|
+
fairscope/healthcare/audit.py,sha256=q84h7mpfgwwfczbuOWpBwFTZzJsq7tXWSsetqtCw5dc,9180
|
|
13
|
+
fairscope/lending/__init__.py,sha256=nS1UJjpO12mDduDrz5pwPsSDeFZMx_GLkHM2wBwL7Uc,195
|
|
14
|
+
fairscope/lending/audit.py,sha256=QEHHB3tdSG2ULVAVFOuKzFlMX0v-1MWuWyzJQBtLDGc,5989
|
|
15
|
+
fairscope/nlp/__init__.py,sha256=OmFoKumKjdCOjsvKOaSgCUyWuF-JEILmA0tC3Bj9i50,798
|
|
16
|
+
fairscope/nlp/attribution.py,sha256=7cGLZ2b5lbezvFr14B81uVpHf_2Nno_4YUpqSk6Ky6I,2292
|
|
17
|
+
fairscope/nlp/cross_platform.py,sha256=SaBgu69GLhKbBaZCq9S8tvXZphCORpbrvTvAxz0Xdcc,7507
|
|
18
|
+
fairscope/nlp/metrics.py,sha256=E5BJgdp6ooQbSsbxRgqLCyeHK0BOQsSw2GlkifLH794,3405
|
|
19
|
+
fairscope/nlp/significance.py,sha256=TXf2LgyLMvtan3xaOCbA6j27aHH6ejxeAGIbJ6Pd6oE,1859
|
|
20
|
+
fairscope-0.3.0.dist-info/METADATA,sha256=MYgXmLdl_jYdYbaQZKrroRQ3CwfHDusld4HIBIBH8-Q,10074
|
|
21
|
+
fairscope-0.3.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
22
|
+
fairscope-0.3.0.dist-info/licenses/LICENSE,sha256=33pRQQTdA8whKltAeAn7UzoxgIS-y3hQWQ-ZcxB0Pss,1075
|
|
23
|
+
fairscope-0.3.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Rajveer Singh Pall
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|