regression-substrate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
1
+ """
2
+ regression_substrate — a statistically rigorous gate for deciding whether a new
3
+ prompt/model version is better, worse, or inconclusive versus the current one.
4
+
5
+ Public API:
6
+
7
+ from regression_substrate import gate, Judge, load_from_jsonl
8
+
9
+ Offline gate (Day-1): gate, GateDecision, variance_components
10
+ Ingestion: load_from_jsonl, load_from_csv, assemble_records,
11
+ validate_records, Judge, auto_cluster
12
+ Streaming (Day-2): SequentialGate, Backend
13
+ Drift (Day-2): RollingGoldSet, drift_report, sample_for_labeling
14
+ Vendor adapters: flatten_runs, load_from_langsmith
15
+ """
16
+
17
+ from .diff_engine import gate, GateDecision, variance_components
18
+ from .ingest import (
19
+ load_from_jsonl, load_from_csv, assemble_records, validate_records,
20
+ Judge, cohen_kappa, auto_cluster, tfidf_embedder, sentence_transformer_embedder,
21
+ )
22
+ from .sequential_gate import SequentialGate, Backend, MartingaleState
23
+ from .gold import RollingGoldSet, drift_report, sample_for_labeling
24
+ from .adapters import flatten_runs, load_from_langsmith, TraceMap, LANGSMITH
25
+
26
+ __version__ = "0.1.0"
27
+
28
+ __all__ = [
29
+ "gate", "GateDecision", "variance_components",
30
+ "load_from_jsonl", "load_from_csv", "assemble_records", "validate_records",
31
+ "Judge", "cohen_kappa", "auto_cluster", "tfidf_embedder", "sentence_transformer_embedder",
32
+ "SequentialGate", "Backend", "MartingaleState",
33
+ "RollingGoldSet", "drift_report", "sample_for_labeling",
34
+ "flatten_runs", "load_from_langsmith", "TraceMap", "LANGSMITH",
35
+ "__version__",
36
+ ]
@@ -0,0 +1,185 @@
1
+ """
2
+ adapters.py — pull from vendor observability platforms into the 7-field schema.
3
+
4
+ HONESTY NOTE — READ THIS:
5
+ * The FLATTENING logic below is tested (see __main__) against a synthetic
6
+ fixture shaped like a vendor's documented run/feedback model. That transform
7
+ is proven.
8
+ * The LIVE FETCH (`load_from_langsmith`) is SDK- and auth-dependent and is NOT
9
+ exercised here. Vendor schemas drift, so the field paths in the presets are
10
+ best-effort and MUST be verified against the platform's current docs before
11
+ you trust them in production.
12
+
13
+ Design: don't hardwire any one vendor. A `TraceMap` says where each field lives
14
+ inside one run record; `flatten_runs` collapses (possibly nested, multi-step)
15
+ runs + feedback into flat 7-field records that ingest.assemble_records consumes.
16
+ A vendor is then just a TraceMap preset plus a thin fetch wrapper.
17
+ """
18
+
19
+ from __future__ import annotations
20
+ from dataclasses import dataclass
21
+ import json
22
+
23
+
24
+ def _dig(obj, path: str, default=None):
25
+ """Read a nested field by dotted path, e.g. 'extra.metadata.version'."""
26
+ cur = obj
27
+ for key in path.split("."):
28
+ if isinstance(cur, dict) and key in cur:
29
+ cur = cur[key]
30
+ else:
31
+ return default
32
+ return cur
33
+
34
+
35
+ def _canon(x) -> str:
36
+ """Canonicalize an input/output payload to a stable string. The `input`
37
+ string is what groups replicates and pairs versions, so it must be stable."""
38
+ if isinstance(x, str):
39
+ return x
40
+ if isinstance(x, dict):
41
+ for k in ("input", "question", "query", "text", "prompt",
42
+ "output", "answer", "result", "response"):
43
+ if isinstance(x.get(k), str):
44
+ return x[k]
45
+ return json.dumps(x, sort_keys=True)
46
+ return str(x)
47
+
48
+
49
@dataclass
class TraceMap:
    """Locations of the schema fields inside a single vendor run record.

    Every ``*_path`` is a dotted path resolved with `_dig`; `score_key` names
    the feedback entry whose score is used.
    """
    # --- required ---------------------------------------------------------
    input_path: str
    output_path: str
    # The team must have logged a version tag; runs without one cannot be paired.
    version_path: str
    # Feedback key that carries the quality score.
    score_key: str
    # --- optional, with defaults -----------------------------------------
    run_type_path: str = "run_type"
    parent_path: str = "parent_run_id"
    id_path: str = "id"
    cluster_path: str | None = None
    # (low, high) bounds of the raw score; flatten_runs rescales to [0, 1].
    score_scale: tuple = (0.0, 1.0)
61
+
62
+
63
# LangSmith preset (best effort). The field paths are NOT guaranteed to match
# the current LangSmith schema -- check them against the vendor docs first.
LANGSMITH = TraceMap(
    # payloads
    input_path="inputs",
    output_path="outputs",
    # metadata the team is expected to have logged on each run
    version_path="extra.metadata.version",
    cluster_path="extra.metadata.cluster",
    # run structure
    id_path="id",
    parent_path="parent_run_id",
    run_type_path="run_type",
    # feedback
    score_key="quality",
    score_scale=(0.0, 1.0),
)
75
+
76
+
77
def flatten_runs(runs: list[dict], feedback: list[dict], tmap: TraceMap,
                 unit: str = "root") -> list[dict]:
    """Join runs with their feedback and emit one flat record per kept run.

    unit="root"  -> keep only runs without a parent (whole trajectories);
                    nested child runs are dropped.
    unit=<type>  -> keep only runs whose run type equals `unit`
                    (e.g. "retriever", "llm").

    A kept run is skipped (and counted) when it has no feedback score under
    `tmap.score_key` or no version at `tmap.version_path`; a summary of the
    skips is printed at the end. Scores are rescaled from `tmap.score_scale`
    to [0, 1] and clamped. Each record has "input", "version", "response",
    "score", plus "cluster" when `tmap.cluster_path` resolves to a value.
    """
    # run id -> {feedback key -> score}; a later entry for the same key wins.
    scores_by_run: dict[str, dict] = {}
    for fb in feedback:
        score_value = fb.get("score")
        scores_by_run.setdefault(fb["run_id"], {})[fb["key"]] = score_value

    scale_lo, scale_hi = tmap.score_scale
    missing_version = 0
    missing_score = 0
    flat = []

    for run in runs:
        if unit == "root":
            wanted = _dig(run, tmap.parent_path) is None
        else:
            wanted = _dig(run, tmap.run_type_path) == unit
        if not wanted:
            continue

        run_scores = scores_by_run.get(_dig(run, tmap.id_path)) or {}
        raw_score = run_scores.get(tmap.score_key)
        if raw_score is None:
            missing_score += 1
            continue

        version = _dig(run, tmap.version_path)
        if version is None:
            missing_version += 1
            continue

        value = float(raw_score)
        if scale_hi != scale_lo:  # a degenerate scale leaves the raw value as is
            value = (value - scale_lo) / (scale_hi - scale_lo)

        record = {
            "input": _canon(_dig(run, tmap.input_path)),
            "version": str(version),
            "response": _canon(_dig(run, tmap.output_path)),
            "score": max(0.0, min(1.0, value)),
        }
        if tmap.cluster_path:
            cluster = _dig(run, tmap.cluster_path)
            if cluster is not None:
                record["cluster"] = cluster
        flat.append(record)

    if missing_version:
        print(f" [adapter] WARNING: skipped {missing_version} runs with no "
              f"version tag at '{tmap.version_path}' -- they cannot be paired.")
    if missing_score:
        print(f" [adapter] note: skipped {missing_score} runs with no "
              f"'{tmap.score_key}' feedback.")
    return flat
129
+
130
+
131
def load_from_langsmith(project: str, version_a: str, version_b: str,
                        tmap: TraceMap = LANGSMITH, unit: str = "root"):
    """LIVE fetch + flatten + assemble. NOT exercised in this repo.

    Requires the `langsmith` SDK and LANGSMITH_API_KEY. The SDK call
    signatures and the schema used below must be verified against the current
    LangSmith docs before this is trusted.

    Parameters
    ----------
    project : name of the LangSmith project whose runs are listed.
    version_a, version_b : version labels handed to `assemble_records`.
    tmap : field locations inside a run record (defaults to `LANGSMITH`).
    unit : forwarded to `flatten_runs` ("root" or a run type).

    Returns
    -------
    Whatever `ingest.assemble_records(records, version_a, version_b)` returns.

    Raises
    ------
    ImportError if the `langsmith` package is not installed.
    """
    from langsmith import Client  # raises if not installed
    from .ingest import assemble_records

    def _to_dict(model):
        # Pydantic v2 models deprecate .dict() in favour of .model_dump(),
        # while v1-style models only have .dict(); use whichever exists.
        dump = getattr(model, "model_dump", None)
        return dump() if callable(dump) else model.dict()

    client = Client()
    runs = [_to_dict(r) for r in client.list_runs(project_name=project)]
    run_ids = [r["id"] for r in runs]
    # With no runs there is nothing to join feedback onto, so skip the request.
    # NOTE(review): an empty `run_ids` filter may be treated as "no filter" by
    # the API -- confirm against the SDK docs.
    if run_ids:
        feedback = [_to_dict(f) for f in client.list_feedback(run_ids=run_ids)]
    else:
        feedback = []
    records = flatten_runs(runs, feedback, tmap, unit=unit)
    return assemble_records(records, version_a, version_b)
145
+
146
+
147
+ # --------------------------------------------------------------------------- #
148
+ # Demo: flatten a SYNTHETIC LangSmith-shaped fixture -> gate(). Proves the
149
+ # transform (nested trajectory collapse, version extraction, score scaling,
150
+ # replicate derivation) without touching the live API.
151
+ # --------------------------------------------------------------------------- #
152
+
153
if __name__ == "__main__":
    # The imports below are package-relative, so this demo only resolves them
    # when the module is executed as part of its package.
    import numpy as np
    from .ingest import assemble_records, validate_records
    from .diff_engine import gate

    rng = np.random.default_rng(0)
    # Three questions per cluster, billing first, then general.
    questions = [(topic, f"{topic} question {i}")
                 for topic in ("billing", "general")
                 for i in range(3)]

    runs, feedback = [], []
    for cluster, question in questions:
        # v2 collapses on billing but stays close to v1 on general questions.
        v2_base = 0.20 if cluster == "billing" else 0.78
        for ver, base in (("v1", 0.90), ("v2", v2_base)):
            for _rep in range(2):  # two replicates per (input, version)
                # Ids are sequential across all runs, so the next id is simply
                # the number of runs created so far.
                root = f"run-{len(runs)}"
                runs.append({
                    "id": root, "run_type": "chain", "parent_run_id": None,
                    "inputs": {"question": question},
                    "outputs": {"answer": "..."},
                    "extra": {"metadata": {"version": ver, "cluster": cluster}},
                })
                child = f"run-{len(runs)}"  # a nested LLM step (must be dropped)
                runs.append({
                    "id": child, "run_type": "llm", "parent_run_id": root,
                    "inputs": {}, "outputs": {},
                    "extra": {"metadata": {"version": ver}},
                })
                noisy = base + rng.normal(0, 0.03)
                score = float(np.clip(noisy, 0, 1))
                feedback.append({"run_id": root, "key": "quality",
                                 "score": round(score, 3)})

    records = flatten_runs(runs, feedback, LANGSMITH, unit="root")
    print(f"FLATTEN: {len(runs)} raw runs -> {len(records)} flat records "
          f"(child runs dropped under unit='root')")
    print(" sample:", records[0])
    print(" validation problems:", validate_records(records) or "none")

    sa, sb, cids, meta = assemble_records(records, "v1", "v2")
    print(" assembled:", meta)
    dec = gate(sa, sb, cids, judge_error_sd=0.05, kappa=0.78, alpha=0.05)
    rounded_ci = tuple(round(x, 3) for x in dec.delta_ci)
    print(" verdict:", dec.verdict, "| CI:", rounded_ci)
@@ -0,0 +1,108 @@
1
+ """
2
+ regression_substrate.cli — the `regsub` command. Reads scored eval data, calibrates
3
+ a judge against a gold set, runs the offline gate, and writes reports. Designed to
4
+ drop into a CI step:
5
+
6
+ regsub --data evals.csv --gold gold.jsonl --version-a v1 --version-b v2 --out out/
7
+ # exit code 0 = SHIP / SHIP_WITH_FLAGS ; 1 = REGRESSION / HOLD ; 2 = JUDGE_INADMISSIBLE
8
+
9
+ By default the rubric judge below is used so the command runs with no API key. In
10
+ production, import the package and pass your own judge:
11
+
12
+ from regression_substrate import Judge, load_from_csv, gate
13
+ """
14
+
15
+ from __future__ import annotations
16
+ import argparse
17
+ import csv
18
+ import json
19
+ import os
20
+ import sys
21
+ from collections import defaultdict
22
+ import numpy as np
23
+
24
+ from .ingest import Judge, validate_records, assemble_records, auto_cluster
25
+ from .diff_engine import gate, variance_components
26
+
27
# Offline stand-in judge so the CLI works without an API key. Swap it for a
# real judge in production.
_ACTION = {"refund", "reset", "update", "settings", "24/7", "link"}
_COURTESY = {"sorry", "happy", "please", "help", "glad", "sure"}


def default_judge(question: str, answer: str) -> float:
    """Score `answer` with a fixed keyword rubric (the question is not used).

    Weighted sum, rounded to 3 decimals:
      0.55 * specificity  -- 1.0 if any _ACTION term occurs in the lowercased
                             answer (substring match), else 0.0
      0.35 * completeness -- whitespace-separated word count / 12, capped at 1.0
      0.10 * courtesy     -- 1.0 if any _COURTESY term occurs, else 0.7
    """
    lowered = answer.lower()
    specificity = 1.0 if any(term in lowered for term in _ACTION) else 0.0
    word_count = len(answer.split())
    completeness = min(word_count / 12.0, 1.0)
    courtesy = 0.7
    if any(term in lowered for term in _COURTESY):
        courtesy = 1.0
    total = 0.55 * specificity + 0.35 * completeness + 0.10 * courtesy
    return round(total, 3)
37
+
38
+
39
+ _EXIT = {"SHIP": 0, "SHIP_WITH_FLAGS": 0, "REGRESSION": 1, "HOLD": 1, "JUDGE_INADMISSIBLE": 2}
40
+
41
+
42
def run(data, gold, version_a, version_b, out_dir, alpha, min_n, score_fn=default_judge):
    """Score a CSV of responses, calibrate the judge, run the gate, write reports.

    Parameters
    ----------
    data : path of the CSV with the responses; each row is scored with
        `score_fn(input, response)`.
    gold : path of the JSONL gold set passed to `Judge.calibrate`.
    version_a, version_b : baseline and candidate version labels.
    out_dir : directory (created if needed) that receives `records.jsonl`,
        `calibration.json` and `gate_report.json`.
    alpha, min_n : forwarded to `gate`.
    score_fn : judge callable `(question, answer) -> float`.

    Returns
    -------
    The verdict string of the gate decision. The report is also printed.

    Exits via `sys.exit` with a message when `validate_records` reports
    problems.
    """
    os.makedirs(out_dir, exist_ok=True)

    records = []
    with open(data, newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills the missing cells of a short row with None, so a
            # plain .get(key, "") would still hand None to the judge.
            row["score"] = score_fn(row.get("input") or "", row.get("response") or "")
            replicate = str(row.get("replicate", "")).strip()
            if replicate.isdigit():
                row["replicate"] = int(replicate)
            else:
                # Unusable replicate value: drop the field entirely.
                row.pop("replicate", None)
            records.append(row)
    problems = validate_records(records)
    if problems:
        sys.exit("Data problems:\n " + "\n ".join(problems))
    with open(f"{out_dir}/records.jsonl", "w") as f:
        f.write("\n".join(json.dumps(r) for r in records))

    judge = Judge(score_fn)
    # Read the gold set inside a context manager so the handle is closed.
    with open(gold) as f:
        gold_rows = [json.loads(line) for line in f if line.strip()]
    cal = judge.calibrate(gold_rows)
    with open(f"{out_dir}/calibration.json", "w") as f:
        json.dump(cal, f, indent=2)

    if any(not r.get("cluster") for r in records):
        uniq = sorted({r["input"] for r in records})
        cmap = dict(zip(uniq, auto_cluster(uniq, n_clusters=2)))
        for r in records:
            # A CSV with a `cluster` column yields "" for blank cells, which
            # setdefault() would leave untouched; fill every falsy value.
            if not r.get("cluster"):
                r["cluster"] = int(cmap[r["input"]])

    sa, sb, cids, meta = assemble_records(records, version_a, version_b)
    dec = gate(sa, sb, cids, judge_error_sd=cal["error_sd"], kappa=cal["kappa"],
               alpha=alpha, min_n=min_n)
    # Recomputed here because dec.components is None when the judge is
    # inadmissible.
    comp = variance_components(sa, sb)

    report = {
        "verdict": dec.verdict,
        "delta_ci": [round(x, 4) for x in dec.delta_ci],
        "weighted_delta": round(comp.delta_hat, 4),
        "flagged_clusters": dec.flagged_clusters,
        "judge": cal,
        "n_inputs": meta["n_inputs"],
        "note": dec.note,
    }
    with open(f"{out_dir}/gate_report.json", "w") as f:
        json.dump(report, f, indent=2)

    print(json.dumps(report, indent=2))
    return dec.verdict
90
+
91
+
92
def main(argv=None):
    """Entry point of the `regsub` command.

    Parses `argv` (the process arguments when None), runs the gate through
    `run`, and exits with the status that `_EXIT` assigns to the verdict
    (1 for a verdict it does not know).
    """
    parser = argparse.ArgumentParser(prog="regsub", description="Offline AI regression gate.")
    # (flag, add_argument keyword options), in the order they appear in --help.
    option_specs = (
        ("--data", {"required": True,
                    "help": "CSV of scored responses (input,version,replicate,cluster,response)"}),
        ("--gold", {"required": True, "help": "JSONL gold set (input,response,human)"}),
        ("--version-a", {"default": "v1", "help": "baseline version label"}),
        ("--version-b", {"default": "v2", "help": "candidate version label"}),
        ("--out", {"default": "out", "help": "output directory"}),
        ("--alpha", {"type": float, "default": 0.05}),
        ("--min-n", {"type": int, "default": 30, "help": "power floor"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    opts = parser.parse_args(argv)
    verdict = run(opts.data, opts.gold, opts.version_a, opts.version_b,
                  opts.out, opts.alpha, opts.min_n)
    exit_code = _EXIT.get(verdict, 1)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
@@ -0,0 +1,350 @@
1
+ """
2
+ diff_engine.py — reference implementation of the statistical diff engine.
3
+
4
+ This is the core gate for a regression substrate for probabilistic software:
5
+ given evaluation scores for a current version A and a candidate version B over
6
+ the SAME captured inputs (paired design), decide whether B can ship, must be
7
+ held, or is an outright regression — accounting for input difficulty, judge
8
+ measurement noise, and within-version (token) sampling variance — and localize
9
+ regressions to semantic clusters under FDR control.
10
+
11
+ Data model
12
+ ----------
13
+ scores_a, scores_b : np.ndarray of shape (N, k)
14
+ Judge quality scores in [0, 1]. N inputs, k replicate samples per input per
15
+ version (k > 1 is what lets us separate token noise from real difference).
16
+ cluster_ids : np.ndarray of shape (N,)
17
+ Integer semantic-cluster label per input.
18
+ judge_error_sd : float
19
+ Std-dev of the judge's measurement error, ESTIMATED FROM A HUMAN GOLD SET
20
+ (not assumed). Propagated through the bootstrap so an unreliable judge
21
+ widens the interval instead of silently passing the gate.
22
+ kappa : float
23
+ Judge–human agreement (Cohen's kappa) on the gold set. Used only as an
24
+ admissibility diagnostic: below kappa_min the judge cannot gate at all.
25
+
26
+ All statistics are deliberately explicit rather than delegated to a black-box
27
+ fitter, so each variance component is auditable.
28
+ """
29
+
30
+ from __future__ import annotations
31
+ from dataclasses import dataclass, field
32
+ import numpy as np
33
+ from scipy import stats
34
+
35
+
36
+ # --------------------------------------------------------------------------- #
37
+ # 1. Variance components (mixed-effects view of the paired delta)
38
+ # --------------------------------------------------------------------------- #
39
+ # Model: score[v, i, r] = mu + tau_v + u_i + (tau*u)_{v,i} + e_{v,i,r}
40
+ # tau_v fixed version effect (the thing we want: tau_B - tau_A)
41
+ # u_i random input effect ~ N(0, sigma_input^2)
42
+ # (tau*u)_{vi} version x input interaction (where regressions live)
43
+ # e_{v,i,r} residual token noise ~ N(0, sigma_resid^2)
44
+ #
45
+ # In a PAIRED design the per-input delta cancels u_i and mu, leaving the
46
+ # interaction plus averaged residuals. That is the estimator we actually gate
47
+ # on; the residual variance is estimated separately so we can report the
48
+ # signal-to-noise ratio explicitly ("variance-components filtering").
49
+
50
+ @dataclass
51
+ class VarianceComponents:
52
+ delta_hat: float # estimated tau_B - tau_A (mean paired delta)
53
+ se_delta: float # standard error of delta_hat
54
+ sigma_resid2: float # within-(version,input) token-noise variance
55
+ interaction_var: float # structural version x input variance (>=0)
56
+ snr: float # |delta_hat| / sqrt(interaction_var + tiny)
57
+
58
+
59
+ def _as_ragged(scores) -> list[np.ndarray]:
60
+ """Accept either a balanced (N, k) array or a list of variable-length 1D
61
+ arrays (one per input, length k_i) and normalize to a list of 1D arrays."""
62
+ if isinstance(scores, np.ndarray) and scores.ndim == 2:
63
+ return [scores[i] for i in range(scores.shape[0])]
64
+ return [np.asarray(s, dtype=float) for s in scores]
65
+
66
+
67
+ def _pooled_residual_var(cells: list[np.ndarray]) -> tuple[float, float]:
68
+ """Token-noise variance pooled across cells, weighted by df = k_i - 1.
69
+ Cells with k=1 contribute zero df (no internal info) but are not dropped
70
+ elsewhere. Returns (sigma_resid2, total_df)."""
71
+ num = den = 0.0
72
+ for arr in cells:
73
+ if len(arr) >= 2:
74
+ num += (len(arr) - 1) * arr.var(ddof=1)
75
+ den += (len(arr) - 1)
76
+ return (num / den if den > 0 else 0.0), den
77
+
78
+
79
+ def variance_components(scores_a, scores_b) -> VarianceComponents:
80
+ A, B = _as_ragged(scores_a), _as_ragged(scores_b)
81
+ N = len(A)
82
+ kA = np.array([len(a) for a in A], dtype=float)
83
+ kB = np.array([len(b) for b in B], dtype=float)
84
+ mA = np.array([a.mean() for a in A])
85
+ mB = np.array([b.mean() for b in B])
86
+ d = mB - mA # per-input paired delta
87
+
88
+ # (1) df-weighted pooled token-noise variance across all cells of A and B.
89
+ sigma_resid2, total_df = _pooled_residual_var([*A, *B])
90
+
91
+ # (2) per-input measurement variance of d_i from token noise.
92
+ v = sigma_resid2 * (1.0 / kA + 1.0 / kB) # (N,)
93
+
94
+ if total_df == 0: # no replicate info anywhere
95
+ delta_hat = float(d.mean())
96
+ se = float(d.std(ddof=1) / np.sqrt(N))
97
+ tau2 = float(max(0.0, d.var(ddof=1)))
98
+ return VarianceComponents(delta_hat, se, sigma_resid2, tau2,
99
+ abs(delta_hat) / np.sqrt(tau2 + 1e-12))
100
+
101
+ # (3) DerSimonian-Laird estimate of between-input (interaction) variance.
102
+ w0 = 1.0 / np.maximum(v, 1e-12) # fixed-effect weights
103
+ delta_fe = np.sum(w0 * d) / np.sum(w0)
104
+ Q = np.sum(w0 * (d - delta_fe) ** 2) # Cochran's Q
105
+ C = np.sum(w0) - np.sum(w0 ** 2) / np.sum(w0)
106
+ tau2 = max(0.0, (Q - (N - 1)) / C) if C > 0 else 0.0
107
+
108
+ # (4) random-effects inverse-variance weights: keep low-k inputs, trust
109
+ # them proportionally to their precision (never truncate).
110
+ w = 1.0 / (tau2 + v)
111
+ delta_hat = float(np.sum(w * d) / np.sum(w))
112
+ se_delta = float(np.sqrt(1.0 / np.sum(w)))
113
+ snr = abs(delta_hat) / np.sqrt(tau2 + 1e-12)
114
+ return VarianceComponents(delta_hat, se_delta, sigma_resid2, float(tau2), snr)
115
+
116
+
117
+ # --------------------------------------------------------------------------- #
118
+ # 2. Bootstrap CI that wraps judge resampling
119
+ # --------------------------------------------------------------------------- #
120
+ # Cluster (input-level) bootstrap so the CI reflects input sampling, AND inject
121
+ # judge measurement noise on every scored output so the CI also reflects judge
122
+ # unreliability. An unreliable judge (large judge_error_sd) widens the interval
123
+ # and prevents a confident ship — by construction, not by a hand-tuned fudge.
124
+
125
def bootstrap_delta_ci(
    scores_a,
    scores_b,
    judge_error_sd: float,
    alpha: float = 0.05,
    n_boot: int = 5000,
    rng: np.random.Generator | None = None,
) -> tuple[float, float, np.ndarray]:
    """Percentile bootstrap interval for the weighted paired delta.

    Inputs are resampled with replacement and Gaussian judge noise is added
    to every resampled cell mean, so both input sampling and judge
    unreliability widen the interval.

    Returns (lo, hi, boot): the alpha/2 and 1 - alpha/2 percentiles of the
    bootstrap deltas, and the `n_boot` bootstrap deltas themselves. A
    generator seeded with 0 is used when `rng` is not supplied.
    """
    if not rng:
        rng = np.random.default_rng(0)

    cells_a = _as_ragged(scores_a)
    cells_b = _as_ragged(scores_b)
    n_inputs = len(cells_a)
    reps_a = np.array([len(cell) for cell in cells_a], dtype=float)
    reps_b = np.array([len(cell) for cell in cells_b], dtype=float)
    mean_a = np.array([cell.mean() for cell in cells_a])
    mean_b = np.array([cell.mean() for cell in cells_b])

    # Random-effects weights come from the full-data fit and stay fixed for
    # every bootstrap replicate.
    fit = variance_components(cells_a, cells_b)
    token_var = fit.sigma_resid2 * (1.0 / reps_a + 1.0 / reps_b)
    weights = 1.0 / (fit.interaction_var + token_var + 1e-12)

    # Judge noise on a cell mean has variance sd^2 / k_i, so the means are
    # perturbed directly (one draw per resampled cell) rather than every
    # individual score; cells with few replicates get more judge noise.
    draw = rng.integers(0, n_inputs, size=(n_boot, n_inputs))
    judge_se_a = judge_error_sd / np.sqrt(reps_a[draw])
    judge_se_b = judge_error_sd / np.sqrt(reps_b[draw])
    noisy_a = mean_a[draw] + rng.normal(0, 1, draw.shape) * judge_se_a
    noisy_b = mean_b[draw] + rng.normal(0, 1, draw.shape) * judge_se_b
    deltas = noisy_b - noisy_a
    drawn_weights = weights[draw]
    boot = np.sum(drawn_weights * deltas, axis=1) / np.sum(drawn_weights, axis=1)

    tail = 100 * alpha / 2
    lo, hi = np.percentile(boot, [tail, 100 * (1 - alpha / 2)])
    return float(lo), float(hi), boot
160
+
161
+
162
+ # --------------------------------------------------------------------------- #
163
+ # 3. Multiplicity control across clusters: BH (fixed-n) and e-BH (sequential)
164
+ # --------------------------------------------------------------------------- #
165
+
166
def benjamini_hochberg(pvalues: np.ndarray, alpha: float) -> np.ndarray:
    """Benjamini-Hochberg step-up procedure.

    Returns a boolean mask, aligned with `pvalues`, of the rejected
    hypotheses. Controls FDR at `alpha` under independence / positive
    dependence (PRDS).
    """
    n_tests = len(pvalues)
    ascending = np.argsort(pvalues)
    cutoffs = alpha * (np.arange(1, n_tests + 1) / n_tests)
    below = pvalues[ascending] <= cutoffs

    rejected = np.zeros(n_tests, dtype=bool)
    hits = np.where(below)[0]
    if hits.size:
        # Step-up: everything ranked at or before the LAST passing rank is
        # rejected, even if it missed its own cutoff.
        last_rank = hits[-1]
        rejected[ascending[: last_rank + 1]] = True
    return rejected
179
+
180
+
181
def p_to_e(p: np.ndarray, kappa: float = 0.5) -> np.ndarray:
    """Map p-values to e-values with the kappa-calibrator
    e = kappa * p**(kappa - 1).

    p is clipped to [1e-12, 1] first, so p = 0 yields a large finite e-value
    instead of infinity.
    """
    bounded = np.clip(p, 1e-12, 1.0)
    return kappa * bounded ** (kappa - 1.0)
185
+
186
+
187
def e_bh(evalues: np.ndarray, alpha: float) -> np.ndarray:
    """e-BH procedure (Wang & Ramdas 2022).

    Sorts the e-values in decreasing order, finds the largest rank k with
    e_(k) >= m / (alpha * k), and rejects the k largest. Returns a boolean
    mask aligned with `evalues`. The original author selected this variant
    for gates that are evaluated repeatedly over many deploys.
    """
    n_tests = len(evalues)
    descending = np.argsort(-evalues)
    ranks = np.arange(1, n_tests + 1)
    required = n_tests / (alpha * ranks)  # minimum e-value per rank
    meets = evalues[descending] >= required

    rejected = np.zeros(n_tests, dtype=bool)
    hits = np.where(meets)[0]
    if hits.size:
        last_rank = hits[-1]
        rejected[descending[: last_rank + 1]] = True
    return rejected
201
+
202
+
203
+ # --------------------------------------------------------------------------- #
204
+ # 4. Cluster regression scan
205
+ # --------------------------------------------------------------------------- #
206
+ # For each cluster, one-sided test of H0: delta >= 0 (B not worse) vs
207
+ # H1: delta < 0 (regression). Small p = strong regression evidence. Then apply
208
+ # BH (fixed-n) or e-BH (sequential) across clusters to control false flags.
209
+
210
def paired_permutation_p(
    d: np.ndarray, n_perm: int = 10000, rng: np.random.Generator | None = None
) -> float:
    """One-sided sign-flip permutation p-value for H1: mean(d) < 0 (regression).

    Each of the `n_perm` rounds flips the sign of every paired delta at
    random; the p-value is the share of rounds whose mean is at most the
    observed mean, with a +1 correction in numerator and denominator so it
    is never zero. No normal approximation is involved, which is why it is
    the test used for small clusters. A generator seeded with 0 is used when
    `rng` is not supplied.
    """
    if not rng:
        rng = np.random.default_rng(0)
    observed = d.mean()
    flips = rng.choice(np.array([-1.0, 1.0]), size=(n_perm, len(d)))
    null_means = (flips * d).mean(axis=1)
    as_extreme = np.sum(null_means <= observed)
    return float((1 + as_extreme) / (1 + n_perm))
222
+
223
+
224
@dataclass
class ClusterResult:
    """Regression-scan outcome for one semantic cluster."""
    cluster_id: int
    n: int                            # number of inputs in the cluster
    delta: float                      # mean paired delta (B - A) in the cluster
    p_value: float                    # one-sided p-value for a regression
    e_value: float                    # the p-value calibrated to an e-value
    flagged_regression: bool = False  # set after multiplicity control
232
+
233
+
234
def cluster_scan(
    scores_a: np.ndarray,
    scores_b: np.ndarray,
    cluster_ids: np.ndarray,
    alpha: float = 0.10,
    mode: str = "fixed",              # "fixed" -> BH ; "sequential" -> e-BH
    e_kappa: float = 0.5,
    small_n: int = 30,                # below this, use the permutation test
    n_perm: int = 10000,
    rng: np.random.Generator | None = None,
) -> list[ClusterResult]:
    """Test every cluster for a regression and flag the significant ones.

    Per cluster the per-input paired deltas are tested one-sided for
    mean < 0:
      * fewer than 3 inputs  -> not tested, p fixed at 1.0;
      * fewer than `small_n` -> sign-flip permutation test;
      * otherwise            -> normal approximation.
    Flags are then assigned across all clusters with BH (mode "fixed") or
    e-BH on the calibrated e-values (mode "sequential") at level `alpha`.

    Returns one ClusterResult per distinct cluster id, in `np.unique` order.
    """
    if not rng:
        rng = np.random.default_rng(0)
    cells_a = _as_ragged(scores_a)
    cells_b = _as_ragged(scores_b)
    deltas = np.array([b.mean() - a.mean() for a, b in zip(cells_a, cells_b)])

    results: list[ClusterResult] = []
    for label in np.unique(cluster_ids):
        members = deltas[cluster_ids == label]
        size = len(members)

        if size < 3:                  # too small to test at all
            mean_delta = float(members.mean()) if size else 0.0
            untested_e = p_to_e(np.array([1.0]), e_kappa)[0]
            results.append(ClusterResult(int(label), size, mean_delta, 1.0, untested_e))
            continue

        if size < small_n:            # distribution-free fallback
            p = paired_permutation_p(members, n_perm=n_perm, rng=rng)
        else:                         # normal approximation
            se = members.std(ddof=1) / np.sqrt(size)
            if se == 0:
                # No spread: the sign of the mean alone decides.
                p = 0.0 if members.mean() < 0 else 1.0
            else:
                p = float(stats.norm.cdf(members.mean() / se))
        e = float(p_to_e(np.array([p]), e_kappa)[0])
        results.append(ClusterResult(int(label), size, float(members.mean()), p, e))

    if mode == "sequential":
        reject = e_bh(np.array([r.e_value for r in results]), alpha)
    else:
        reject = benjamini_hochberg(np.array([r.p_value for r in results]), alpha)
    for result, is_rejected in zip(results, reject):
        result.flagged_regression = bool(is_rejected)
    return results
274
+
275
+
276
+ # --------------------------------------------------------------------------- #
277
+ # 5. Top-level gate
278
+ # --------------------------------------------------------------------------- #
279
+
280
@dataclass
class GateDecision:
    """Outcome of `gate`."""
    # SHIP | SHIP_WITH_FLAGS | HOLD | REGRESSION | JUDGE_INADMISSIBLE
    verdict: str
    # Bootstrap interval for the delta; (nan, nan) when the judge is inadmissible.
    delta_ci: tuple[float, float]
    # Full-data fit; None when the judge is inadmissible.
    components: VarianceComponents | None
    # Ids of clusters flagged as regressions by the cluster scan.
    flagged_clusters: list[int] = field(default_factory=list)
    note: str = ""


def gate(
    scores_a: np.ndarray,
    scores_b: np.ndarray,
    cluster_ids: np.ndarray,
    judge_error_sd: float = 0.0,
    kappa: float = 1.0,
    alpha: float = 0.05,
    margin: float = 0.0,          # superiority margin on the delta
    kappa_min: float = 0.4,
    min_n: int = 30,              # below this, no SHIP/REGRESSION -- insufficient power
    mode: str = "fixed",
    rng: np.random.Generator | None = None,
) -> GateDecision:
    """Decide whether candidate B may replace baseline A.

    Parameters
    ----------
    scores_a, scores_b : paired judge scores, one row / array per input.
    cluster_ids : cluster label per input, used for the regression scan.
    judge_error_sd : judge measurement-error std-dev, fed to the bootstrap.
    kappa, kappa_min : judge-human agreement and the minimum required for
        the judge to gate at all. A NaN kappa counts as inadmissible.
    alpha : significance level of the interval; the cluster scan runs at
        2 * alpha.
    margin : the interval's lower bound must exceed this to SHIP.
    min_n : with fewer inputs the gate can only return REGRESSION or HOLD.
    mode : "fixed" (BH) or "sequential" (e-BH) for the cluster scan.
    rng : optional generator shared by the bootstrap and permutation tests.

    Returns
    -------
    GateDecision with verdict SHIP, SHIP_WITH_FLAGS, HOLD, REGRESSION or
    JUDGE_INADMISSIBLE.
    """
    # (a) Judge must be admissible before it is allowed to gate anything.
    #     Written as `not >=` so that a NaN kappa (undefined agreement) is
    #     rejected too; `kappa < kappa_min` is False for NaN.
    if not kappa >= kappa_min:
        if np.isnan(kappa):
            reason = f"kappa is NaN (undefined agreement); kappa_min={kappa_min}"
        else:
            reason = f"kappa={kappa:.2f} < kappa_min={kappa_min}"
        return GateDecision("JUDGE_INADMISSIBLE", (float("nan"), float("nan")), None,
                            note=f"{reason}; recalibrate judge.")

    # (b) Power floor, handled ASYMMETRICALLY. At small N the bootstrap
    #     underestimates variance, so we never trust it to SHIP. But the danger
    #     is one-sided: a consistent, large negative delta is a real catastrophe
    #     even at small N, and the sign-flip permutation test (which does not
    #     depend on the bootstrap) can certify it. A significant *improvement*
    #     at small N is NOT shippable -- a tiny sample can't represent the input
    #     distribution (an external-validity failure, not a power failure). So:
    #     regression -> flag it; everything else -> HOLD. (Below ~N=5 the
    #     permutation p floor of 1/2^N exceeds alpha, so this correctly falls
    #     through to HOLD on its own.)
    A = _as_ragged(scores_a)
    n_inputs = len(A)
    if n_inputs < min_n:
        comp = variance_components(scores_a, scores_b)
        lo, hi, _ = bootstrap_delta_ci(scores_a, scores_b, judge_error_sd, alpha, rng=rng)
        B = _as_ragged(scores_b)
        d = np.array([b.mean() - a.mean() for a, b in zip(A, B)])
        p_reg = paired_permutation_p(d, rng=rng)          # H1: mean(d) < 0
        if d.mean() < 0 and p_reg < alpha:
            return GateDecision("REGRESSION", (lo, hi), comp,
                                note=f"small-N catastrophic regression caught by exact "
                                     f"permutation test: N={n_inputs}, p={p_reg:.3f} < alpha={alpha}.")
        return GateDecision("HOLD", (lo, hi), comp,
                            note=f"insufficient power: N={n_inputs} < min_n={min_n}; "
                                 f"permutation p={p_reg:.3f} is not a clear regression and a "
                                 f"small sample cannot certify a ship -- collect more paired inputs.")

    # (c) Enough inputs: the verdict comes from the bootstrap interval.
    comp = variance_components(scores_a, scores_b)
    lo, hi, _ = bootstrap_delta_ci(scores_a, scores_b, judge_error_sd, alpha, rng=rng)

    if lo > margin:
        verdict = "SHIP"
    elif hi < 0:
        verdict = "REGRESSION"
    else:
        verdict = "HOLD"

    # (d) Localized regressions are reported even when the overall verdict ships.
    clusters = cluster_scan(scores_a, scores_b, cluster_ids, alpha=2 * alpha, mode=mode, rng=rng)
    flagged = [r.cluster_id for r in clusters if r.flagged_regression]
    if flagged and verdict == "SHIP":
        verdict = "SHIP_WITH_FLAGS"

    return GateDecision(verdict, (lo, hi), comp, flagged,
                        note=f"snr={comp.snr:.2f}, token-noise var={comp.sigma_resid2:.4f}")