regression-substrate 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- regression_substrate/__init__.py +36 -0
- regression_substrate/adapters.py +185 -0
- regression_substrate/cli.py +108 -0
- regression_substrate/diff_engine.py +350 -0
- regression_substrate/gold.py +108 -0
- regression_substrate/ingest.py +255 -0
- regression_substrate/otel_exporter.py +220 -0
- regression_substrate/sequential_gate.py +243 -0
- regression_substrate-0.1.0.dist-info/METADATA +104 -0
- regression_substrate-0.1.0.dist-info/RECORD +13 -0
- regression_substrate-0.1.0.dist-info/WHEEL +4 -0
- regression_substrate-0.1.0.dist-info/entry_points.txt +2 -0
- regression_substrate-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""
|
|
2
|
+
regression_substrate — a statistically rigorous gate for deciding whether a new
|
|
3
|
+
prompt/model version is better, worse, or inconclusive versus the current one.
|
|
4
|
+
|
|
5
|
+
Public API:
|
|
6
|
+
|
|
7
|
+
from regression_substrate import gate, Judge, load_from_jsonl
|
|
8
|
+
|
|
9
|
+
Offline gate (Day-1): gate, GateDecision, variance_components
|
|
10
|
+
Ingestion: load_from_jsonl, load_from_csv, assemble_records,
|
|
11
|
+
validate_records, Judge, auto_cluster
|
|
12
|
+
Streaming (Day-2): SequentialGate, Backend
|
|
13
|
+
Drift (Day-2): RollingGoldSet, drift_report, sample_for_labeling
|
|
14
|
+
Vendor adapters: flatten_runs, load_from_langsmith
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from .diff_engine import gate, GateDecision, variance_components
|
|
18
|
+
from .ingest import (
|
|
19
|
+
load_from_jsonl, load_from_csv, assemble_records, validate_records,
|
|
20
|
+
Judge, cohen_kappa, auto_cluster, tfidf_embedder, sentence_transformer_embedder,
|
|
21
|
+
)
|
|
22
|
+
from .sequential_gate import SequentialGate, Backend, MartingaleState
|
|
23
|
+
from .gold import RollingGoldSet, drift_report, sample_for_labeling
|
|
24
|
+
from .adapters import flatten_runs, load_from_langsmith, TraceMap, LANGSMITH
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"gate", "GateDecision", "variance_components",
|
|
30
|
+
"load_from_jsonl", "load_from_csv", "assemble_records", "validate_records",
|
|
31
|
+
"Judge", "cohen_kappa", "auto_cluster", "tfidf_embedder", "sentence_transformer_embedder",
|
|
32
|
+
"SequentialGate", "Backend", "MartingaleState",
|
|
33
|
+
"RollingGoldSet", "drift_report", "sample_for_labeling",
|
|
34
|
+
"flatten_runs", "load_from_langsmith", "TraceMap", "LANGSMITH",
|
|
35
|
+
"__version__",
|
|
36
|
+
]
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
adapters.py — pull from vendor observability platforms into the 7-field schema.
|
|
3
|
+
|
|
4
|
+
HONESTY NOTE — READ THIS:
|
|
5
|
+
* The FLATTENING logic below is tested (see __main__) against a synthetic
|
|
6
|
+
fixture shaped like a vendor's documented run/feedback model. That transform
|
|
7
|
+
is proven.
|
|
8
|
+
* The LIVE FETCH (`load_from_langsmith`) is SDK- and auth-dependent and is NOT
|
|
9
|
+
exercised here. Vendor schemas drift, so the field paths in the presets are
|
|
10
|
+
best-effort and MUST be verified against the platform's current docs before
|
|
11
|
+
you trust them in production.
|
|
12
|
+
|
|
13
|
+
Design: don't hardwire any one vendor. A `TraceMap` says where each field lives
|
|
14
|
+
inside one run record; `flatten_runs` collapses (possibly nested, multi-step)
|
|
15
|
+
runs + feedback into flat 7-field records that ingest.assemble_records consumes.
|
|
16
|
+
A vendor is then just a TraceMap preset plus a thin fetch wrapper.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
from dataclasses import dataclass
|
|
21
|
+
import json
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _dig(obj, path: str, default=None):
    """Walk `obj` along a dotted key path such as 'extra.metadata.version'.

    Each path segment is looked up as a dict key. As soon as the current node
    is not a dict, or does not contain the segment, `default` is returned.
    """
    node = obj
    for part in path.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _canon(x) -> str:
    """Canonicalize an input/output payload to a stable string.

    The `input` string is what groups replicates and pairs versions, so the
    result must be the same for equal payloads.

    * str  -> returned unchanged.
    * dict -> the first string value found under one of the well-known keys
              (checked in the order listed below); otherwise the dict is
              serialized as JSON with sorted keys.
    * else -> `str(x)`.
    """
    if isinstance(x, str):
        return x
    if isinstance(x, dict):
        for k in ("input", "question", "query", "text", "prompt",
                  "output", "answer", "result", "response"):
            if isinstance(x.get(k), str):
                return x[k]
        # default=str: payloads may carry values json cannot encode natively
        # (e.g. UUID or datetime objects); stringify them instead of raising
        # TypeError. Output for JSON-native payloads is unchanged.
        return json.dumps(x, sort_keys=True, default=str)
    return str(x)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class TraceMap:
    """Describes where each field sits inside a single vendor run record.

    The `*_path` attributes are dotted key paths into the run dict;
    `score_key` names the feedback key whose score is used.
    """

    # --- required: no sensible default exists for these ---
    input_path: str
    output_path: str
    # The team MUST have logged a version; runs without one cannot be paired.
    version_path: str
    # Feedback key that carries the quality score.
    score_key: str

    # --- optional: structural fields ---
    run_type_path: str = "run_type"
    parent_path: str = "parent_run_id"
    id_path: str = "id"
    cluster_path: str | None = None
    # (low, high) bounds of the raw score.
    score_scale: tuple = (0.0, 1.0)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# Best-effort preset. VERIFY these paths against current LangSmith docs.
|
|
64
|
+
LANGSMITH = TraceMap(
    # payloads
    input_path="inputs",
    output_path="outputs",
    # team-logged metadata
    version_path="extra.metadata.version",
    cluster_path="extra.metadata.cluster",
    # feedback
    score_key="quality",
    score_scale=(0.0, 1.0),
    # run structure
    id_path="id",
    parent_path="parent_run_id",
    run_type_path="run_type",
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def flatten_runs(runs: list[dict], feedback: list[dict], tmap: TraceMap,
                 unit: str = "root") -> list[dict]:
    """Collapse runs + feedback into flat records.

    unit="root"  -> evaluate whole trajectories (input=root input, response=root
                    output). Runs that have a parent are dropped.
    unit=<type>  -> evaluate a component instead (e.g. "retriever", "llm"):
                    only runs whose run type equals `unit` are kept.

    Each feedback entry must carry "run_id" and "key"; its "score" may be
    absent or None. Runs without a score under `tmap.score_key`, or without a
    version at `tmap.version_path`, are skipped and counted; the counts are
    printed once at the end.

    Returns a list of dicts with keys "input", "version", "response", "score"
    (rescaled with `tmap.score_scale` and clamped to [0, 1]) and, when
    available, "cluster".
    """
    # Index feedback as run_id -> {key: score}. Entries without a score are
    # ignored so a score-less row cannot erase a numeric score recorded for
    # the same run and key.
    scores_by_run: dict = {}
    for fb in feedback:
        value = fb.get("score")
        if value is None:
            continue
        scores_by_run.setdefault(fb["run_id"], {})[fb["key"]] = value

    lo, hi = tmap.score_scale
    skipped_no_version = skipped_no_score = 0
    records = []
    for r in runs:
        if unit == "root":
            # A root run is one with no parent id.
            if _dig(r, tmap.parent_path) is not None:
                continue
        elif _dig(r, tmap.run_type_path) != unit:
            continue

        raw = (scores_by_run.get(_dig(r, tmap.id_path)) or {}).get(tmap.score_key)
        if raw is None:
            skipped_no_score += 1
            continue
        version = _dig(r, tmap.version_path)
        if version is None:
            skipped_no_version += 1
            continue

        # Rescale to [0, 1]; a degenerate scale (hi == lo) leaves the raw value.
        score = (float(raw) - lo) / (hi - lo) if hi != lo else float(raw)
        rec = {
            "input": _canon(_dig(r, tmap.input_path)),
            "version": str(version),
            "response": _canon(_dig(r, tmap.output_path)),
            "score": max(0.0, min(1.0, score)),
        }
        if tmap.cluster_path:
            c = _dig(r, tmap.cluster_path)
            if c is not None:
                rec["cluster"] = c
        records.append(rec)

    if skipped_no_version:
        print(f" [adapter] WARNING: skipped {skipped_no_version} runs with no "
              f"version tag at '{tmap.version_path}' -- they cannot be paired.")
    if skipped_no_score:
        print(f" [adapter] note: skipped {skipped_no_score} runs with no "
              f"'{tmap.score_key}' feedback.")
    return records
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def load_from_langsmith(project: str, version_a: str, version_b: str,
                        tmap: TraceMap = LANGSMITH, unit: str = "root"):
    """Fetch a project's runs and feedback live, flatten them, and assemble.

    NOT exercised in this repo: it needs the `langsmith` SDK and
    LANGSMITH_API_KEY, and both the SDK call signatures and the record schema
    used here must be checked against current LangSmith docs.

    Returns whatever `ingest.assemble_records` returns for the flattened
    records and the two version labels.
    """
    from langsmith import Client  # raises if not installed
    from .ingest import assemble_records

    client = Client()

    runs = []
    for run in client.list_runs(project_name=project):
        runs.append(run.dict())

    feedback = []
    for fb in client.list_feedback(run_ids=[r["id"] for r in runs]):
        feedback.append(fb.dict())

    flat = flatten_runs(runs, feedback, tmap, unit=unit)
    return assemble_records(flat, version_a, version_b)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# --------------------------------------------------------------------------- #
|
|
148
|
+
# Demo: flatten a SYNTHETIC LangSmith-shaped fixture -> gate(). Proves the
|
|
149
|
+
# transform (nested trajectory collapse, version extraction, score scaling,
|
|
150
|
+
# replicate derivation) without touching the live API.
|
|
151
|
+
# --------------------------------------------------------------------------- #
|
|
152
|
+
|
|
153
|
+
if __name__ == "__main__":
    import numpy as np
    from .ingest import assemble_records, validate_records
    from .diff_engine import gate

    rng = np.random.default_rng(0)

    # Three questions per cluster, billing first, then general.
    inputs = [(cluster, f"{cluster} question {i}")
              for cluster in ("billing", "general")
              for i in range(3)]

    runs, feedback = [], []
    next_id = 0
    for cluster, question in inputs:
        # v2 is built to be much worse on billing and slightly worse elsewhere.
        v2_base = 0.20 if cluster == "billing" else 0.78
        for ver, base in (("v1", 0.90), ("v2", v2_base)):
            for _rep in range(2):  # two replicates per (input, version)
                root = f"run-{next_id}"
                child = f"run-{next_id + 1}"  # a nested LLM step (must be dropped)
                next_id += 2
                runs.append({
                    "id": root, "run_type": "chain", "parent_run_id": None,
                    "inputs": {"question": question}, "outputs": {"answer": "..."},
                    "extra": {"metadata": {"version": ver, "cluster": cluster}},
                })
                runs.append({
                    "id": child, "run_type": "llm", "parent_run_id": root,
                    "inputs": {}, "outputs": {},
                    "extra": {"metadata": {"version": ver}},
                })
                score = float(np.clip(base + rng.normal(0, 0.03), 0, 1))
                feedback.append({"run_id": root, "key": "quality",
                                 "score": round(score, 3)})

    records = flatten_runs(runs, feedback, LANGSMITH, unit="root")
    print(f"FLATTEN: {len(runs)} raw runs -> {len(records)} flat records "
          f"(child runs dropped under unit='root')")
    print(" sample:", records[0])
    print(" validation problems:", validate_records(records) or "none")

    sa, sb, cids, meta = assemble_records(records, "v1", "v2")
    print(" assembled:", meta)
    dec = gate(sa, sb, cids, judge_error_sd=0.05, kappa=0.78, alpha=0.05)
    rounded_ci = tuple(round(x, 3) for x in dec.delta_ci)
    print(" verdict:", dec.verdict, "| CI:", rounded_ci)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""
|
|
2
|
+
regression_substrate.cli — the `regsub` command. Reads eval responses, scores them, calibrates
|
|
3
|
+
a judge against a gold set, runs the offline gate, and writes reports. Designed to
|
|
4
|
+
drop into a CI step:
|
|
5
|
+
|
|
6
|
+
regsub --data evals.csv --gold gold.jsonl --version-a v1 --version-b v2 --out out/
|
|
7
|
+
# exit code 0 = SHIP / SHIP_WITH_FLAGS ; 1 = REGRESSION / HOLD ; 2 = JUDGE_INADMISSIBLE
|
|
8
|
+
|
|
9
|
+
By default the rubric judge below is used so the command runs with no API key. In
|
|
10
|
+
production, import the package and pass your own judge:
|
|
11
|
+
|
|
12
|
+
from regression_substrate import Judge, load_from_csv, gate
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
import argparse
|
|
17
|
+
import csv
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import sys
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
import numpy as np
|
|
23
|
+
|
|
24
|
+
from .ingest import Judge, validate_records, assemble_records, auto_cluster
|
|
25
|
+
from .diff_engine import gate, variance_components
|
|
26
|
+
|
|
27
|
+
# A self-contained rubric judge so the CLI runs offline. Replace in production.
|
|
28
|
+
_ACTION = {"refund", "reset", "update", "settings", "24/7", "link"}
_COURTESY = {"sorry", "happy", "please", "help", "glad", "sure"}


def default_judge(question: str, answer: str) -> float:
    """Score `answer` with a fixed offline rubric; `question` is not used.

    Weighted sum, rounded to 3 decimals:
      0.55 * specificity  (1.0 if any _ACTION term occurs as a substring)
      0.35 * completeness (word count / 12, capped at 1.0)
      0.10 * courtesy     (1.0 if any _COURTESY term occurs, else 0.7)
    """
    text = answer.lower()
    mentions_action = any(term in text for term in _ACTION)
    is_polite = any(term in text for term in _COURTESY)
    word_count = len(answer.split())

    spec = 1.0 if mentions_action else 0.0
    completeness = min(word_count / 12.0, 1.0)
    courtesy = 1.0 if is_polite else 0.7
    return round(0.55 * spec + 0.35 * completeness + 0.10 * courtesy, 3)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
_EXIT = {"SHIP": 0, "SHIP_WITH_FLAGS": 0, "REGRESSION": 1, "HOLD": 1, "JUDGE_INADMISSIBLE": 2}


def run(data, gold, version_a, version_b, out_dir, alpha, min_n, score_fn=default_judge):
    """Score a CSV of responses, calibrate the judge, run the gate, write reports.

    Parameters:
      data       path to a CSV; each row is scored with `score_fn(input, response)`
      gold       path to a JSONL gold set passed to `Judge.calibrate`
      version_a  baseline version label
      version_b  candidate version label
      out_dir    directory (created if missing) that receives records.jsonl,
                 calibration.json and gate_report.json
      alpha      level passed to `gate`
      min_n      power floor passed to `gate`
      score_fn   judge callable taking (input, response)

    Returns the verdict string from `gate`. Exits the process via `sys.exit`
    with a message when `validate_records` reports problems. The report is
    also printed to stdout as JSON.
    """
    os.makedirs(out_dir, exist_ok=True)

    records = []
    with open(data, newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills cells missing from a short row with None;
            # score those as empty text instead of crashing the judge.
            row["score"] = score_fn(row.get("input") or "", row.get("response") or "")
            if str(row.get("replicate", "")).strip().isdigit():
                row["replicate"] = int(row["replicate"])
            else:
                row.pop("replicate", None)
            records.append(row)
    problems = validate_records(records)
    if problems:
        sys.exit("Data problems:\n " + "\n ".join(problems))
    with open(f"{out_dir}/records.jsonl", "w") as f:
        f.write("\n".join(json.dumps(r) for r in records))

    judge = Judge(score_fn)
    # Read the gold set inside a context manager so the handle is closed.
    with open(gold) as f:
        gold_rows = [json.loads(line) for line in f if line.strip()]
    cal = judge.calibrate(gold_rows)
    with open(f"{out_dir}/calibration.json", "w") as f:
        json.dump(cal, f, indent=2)

    if any(not r.get("cluster") for r in records):
        uniq = sorted({r["input"] for r in records})
        cmap = dict(zip(uniq, auto_cluster(uniq, n_clusters=2)))
        for r in records:
            # A blank CSV cell is "" (key present), which setdefault would
            # leave in place; overwrite every missing-or-empty cluster.
            if not r.get("cluster"):
                r["cluster"] = int(cmap[r["input"]])

    sa, sb, cids, meta = assemble_records(records, version_a, version_b)
    dec = gate(sa, sb, cids, judge_error_sd=cal["error_sd"], kappa=cal["kappa"],
               alpha=alpha, min_n=min_n)
    comp = variance_components(sa, sb)

    report = {
        "verdict": dec.verdict,
        "delta_ci": [round(x, 4) for x in dec.delta_ci],
        "weighted_delta": round(comp.delta_hat, 4),
        "flagged_clusters": dec.flagged_clusters,
        "judge": cal,
        "n_inputs": meta["n_inputs"],
        "note": dec.note,
    }
    with open(f"{out_dir}/gate_report.json", "w") as f:
        json.dump(report, f, indent=2)

    print(json.dumps(report, indent=2))
    return dec.verdict
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def main(argv=None):
    """Parse `regsub` command-line options, run the gate, and exit.

    The process exit code is looked up from the verdict in `_EXIT`; a verdict
    missing from that table exits with 1.
    """
    parser = argparse.ArgumentParser(prog="regsub", description="Offline AI regression gate.")
    option_table = (
        ("--data", dict(required=True, help="CSV of scored responses (input,version,replicate,cluster,response)")),
        ("--gold", dict(required=True, help="JSONL gold set (input,response,human)")),
        ("--version-a", dict(default="v1", help="baseline version label")),
        ("--version-b", dict(default="v2", help="candidate version label")),
        ("--out", dict(default="out", help="output directory")),
        ("--alpha", dict(type=float, default=0.05)),
        ("--min-n", dict(type=int, default=30, help="power floor")),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    ns = parser.parse_args(argv)
    verdict = run(ns.data, ns.gold, ns.version_a, ns.version_b,
                  ns.out, ns.alpha, ns.min_n)
    exit_code = _EXIT.get(verdict, 1)
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
"""
|
|
2
|
+
diff_engine.py — reference implementation of the statistical diff engine.
|
|
3
|
+
|
|
4
|
+
This is the core gate for a regression substrate for probabilistic software:
|
|
5
|
+
given evaluation scores for a current version A and a candidate version B over
|
|
6
|
+
the SAME captured inputs (paired design), decide whether B can ship, must be
|
|
7
|
+
held, or is an outright regression — accounting for input difficulty, judge
|
|
8
|
+
measurement noise, and within-version (token) sampling variance — and localize
|
|
9
|
+
regressions to semantic clusters under FDR control.
|
|
10
|
+
|
|
11
|
+
Data model
|
|
12
|
+
----------
|
|
13
|
+
scores_a, scores_b : np.ndarray of shape (N, k)
|
|
14
|
+
Judge quality scores in [0, 1]. N inputs, k replicate samples per input per
|
|
15
|
+
version (k > 1 is what lets us separate token noise from real difference).
|
|
16
|
+
cluster_ids : np.ndarray of shape (N,)
|
|
17
|
+
Integer semantic-cluster label per input.
|
|
18
|
+
judge_error_sd : float
|
|
19
|
+
Std-dev of the judge's measurement error, ESTIMATED FROM A HUMAN GOLD SET
|
|
20
|
+
(not assumed). Propagated through the bootstrap so an unreliable judge
|
|
21
|
+
widens the interval instead of silently passing the gate.
|
|
22
|
+
kappa : float
|
|
23
|
+
Judge–human agreement (Cohen's kappa) on the gold set. Used only as an
|
|
24
|
+
admissibility diagnostic: below kappa_min the judge cannot gate at all.
|
|
25
|
+
|
|
26
|
+
All statistics are deliberately explicit rather than delegated to a black-box
|
|
27
|
+
fitter, so each variance component is auditable.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
from __future__ import annotations
|
|
31
|
+
from dataclasses import dataclass, field
|
|
32
|
+
import numpy as np
|
|
33
|
+
from scipy import stats
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# --------------------------------------------------------------------------- #
|
|
37
|
+
# 1. Variance components (mixed-effects view of the paired delta)
|
|
38
|
+
# --------------------------------------------------------------------------- #
|
|
39
|
+
# Model: score[v, i, r] = mu + tau_v + u_i + (tau*u)_{v,i} + e_{v,i,r}
|
|
40
|
+
# tau_v fixed version effect (the thing we want: tau_B - tau_A)
|
|
41
|
+
# u_i random input effect ~ N(0, sigma_input^2)
|
|
42
|
+
# (tau*u)_{vi} version x input interaction (where regressions live)
|
|
43
|
+
# e_{v,i,r} residual token noise ~ N(0, sigma_resid^2)
|
|
44
|
+
#
|
|
45
|
+
# In a PAIRED design the per-input delta cancels u_i and mu, leaving the
|
|
46
|
+
# interaction plus averaged residuals. That is the estimator we actually gate
|
|
47
|
+
# on; the residual variance is estimated separately so we can report the
|
|
48
|
+
# signal-to-noise ratio explicitly ("variance-components filtering").
|
|
49
|
+
|
|
50
|
+
@dataclass
class VarianceComponents:
    """Summary of the paired-delta fit returned by `variance_components`."""

    # Estimated tau_B - tau_A, i.e. the mean paired delta.
    delta_hat: float
    # Standard error of delta_hat.
    se_delta: float
    # Token-noise variance within a (version, input) cell.
    sigma_resid2: float
    # Structural version x input variance; never negative.
    interaction_var: float
    # |delta_hat| / sqrt(interaction_var + tiny).
    snr: float
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _as_ragged(scores) -> list[np.ndarray]:
    """Normalize scores to a list with one 1D array per input.

    A 2D ndarray is split into its rows as-is; any other iterable is treated
    as a sequence of per-input score sequences (lengths may differ), each
    converted to a float array.
    """
    balanced = isinstance(scores, np.ndarray) and scores.ndim == 2
    if not balanced:
        return [np.asarray(row, dtype=float) for row in scores]
    return list(scores)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _pooled_residual_var(cells: list[np.ndarray]) -> tuple[float, float]:
    """Pool the within-cell sample variance over all cells, weighting by df.

    A cell of length k contributes k - 1 degrees of freedom; cells with fewer
    than two values contribute nothing. Returns (sigma_resid2, total_df),
    with sigma_resid2 = 0.0 when no cell has replicates.
    """
    weighted_ss = 0.0
    df_total = 0.0
    for cell in cells:
        df = len(cell) - 1
        if df < 1:
            continue
        weighted_ss += df * cell.var(ddof=1)
        df_total += df
    if df_total > 0:
        return weighted_ss / df_total, df_total
    return 0.0, df_total
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def variance_components(scores_a, scores_b) -> VarianceComponents:
    """Estimate the paired version delta and its variance components.

    `scores_a` and `scores_b` hold the replicate scores of versions A and B for
    the same inputs, as a balanced 2D array or a list of per-input sequences.

    Steps:
      1. pool the within-cell (token-noise) variance over every A and B cell;
      2. turn it into a per-input measurement variance of the paired delta;
      3. estimate the between-input (interaction) variance tau^2 with the
         DerSimonian-Laird moment estimator;
      4. combine the per-input deltas with inverse-variance weights.

    When no cell has replicates, the plain mean and standard error of the
    per-input deltas are used instead.

    Raises ValueError if the two score sets cover a different number of inputs.
    """
    A, B = _as_ragged(scores_a), _as_ragged(scores_b)
    if len(A) != len(B):
        raise ValueError(
            f"paired design needs the same inputs for both versions: "
            f"got {len(A)} for A and {len(B)} for B")
    N = len(A)
    kA = np.array([len(a) for a in A], dtype=float)
    kB = np.array([len(b) for b in B], dtype=float)
    mA = np.array([a.mean() for a in A])
    mB = np.array([b.mean() for b in B])
    d = mB - mA                                   # per-input paired delta

    # (1) df-weighted pooled token-noise variance across all cells of A and B.
    sigma_resid2, total_df = _pooled_residual_var([*A, *B])

    # (2) per-input measurement variance of d_i from token noise.
    v = sigma_resid2 * (1.0 / kA + 1.0 / kB)     # (N,)

    if total_df == 0:                             # no replicate info anywhere
        delta_hat = float(d.mean())
        se = float(d.std(ddof=1) / np.sqrt(N))
        tau2 = float(max(0.0, d.var(ddof=1)))
        return VarianceComponents(delta_hat, se, sigma_resid2, tau2,
                                  abs(delta_hat) / np.sqrt(tau2 + 1e-12))

    # (3) DerSimonian-Laird estimate of between-input (interaction) variance.
    w0 = 1.0 / np.maximum(v, 1e-12)               # fixed-effect weights
    delta_fe = np.sum(w0 * d) / np.sum(w0)
    Q = np.sum(w0 * (d - delta_fe) ** 2)          # Cochran's Q
    C = np.sum(w0) - np.sum(w0 ** 2) / np.sum(w0)
    tau2 = max(0.0, (Q - (N - 1)) / C) if C > 0 else 0.0

    # (4) random-effects inverse-variance weights: keep low-k inputs, trust
    #     them proportionally to their precision (never truncate). The floor
    #     mirrors step (3): with identical replicates (v == 0) and tau2 == 0
    #     the unguarded division gives infinite weights and a NaN delta.
    w = 1.0 / np.maximum(tau2 + v, 1e-12)
    delta_hat = float(np.sum(w * d) / np.sum(w))
    se_delta = float(np.sqrt(1.0 / np.sum(w)))
    snr = abs(delta_hat) / np.sqrt(tau2 + 1e-12)
    return VarianceComponents(delta_hat, se_delta, sigma_resid2, float(tau2), snr)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# --------------------------------------------------------------------------- #
|
|
118
|
+
# 2. Bootstrap CI that wraps judge resampling
|
|
119
|
+
# --------------------------------------------------------------------------- #
|
|
120
|
+
# Cluster (input-level) bootstrap so the CI reflects input sampling, AND inject
|
|
121
|
+
# judge measurement noise on every scored output so the CI also reflects judge
|
|
122
|
+
# unreliability. An unreliable judge (large judge_error_sd) widens the interval
|
|
123
|
+
# and prevents a confident ship — by construction, not by a hand-tuned fudge.
|
|
124
|
+
|
|
125
|
+
def bootstrap_delta_ci(
    scores_a,
    scores_b,
    judge_error_sd: float,
    alpha: float = 0.05,
    n_boot: int = 5000,
    rng: np.random.Generator | None = None,
) -> tuple[float, float, np.ndarray]:
    """Percentile bootstrap interval for the weighted paired delta.

    Inputs are resampled with replacement, and Gaussian judge noise with
    standard deviation `judge_error_sd / sqrt(k)` is added to each resampled
    cell mean (k = replicates in that cell). The random-effects weights come
    from one fit on the full data and are reused for every resample.

    Returns (lower, upper, boot): the alpha/2 and 1 - alpha/2 percentiles and
    the array of `n_boot` bootstrap deltas. With `rng` omitted, a generator
    seeded with 0 is used.
    """
    if not rng:
        rng = np.random.default_rng(0)

    cells_a, cells_b = _as_ragged(scores_a), _as_ragged(scores_b)
    n_inputs = len(cells_a)
    reps_a = np.array([len(cell) for cell in cells_a], dtype=float)
    reps_b = np.array([len(cell) for cell in cells_b], dtype=float)
    mean_a = np.array([cell.mean() for cell in cells_a])
    mean_b = np.array([cell.mean() for cell in cells_b])

    # Weights from the full-data fit, held fixed across resamples.
    fit = variance_components(cells_a, cells_b)
    token_var = fit.sigma_resid2 * (1.0 / reps_a + 1.0 / reps_b)
    weights = 1.0 / (fit.interaction_var + token_var + 1e-12)

    # One row of input indices per resample.
    draw = rng.integers(0, n_inputs, size=(n_boot, n_inputs))

    # Perturb cell means rather than individual scores: the noise on a mean of
    # k scores has sd judge_error_sd / sqrt(k), so low-k cells get more of it.
    noise_sd_a = judge_error_sd / np.sqrt(reps_a[draw])
    noise_sd_b = judge_error_sd / np.sqrt(reps_b[draw])
    noisy_a = mean_a[draw] + rng.normal(0, 1, draw.shape) * noise_sd_a
    noisy_b = mean_b[draw] + rng.normal(0, 1, draw.shape) * noise_sd_b

    deltas = noisy_b - noisy_a
    picked_w = weights[draw]
    boot = np.sum(picked_w * deltas, axis=1) / np.sum(picked_w, axis=1)

    lower, upper = np.percentile(boot, [100 * alpha / 2, 100 * (1 - alpha / 2)])
    return float(lower), float(upper), boot
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# --------------------------------------------------------------------------- #
|
|
163
|
+
# 3. Multiplicity control across clusters: BH (fixed-n) and e-BH (sequential)
|
|
164
|
+
# --------------------------------------------------------------------------- #
|
|
165
|
+
|
|
166
|
+
def benjamini_hochberg(pvalues: np.ndarray, alpha: float) -> np.ndarray:
    """Benjamini-Hochberg step-up procedure.

    Returns a boolean mask, aligned with `pvalues`, of rejected hypotheses.
    Controls FDR at `alpha` under independence / positive dependence (PRDS).
    """
    m = len(pvalues)
    reject = np.zeros(m, dtype=bool)
    order = np.argsort(pvalues)
    thresh = alpha * (np.arange(1, m + 1) / m)
    # Sorted positions whose p-value is under its rank threshold.
    hits = np.flatnonzero(pvalues[order] <= thresh)
    if hits.size:
        # Step-up: reject everything up to the largest passing rank.
        reject[order[: hits[-1] + 1]] = True
    return reject
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def p_to_e(p: np.ndarray, kappa: float = 0.5) -> np.ndarray:
    """Turn p-values into e-values with the kappa-calibrator.

    e = kappa * p**(kappa - 1), which integrates to 1 over p in (0, 1] under
    H0. p is clipped to [1e-12, 1] before the power is taken so that p = 0
    does not produce an infinite e-value.
    """
    clipped = np.clip(p, 1e-12, 1.0)
    exponent = kappa - 1.0
    return kappa * np.power(clipped, exponent)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def e_bh(evalues: np.ndarray, alpha: float) -> np.ndarray:
    """e-BH procedure (Wang & Ramdas 2022).

    Controls FDR at `alpha` under ARBITRARY dependence and stays valid under
    optional stopping, which makes it the variant to use when the gate is hit
    repeatedly over many deploys. Returns a boolean rejection mask aligned
    with `evalues`.
    """
    m = len(evalues)
    reject = np.zeros(m, dtype=bool)
    order = np.argsort(-evalues)                   # largest e-value first
    crit = m / (alpha * np.arange(1, m + 1))       # threshold per rank
    hits = np.flatnonzero(evalues[order] >= crit)
    if hits.size:
        # Reject every hypothesis up to the largest rank that meets its bar.
        reject[order[: hits[-1] + 1]] = True
    return reject
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# --------------------------------------------------------------------------- #
|
|
204
|
+
# 4. Cluster regression scan
|
|
205
|
+
# --------------------------------------------------------------------------- #
|
|
206
|
+
# For each cluster, one-sided test of H0: delta >= 0 (B not worse) vs
|
|
207
|
+
# H1: delta < 0 (regression). Small p = strong regression evidence. Then apply
|
|
208
|
+
# BH (fixed-n) or e-BH (sequential) across clusters to control false flags.
|
|
209
|
+
|
|
210
|
+
def paired_permutation_p(
    d: np.ndarray, n_perm: int = 10000, rng: np.random.Generator | None = None
) -> float:
    """One-sided sign-flip permutation p-value for H1: mean(d) < 0 (regression).

    Under H0 the paired delta is symmetric about 0, so every sign is
    exchangeable. `n_perm` random sign patterns are drawn and the p-value is
    the share of permuted means at or below the observed mean, with a +1 in
    numerator and denominator so it can never be 0. Makes no normality
    assumption, which suits small clusters. With `rng` omitted, a generator
    seeded with 0 is used.
    """
    if not rng:
        rng = np.random.default_rng(0)
    observed = d.mean()
    flips = rng.choice(np.array([-1.0, 1.0]), size=(n_perm, len(d)))
    null_means = (flips * d).mean(axis=1)
    at_or_below = np.sum(null_means <= observed)
    return float((1 + at_or_below) / (1 + n_perm))
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
@dataclass
class ClusterResult:
    """Outcome of the regression test for one semantic cluster."""

    cluster_id: int
    n: int            # number of inputs in the cluster
    delta: float      # mean paired delta within the cluster
    p_value: float    # one-sided p-value for H1: delta < 0
    e_value: float    # e-value calibrated from p_value
    # Set after multiplicity control across all clusters.
    flagged_regression: bool = False
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def cluster_scan(
    scores_a: np.ndarray,
    scores_b: np.ndarray,
    cluster_ids: np.ndarray,
    alpha: float = 0.10,
    mode: str = "fixed",            # "fixed" -> BH ; "sequential" -> e-BH
    e_kappa: float = 0.5,
    small_n: int = 30,              # below this, use the permutation test
    n_perm: int = 10000,
    rng: np.random.Generator | None = None,
) -> list[ClusterResult]:
    """Test every cluster for a regression and flag under FDR control.

    For each distinct label in `cluster_ids` the per-input paired deltas are
    tested one-sided (H1: mean delta < 0):
      * fewer than 3 inputs        -> not tested, p = 1;
      * fewer than `small_n`       -> sign-flip permutation test;
      * otherwise                  -> normal approximation.
    Rejections are then chosen across clusters with BH on the p-values
    (mode="fixed") or e-BH on the calibrated e-values (mode="sequential").

    `cluster_ids` needs one label per input; labels must be convertible with
    `int()`. Returns one ClusterResult per cluster, in `np.unique` order.
    """
    if rng is None:
        rng = np.random.default_rng(0)
    A, B = _as_ragged(scores_a), _as_ragged(scores_b)
    d_i = np.array([b.mean() - a.mean() for a, b in zip(A, B)])
    # Coerce to an array: with a plain list, `cluster_ids == c` is a single
    # bool rather than a per-input mask, and every cluster comes back empty.
    labels = np.asarray(cluster_ids)
    results: list[ClusterResult] = []

    for c in np.unique(labels):
        d = d_i[labels == c]
        n = len(d)
        if n < 3:                                   # too small to test at all
            results.append(ClusterResult(int(c), n, float(d.mean()) if n else 0.0,
                                         1.0, float(p_to_e(np.array([1.0]), e_kappa)[0])))
            continue
        if n < small_n:                             # distribution-free fallback
            p = paired_permutation_p(d, n_perm=n_perm, rng=rng)
        else:                                       # normal approximation
            se = d.std(ddof=1) / np.sqrt(n)
            if se == 0:
                # Zero spread: the sign of the mean decides outright.
                p = 0.0 if d.mean() < 0 else 1.0
            else:
                p = float(stats.norm.cdf(d.mean() / se))
        results.append(ClusterResult(int(c), n, float(d.mean()), p,
                                     float(p_to_e(np.array([p]), e_kappa)[0])))

    pvals = np.array([r.p_value for r in results])
    if mode == "sequential":
        reject = e_bh(np.array([r.e_value for r in results]), alpha)
    else:
        reject = benjamini_hochberg(pvals, alpha)
    for r, rej in zip(results, reject):
        r.flagged_regression = bool(rej)
    return results
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# --------------------------------------------------------------------------- #
|
|
277
|
+
# 5. Top-level gate
|
|
278
|
+
# --------------------------------------------------------------------------- #
|
|
279
|
+
|
|
280
|
+
@dataclass
class GateDecision:
    """Result returned by `gate`."""

    # One of: SHIP | SHIP_WITH_FLAGS | HOLD | REGRESSION | JUDGE_INADMISSIBLE
    verdict: str
    # (lower, upper) interval for the delta; (nan, nan) when the judge is
    # inadmissible.
    delta_ci: tuple[float, float]
    # Fitted variance components; None when the judge is inadmissible.
    components: VarianceComponents | None
    # Ids of clusters flagged as regressions.
    flagged_clusters: list[int] = field(default_factory=list)
    # Human-readable explanation of the verdict.
    note: str = ""
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def gate(
    scores_a: np.ndarray,
    scores_b: np.ndarray,
    cluster_ids: np.ndarray,
    judge_error_sd: float = 0.0,
    kappa: float = 1.0,
    alpha: float = 0.05,
    margin: float = 0.0,        # superiority margin on the delta
    kappa_min: float = 0.4,
    min_n: int = 30,            # below this, no SHIP -- insufficient power
    mode: str = "fixed",
    rng: np.random.Generator | None = None,
) -> GateDecision:
    """Decide whether candidate B may replace baseline A.

    Verdicts:
      JUDGE_INADMISSIBLE  kappa is below `kappa_min` (or is NaN)
      REGRESSION          the interval lies wholly below 0, or -- with fewer
                          than `min_n` inputs -- the sign-flip permutation
                          test rejects at `alpha` with a negative mean delta
      SHIP                the interval lies wholly above `margin`
      SHIP_WITH_FLAGS     as SHIP, but at least one cluster is flagged
      HOLD                everything else

    `scores_a` / `scores_b` are replicate scores for the same inputs and
    `cluster_ids` gives one label per input. `judge_error_sd` widens the
    bootstrap interval; `mode` picks BH ("fixed") or e-BH ("sequential") for
    the cluster scan, which runs at level 2 * alpha.
    """
    # (a) Judge must be admissible before it is allowed to gate anything.
    #     Written as `not (>=)` so a NaN kappa is rejected: `nan < kappa_min`
    #     is False and would otherwise wave an uncalibrated judge through.
    if not (kappa >= kappa_min):
        return GateDecision("JUDGE_INADMISSIBLE", (float("nan"), float("nan")), None,
                            note=f"kappa={kappa:.2f} < kappa_min={kappa_min}; recalibrate judge.")

    n_inputs = len(_as_ragged(scores_a))

    # Both paths below report the same fit and interval, so compute them once.
    comp = variance_components(scores_a, scores_b)
    lo, hi, _ = bootstrap_delta_ci(scores_a, scores_b, judge_error_sd, alpha, rng=rng)

    # (b) Power floor, handled ASYMMETRICALLY. At small N the bootstrap
    #     underestimates variance, so we never trust it to SHIP. But the danger
    #     is one-sided: a consistent, large negative delta is a real catastrophe
    #     even at small N, and the sign-flip permutation test (which does not
    #     depend on the bootstrap) can certify it. A significant *improvement*
    #     at small N is NOT shippable -- a tiny sample can't represent the input
    #     distribution (an external-validity failure, not a power failure). So:
    #     regression -> flag it; everything else -> HOLD. (Below ~N=5 the
    #     permutation p floor of 1/2^N exceeds alpha, so this correctly falls
    #     through to HOLD on its own.)
    if n_inputs < min_n:
        A, B = _as_ragged(scores_a), _as_ragged(scores_b)
        d = np.array([b.mean() - a.mean() for a, b in zip(A, B)])
        p_reg = paired_permutation_p(d, rng=rng)        # H1: mean(d) < 0
        if d.mean() < 0 and p_reg < alpha:
            return GateDecision("REGRESSION", (lo, hi), comp,
                                note=f"small-N catastrophic regression caught by exact "
                                     f"permutation test: N={n_inputs}, p={p_reg:.3f} < alpha={alpha}.")
        return GateDecision("HOLD", (lo, hi), comp,
                            note=f"insufficient power: N={n_inputs} < min_n={min_n}; "
                                 f"permutation p={p_reg:.3f} is not a clear regression and a "
                                 f"small sample cannot certify a ship -- collect more paired inputs.")

    if lo > margin:
        verdict = "SHIP"
    elif hi < 0:
        verdict = "REGRESSION"
    else:
        verdict = "HOLD"

    # (c) Localized regressions are reported even when the overall verdict ships.
    clusters = cluster_scan(scores_a, scores_b, cluster_ids, alpha=2 * alpha, mode=mode, rng=rng)
    flagged = [r.cluster_id for r in clusters if r.flagged_regression]
    if flagged and verdict == "SHIP":
        verdict = "SHIP_WITH_FLAGS"

    return GateDecision(verdict, (lo, hi), comp, flagged,
                        note=f"snr={comp.snr:.2f}, token-noise var={comp.sigma_resid2:.4f}")
|