@doidor/agentrig 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -27
- package/dist/agent/copilot.js +46 -5
- package/dist/agent/copilot.js.map +1 -1
- package/dist/cli.js +30 -5
- package/dist/cli.js.map +1 -1
- package/dist/commands/doctor.js +53 -8
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/eval-dynamic.js +316 -0
- package/dist/commands/eval-dynamic.js.map +1 -0
- package/dist/commands/eval-scaffold.js +173 -0
- package/dist/commands/eval-scaffold.js.map +1 -0
- package/dist/commands/eval.js +184 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/core/audit.js +237 -9
- package/dist/core/audit.js.map +1 -1
- package/dist/core/model-family.js +31 -0
- package/dist/core/model-family.js.map +1 -0
- package/dist/core/scenario-runner.js +298 -0
- package/dist/core/scenario-runner.js.map +1 -0
- package/dist/prompts/index.js +121 -30
- package/dist/prompts/index.js.map +1 -1
- package/knowledge/PRINCIPLES.md +2 -2
- package/knowledge/manifest.json +16 -1
- package/knowledge/templates/AGENTS.md +7 -6
- package/knowledge/templates/agents/README.md +4 -4
- package/knowledge/templates/agents/developer.yml +1 -1
- package/knowledge/templates/agents/judge.yml +1 -1
- package/knowledge/templates/agents/reviewer.yml +1 -1
- package/knowledge/templates/agents/triager.yml +5 -4
- package/knowledge/templates/dashboard/dashboard.mjs +12 -5
- package/knowledge/templates/eval/RUBRIC.md +87 -64
- package/knowledge/templates/eval/axes.json +25 -25
- package/knowledge/templates/eval/calibration/README.md +54 -0
- package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
- package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
- package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
- package/knowledge/templates/eval/checks.json +88 -11
- package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
- package/knowledge/templates/eval/score.mjs +368 -42
- package/knowledge/templates/eval/static-audit.mjs +204 -17
- package/knowledge/templates/harness/state-machine.yml +18 -12
- package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
- package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
- package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
- package/package.json +4 -3
- package/knowledge/templates/eval/scenarios/README.md +0 -24
- package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
- package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
- package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
|
@@ -22,6 +22,26 @@ const scriptDir = dirname(fileURLToPath(import.meta.url));
|
|
|
22
22
|
const resultsDir = join(scriptDir, "results");
|
|
23
23
|
const axesPath = join(scriptDir, "axes.json");
|
|
24
24
|
|
|
25
|
+
// Mirror of src/core/model-family.ts. Kept inline to keep this script dep-free so it
|
|
26
|
+
// works in target repos that haven't run `npm install` after `agentrig init`.
|
|
27
|
+
const FAMILY_PATTERNS = [
  ["anthropic-claude", /^(anthropic[\.\/-])?claude([-_\.]|$)/i],
  ["openai-gpt", /^(openai[\.\/-])?(gpt|o[1-9]|codex|davinci|chatgpt)([-_\.]|$)/i],
  ["google-gemini", /^(google[\.\/-])?(gemini|palm|bard|flash)([-_\.]|$)/i],
  ["mistral", /^(mistral|mixtral|codestral|ministral)([-_\.]|$)/i],
  ["deepseek", /^deepseek([-_\.]|$)/i],
  ["meta-llama", /^(meta[\.\/-])?(llama|code-?llama)([-_\.]|$)/i],
  ["xai-grok", /^(xai[\.\/-])?grok([-_\.]|$)/i],
  ["cohere", /^(cohere[\.\/-])?(command|aya)([-_\.]|$)/i],
  ["qwen", /^qwen([-_\.]|$)/i],
];

/**
 * Map a model identifier to a coarse family label.
 *
 * The first entry of FAMILY_PATTERNS whose regex matches wins. Every pattern is
 * anchored at the start of the id and requires `-`, `_`, `.` or end-of-string
 * right after the family token, so an id such as "qwen2.5-coder" does NOT match
 * the "qwen" row and falls through to the `unknown:` branch.
 *
 * @param {string} id Model identifier, e.g. "claude-3-opus" or "openai/gpt-4o".
 * @returns {string} "" for a falsy id; a known family name; otherwise
 *   `unknown:<leading alphanumeric run, lower-cased>`, or `unknown:<id>` when
 *   the id does not start with an alphanumeric character.
 */
function modelFamily(id) {
  if (!id) {
    return "";
  }
  const known = FAMILY_PATTERNS.find(([, pattern]) => pattern.test(id));
  if (known !== undefined) {
    return known[0];
  }
  const leading = id.match(/^([a-z0-9]+)/i);
  if (leading === null) {
    return `unknown:${id}`;
  }
  return `unknown:${leading[1].toLowerCase()}`;
}
|
|
44
|
+
|
|
25
45
|
function loadRegistry() {
|
|
26
46
|
if (!existsSync(axesPath)) {
|
|
27
47
|
console.error(`axes.json not found at ${axesPath}`);
|
|
@@ -30,7 +50,9 @@ function loadRegistry() {
|
|
|
30
50
|
return JSON.parse(readFileSync(axesPath, "utf8"));
|
|
31
51
|
}
|
|
32
52
|
|
|
33
|
-
/** Build axis -> { category, codes } lookup for a rubric type.
|
|
53
|
+
/** Build axis -> { category, codes, weight, veto } lookup for a rubric type. Supports
|
|
54
|
+
* both legacy schema (axis: [CODE,...]) and v2 schema (axis: { codes:[...], weight, veto }).
|
|
55
|
+
*/
|
|
34
56
|
function axisIndex(registry, type) {
|
|
35
57
|
const def = registry.types?.[type];
|
|
36
58
|
if (!def) {
|
|
@@ -39,7 +61,12 @@ function axisIndex(registry, type) {
|
|
|
39
61
|
}
|
|
40
62
|
const index = new Map();
|
|
41
63
|
for (const [category, axes] of Object.entries(def.categories)) {
|
|
42
|
-
for (const [axis,
|
|
64
|
+
for (const [axis, spec] of Object.entries(axes)) {
|
|
65
|
+
const meta = Array.isArray(spec)
|
|
66
|
+
? { category, codes: spec, weight: 1, veto: false }
|
|
67
|
+
: { category, codes: spec.codes || [], weight: spec.weight ?? 1, veto: Boolean(spec.veto) };
|
|
68
|
+
index.set(axis, meta);
|
|
69
|
+
}
|
|
43
70
|
}
|
|
44
71
|
return index;
|
|
45
72
|
}
|
|
@@ -93,30 +120,70 @@ if (cmd === "save") {
|
|
|
93
120
|
if (!meta.codes.includes(code)) fail(`issue code "${code}" is not valid for axis "${name}". Valid: ${meta.codes.join(", ")}`);
|
|
94
121
|
if (!evidence) fail(`axis "${name}" scored ${score} < 1.0 but has no evidence — use name=score:CODE:evidence`);
|
|
95
122
|
}
|
|
96
|
-
return { name, category: meta.category, score, issue: code || null, evidence, confidence: 1 };
|
|
123
|
+
return { name, category: meta.category, weight: meta.weight, veto: meta.veto, score, issue: code || null, evidence, confidence: 1 };
|
|
97
124
|
});
|
|
98
125
|
|
|
99
|
-
// Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated.
|
|
126
|
+
// Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated + weighted.
|
|
100
127
|
const observed = axes.filter((a) => a.confidence > 0);
|
|
101
128
|
const categories = {};
|
|
102
|
-
for (const a of observed) (categories[a.category] ||= []).push(a.score);
|
|
129
|
+
for (const a of observed) (categories[a.category] ||= []).push({ score: a.score, weight: a.weight });
|
|
103
130
|
const categoryScores = Object.fromEntries(
|
|
104
|
-
Object.entries(categories).map(([c, xs]) =>
|
|
131
|
+
Object.entries(categories).map(([c, xs]) => {
|
|
132
|
+
const wSum = xs.reduce((s, x) => s + x.weight, 0);
|
|
133
|
+
const wScore = xs.reduce((s, x) => s + x.weight * x.score, 0);
|
|
134
|
+
return [c, round(wSum ? wScore / wSum : 0)];
|
|
135
|
+
}),
|
|
105
136
|
);
|
|
106
|
-
const
|
|
107
|
-
const
|
|
137
|
+
const wSum = observed.reduce((s, a) => s + a.weight, 0);
|
|
138
|
+
const wScore = observed.reduce((s, a) => s + a.weight * a.score, 0);
|
|
139
|
+
const aggregate = wSum ? round(wScore / wSum) : 0;
|
|
140
|
+
|
|
141
|
+
// Pass rule: aggregate clears threshold AND no observed axis is zero AND no veto axis < 1.0.
|
|
142
|
+
// veto axes encode "cosmetics cannot bail out a correctness/gate-compliance regression."
|
|
143
|
+
const vetoFails = observed.filter((a) => a.veto && a.score < 1).map((a) => a.name);
|
|
144
|
+
const hardZeros = observed.filter((a) => a.score === 0).map((a) => a.name);
|
|
145
|
+
let pass = observed.length > 0 && aggregate >= PASS && hardZeros.length === 0 && vetoFails.length === 0;
|
|
146
|
+
const failReason = !observed.length ? "no observed axes"
|
|
147
|
+
: vetoFails.length ? `veto axis fail: ${vetoFails.join(", ")}`
|
|
148
|
+
: hardZeros.length ? `zero score on: ${hardZeros.join(", ")}`
|
|
149
|
+
: aggregate < PASS ? `aggregate ${aggregate.toFixed(2)} < ${PASS}`
|
|
150
|
+
: null;
|
|
151
|
+
|
|
152
|
+
// Producer / judge metadata. Comes from --producer-model / --judge-model flags OR from
|
|
153
|
+
// env vars (the orchestrator sets AGENTRIG_PRODUCER_MODEL / AGENTRIG_JUDGE_MODEL so it
|
|
154
|
+
// doesn't have to thread two more positional args through). Family-divergence is enforced:
|
|
155
|
+
// a result where producer + judge share a family is rejected unless --allow-same-family
|
|
156
|
+
// (or AGENTRIG_ALLOW_SAME_FAMILY=1) is set, and the override gets recorded so reviewers
|
|
157
|
+
// can spot lazy single-model setups.
|
|
158
|
+
const producerModel = getOpt(args, "--producer-model") || process.env.AGENTRIG_PRODUCER_MODEL || "";
|
|
159
|
+
const judgeModel = getOpt(args, "--judge-model") || process.env.AGENTRIG_JUDGE_MODEL || judge;
|
|
160
|
+
const allowSameFamily = args.includes("--allow-same-family") || process.env.AGENTRIG_ALLOW_SAME_FAMILY === "1";
|
|
161
|
+
const trialIndex = getOpt(args, "--trial");
|
|
162
|
+
if (producerModel && judgeModel) {
|
|
163
|
+
if (modelFamily(producerModel) === modelFamily(judgeModel) && !allowSameFamily) {
|
|
164
|
+
fail(`producer "${producerModel}" and judge "${judgeModel}" share family "${modelFamily(producerModel)}". ` +
|
|
165
|
+
`Pass --allow-same-family (or set AGENTRIG_ALLOW_SAME_FAMILY=1) to override; the override will be recorded.`);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
108
168
|
|
|
109
169
|
const record = {
|
|
170
|
+
schemaVersion: 2,
|
|
110
171
|
type, task, scenario, variant, run, judge,
|
|
172
|
+
producerModel: producerModel || null,
|
|
173
|
+
judgeModel: judgeModel || null,
|
|
174
|
+
producerFamily: producerModel ? modelFamily(producerModel) : null,
|
|
175
|
+
judgeFamily: judgeModel ? modelFamily(judgeModel) : null,
|
|
176
|
+
allowSameFamily,
|
|
177
|
+
trialIndex: trialIndex != null ? Number(trialIndex) : null,
|
|
111
178
|
timestamp: new Date().toISOString(),
|
|
112
|
-
aggregate, pass, categoryScores, axes,
|
|
179
|
+
aggregate, pass, failReason, categoryScores, axes,
|
|
113
180
|
};
|
|
114
181
|
|
|
115
182
|
if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });
|
|
116
183
|
const safe = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
|
|
117
|
-
const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}.${Date.now()}.json`);
|
|
184
|
+
const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}${trialIndex != null ? "." + safe("trial" + trialIndex) : ""}.${Date.now()}.json`);
|
|
118
185
|
writeFileSync(file, JSON.stringify(record, null, 2));
|
|
119
|
-
console.log(`Saved ${file}\n aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"} (${observed.length}/${axes.length} axes observed)`);
|
|
186
|
+
console.log(`Saved ${file}\n aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"}${failReason ? ` — ${failReason}` : ""} (${observed.length}/${axes.length} axes observed)`);
|
|
120
187
|
process.exit(0);
|
|
121
188
|
}
|
|
122
189
|
|
|
@@ -134,6 +201,8 @@ if (cmd === "report" || cmd === "compare") {
|
|
|
134
201
|
let scoped = records;
|
|
135
202
|
if (filterType) scoped = scoped.filter((r) => r.type === filterType);
|
|
136
203
|
if (filterVariant) scoped = scoped.filter((r) => (r.variant || "base") === filterVariant);
|
|
204
|
+
const filterRun = getOpt(args, "--run");
|
|
205
|
+
if (filterRun) scoped = scoped.filter((r) => r.run === filterRun);
|
|
137
206
|
|
|
138
207
|
// Latest record per (type, scenario, variant).
|
|
139
208
|
const latest = new Map();
|
|
@@ -160,6 +229,13 @@ if (cmd === "report" || cmd === "compare") {
|
|
|
160
229
|
if (rows.length === 0) {
|
|
161
230
|
console.log(" No results yet. Run `score.mjs save ...` first.");
|
|
162
231
|
} else {
|
|
232
|
+
const passed = rows.filter((r) => r.pass).length;
|
|
233
|
+
const failed = rows.length - passed;
|
|
234
|
+
console.log(` Summary: ${passed}/${rows.length} scenario${rows.length === 1 ? "" : "s"} PASS, ${failed} FAIL`);
|
|
235
|
+
console.log(` Overall aggregate (mean of per-scenario aggregates): ${overall.toFixed(2)}\n`);
|
|
236
|
+
console.log(" Pass rule: aggregate ≥ 0.8 AND no observed axis = 0 AND no veto axis < 1.0.");
|
|
237
|
+
console.log(" Veto axes per type: run → correctness/gate_compliance; review → finding_correctness/blocking_decision.\n");
|
|
238
|
+
|
|
163
239
|
const byType = new Map();
|
|
164
240
|
for (const r of rows) {
|
|
165
241
|
if (!byType.has(r.type)) byType.set(r.type, []);
|
|
@@ -169,20 +245,170 @@ if (cmd === "report" || cmd === "compare") {
|
|
|
169
245
|
console.log(` ${type.toUpperCase()}`);
|
|
170
246
|
for (const r of group) {
|
|
171
247
|
const v = r.variant ? ` [${r.variant}]` : "";
|
|
172
|
-
|
|
248
|
+
const verdict = r.pass ? "PASS" : "FAIL";
|
|
249
|
+
console.log(` ${verdict} ${(r.scenario + v).padEnd(30)} ${r.aggregate.toFixed(2)} ${r.failReason ? `← ${r.failReason}` : ""}`);
|
|
250
|
+
// Print failing axes with their issue code + evidence so the operator can act.
|
|
251
|
+
if (!r.pass) {
|
|
252
|
+
const failing = (r.axes || []).filter((a) => a.confidence > 0 && a.score < 1);
|
|
253
|
+
for (const a of failing) {
|
|
254
|
+
const tag = a.score === 0 ? "0 " : "½ ";
|
|
255
|
+
const code = a.issue ? `[${a.issue}]` : "";
|
|
256
|
+
const ev = (a.evidence || "").slice(0, 110);
|
|
257
|
+
console.log(` ${tag} ${a.name.padEnd(22)} ${code} ${ev}`);
|
|
258
|
+
}
|
|
259
|
+
}
|
|
173
260
|
}
|
|
174
261
|
}
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
262
|
+
// Per-axis means: explicitly label which are observed across multiple scenarios.
|
|
263
|
+
console.log("\n Per-axis means across all observed scenarios (lower = worse on average):");
|
|
264
|
+
const axisRows = [...axisAgg.entries()].map(([name, v]) => ({ name, mean: round(v.sum / v.n), n: v.n }))
|
|
265
|
+
.sort((a, b) => a.mean - b.mean);
|
|
266
|
+
for (const a of axisRows) {
|
|
267
|
+
const flag = a.mean < 0.5 ? " ← weakest" : a.mean < 0.8 ? " ← weak" : "";
|
|
268
|
+
console.log(` ${a.name.padEnd(22)} ${a.mean.toFixed(2)} (n=${a.n})${flag}`);
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
console.log(`\n How to read this:`);
|
|
272
|
+
console.log(` • A scenario PASS means the harness handled this task well per the rubric.`);
|
|
273
|
+
console.log(` • A scenario FAIL means at least one veto axis dropped below 1.0 OR an observed axis was 0.`);
|
|
274
|
+
console.log(` • The overall aggregate (${overall.toFixed(2)}) is NOT the harness lift — that requires`);
|
|
275
|
+
console.log(` a baseline comparison: \`agentrig eval --dynamic --variant baseline --n 5\` then`);
|
|
276
|
+
console.log(` \`node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline\`.`);
|
|
277
|
+
if (failed > 0) {
|
|
278
|
+
console.log(` • To investigate a FAIL: open \`.agentrig/eval/results/runs/<runId>/<scenario>.trial0.diff.patch\``);
|
|
279
|
+
console.log(` and \`<scenario>.trial0.judge.json\` to see exactly what the producer did and what the judge saw.`);
|
|
280
|
+
}
|
|
178
281
|
}
|
|
179
282
|
}
|
|
180
283
|
process.exit(0);
|
|
181
284
|
}
|
|
182
285
|
|
|
183
|
-
|
|
286
|
+
if (cmd === "calibrate") {
  // Scoring half of judge calibration against the hand-labeled set in eval/calibration/.
  // This command does not invoke a judge itself: it compares an already-captured
  // judge_scores.json against one instance's ground truth, or rolls up earlier results.
  //
  // Usage:
  //   node score.mjs calibrate --instance <path-to-instance.yml> --judge-scores <path.json> [--judge <model>]
  //   node score.mjs calibrate --report   # roll up cached results in calibration/results/
  //
  // --judge labels the saved result; it falls back to "unknown" when omitted.
  const calibDir = join(scriptDir, "calibration");
  const wantsReport = args.includes("--report");
  if (wantsReport) {
    runCalibrateReport(calibDir);
    process.exit(0);
  }

  const instancePath = getOpt(args, "--instance");
  const judgeScoresPath = getOpt(args, "--judge-scores");
  const judgeModel = getOpt(args, "--judge") || "unknown";
  const haveBothInputs = Boolean(instancePath) && Boolean(judgeScoresPath);
  if (!haveBothInputs) fail("calibrate requires --instance <path.yml> and --judge-scores <path.json> (or --report)");

  const result = runCalibrateOne(instancePath, judgeScoresPath, judgeModel);

  // Persist one JSON file per (judge, instance, wall-clock ms) under calibration/results/.
  const calibResultsDir = join(calibDir, "results");
  if (!existsSync(calibResultsDir)) {
    mkdirSync(calibResultsDir, { recursive: true });
  }
  const sanitize = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
  const outName = `${sanitize(judgeModel)}.${sanitize(result.instanceId)}.${Date.now()}.json`;
  const out = join(calibResultsDir, outName);
  writeFileSync(out, JSON.stringify(result, null, 2));

  console.log(`Saved ${out}`);
  const agreementPct = (result.agreement * 100).toFixed(1);
  console.log(` ${result.instanceId}: agreement=${agreementPct}% (${result.matches}/${result.compared}) bias=${result.bias.toFixed(3)}`);
  process.exit(0);
}
|
|
315
|
+
|
|
316
|
+
console.error("Usage: score.mjs <save|report|compare|calibrate> ...");
|
|
184
317
|
process.exit(2);
|
|
185
318
|
|
|
319
|
+
// --- calibration helpers ---------------------------------------------------
|
|
320
|
+
/**
 * Score one captured judge output against one hand-labeled calibration instance.
 *
 * For every `ground_truth` entry in the instance file the judge's axis of the same
 * name is looked up and the two scores compared:
 *   - axis missing from the judge output      -> counted as a mismatch, no diff;
 *   - both sides mark the axis confidence 0   -> counted as agreement (diff 0);
 *   - either score missing or non-numeric     -> counted as a mismatch, no diff;
 *   - otherwise                               -> agreement when |judge - truth| <= 0.5.
 *
 * @param {string} instancePath    Path to the calibration instance (YAML).
 * @param {string} judgeScoresPath Path to the judge output (JSON with an `axes` array).
 * @param {string} judgeModel      Label stored on the result.
 * @returns {{instanceId: string, judgeModel: string, compared: number, matches: number,
 *   agreement: number, bias: number, axes: Array<object>}} `agreement` is
 *   matches/compared; `bias` is the mean signed (judge - truth) difference over
 *   the axes that produced a numeric diff. Both are passed through `round`.
 */
function runCalibrateOne(instancePath, judgeScoresPath, judgeModel) {
  if (!existsSync(instancePath)) fail(`instance not found: ${instancePath}`);
  if (!existsSync(judgeScoresPath)) fail(`judge scores not found: ${judgeScoresPath}`);

  // Ground truth is YAML, read with the inline regex-based parseCalibYaml so this
  // script needs no `yaml` dependency; judge scores are plain JSON.
  const truth = parseCalibYaml(readFileSync(instancePath, "utf8"));
  let judge;
  try {
    judge = JSON.parse(readFileSync(judgeScoresPath, "utf8"));
  } catch (err) {
    fail(`judge scores are not valid JSON (${judgeScoresPath}): ${err.message}`);
  }
  const judgeAxisList = Array.isArray(judge?.axes) ? judge.axes : [];
  const judgeAxes = new Map(judgeAxisList.map((a) => [a.name, a]));

  const compared = [];
  for (const t of truth.ground_truth || []) {
    const j = judgeAxes.get(t.axis);
    if (!j) {
      compared.push({ axis: t.axis, truth: t.score, judge: null, diff: null, within: false });
      continue;
    }
    if ((t.confidence ?? 1) === 0 && (j.confidence ?? 1) === 0) {
      compared.push({ axis: t.axis, truth: 0, judge: 0, diff: 0, within: true });
      continue;
    }
    // A missing or non-numeric score must not reach the subtraction: NaN would pass
    // the `diff != null` filter below and turn the whole bias into NaN.
    if (!Number.isFinite(t.score) || !Number.isFinite(j.score)) {
      compared.push({ axis: t.axis, truth: t.score ?? null, judge: j.score ?? null, diff: null, within: false });
      continue;
    }
    const diff = j.score - t.score;
    const within = Math.abs(diff) <= 0.5;
    compared.push({ axis: t.axis, truth: t.score, judge: j.score, diff, within });
  }

  const matches = compared.filter((c) => c.within).length;
  const agreement = compared.length ? matches / compared.length : 0;
  const signedDiffs = compared.filter((c) => c.diff != null).map((c) => c.diff);
  const bias = signedDiffs.length ? signedDiffs.reduce((s, x) => s + x, 0) / signedDiffs.length : 0;
  return {
    instanceId: truth.id || "unknown",
    judgeModel,
    compared: compared.length,
    matches,
    agreement: round(agreement),
    bias: round(bias),
    axes: compared,
  };
}
|
|
356
|
+
|
|
357
|
+
/**
 * Print a per-judge roll-up of the calibration results saved under `<calibDir>/results`.
 *
 * Each `*.json` file is read and grouped by its `judgeModel`; for every judge the
 * number of results, mean agreement and mean bias are printed, with a flag when the
 * mean agreement is below 0.8. Files that cannot be parsed, are not objects, or
 * carry no numeric `agreement` are skipped so one bad file cannot crash or
 * NaN-poison the report.
 *
 * @param {string} calibDir Path to the calibration directory.
 * @returns {void}
 */
function runCalibrateReport(calibDir) {
  const resultsDir2 = join(calibDir, "results");
  if (!existsSync(resultsDir2)) {
    console.log("No calibration results yet. Run `score.mjs calibrate --instance <path> --judge-scores <path>` first.");
    return;
  }
  const byJudge = new Map();
  let skipped = 0;
  for (const f of readdirSync(resultsDir2).filter((name) => name.endsWith(".json"))) {
    let rec;
    try {
      rec = JSON.parse(readFileSync(join(resultsDir2, f), "utf8"));
    } catch {
      skipped++;
      continue;
    }
    // JSON.parse happily returns null, numbers or arrays; only result objects
    // with a usable agreement value can be averaged.
    if (!rec || typeof rec !== "object" || !Number.isFinite(rec.agreement)) {
      skipped++;
      continue;
    }
    // Records without a judge name share a single "unknown" row.
    const judgeKey = String(rec.judgeModel || "unknown");
    if (!byJudge.has(judgeKey)) byJudge.set(judgeKey, []);
    byJudge.get(judgeKey).push(rec);
  }
  if (skipped) console.error(`warning: ${skipped} calibration result file(s) skipped (unreadable or malformed).`);

  console.log("AgentRig — judge calibration report\n");
  if (byJudge.size === 0) {
    console.log(" No calibration results yet.");
    return;
  }
  console.log(` ${"judge".padEnd(28)} ${"n".padStart(3)} ${"agree%".padStart(7)} ${"bias".padStart(7)}`);
  for (const [judge, recs] of byJudge) {
    const meanAgree = recs.reduce((s, r) => s + r.agreement, 0) / recs.length;
    // A bias that was NaN when saved comes back from JSON as null; count it as 0.
    const meanBias = recs.reduce((s, r) => s + (Number.isFinite(r.bias) ? r.bias : 0), 0) / recs.length;
    const flag = meanAgree < 0.8 ? " (below 80% threshold)" : "";
    console.log(` ${judge.padEnd(28)} ${String(recs.length).padStart(3)} ${(meanAgree * 100).toFixed(1).padStart(6)}% ${meanBias.toFixed(3).padStart(7)}${flag}`);
  }
}
|
|
381
|
+
|
|
382
|
+
/**
 * Minimal YAML reader for the calibration file shape: top-level `key: value` scalars
 * plus a `ground_truth:` list whose items are single-line flow mappings such as
 * `- { axis: correctness, score: 0.5, code: X, evidence: "ran lint, skipped tests" }`.
 * Hand-rolled so the installed score.mjs needs no `yaml` dependency.
 *
 * This is not a general YAML parser. Inside a flow mapping:
 *   - commas inside a quoted value do not split the pair;
 *   - a value wrapped in matching single or double quotes is unquoted
 *     (escape sequences are left as written);
 *   - numeric-looking values become numbers.
 * Blank lines and `#` comment lines are ignored, so a comment between list items
 * does not end the `ground_truth` block.
 *
 * @param {string} text Raw file contents.
 * @returns {object} Top-level scalars plus `ground_truth`, an array of plain objects.
 */
function parseCalibYaml(text) {
  const NUMERIC = /^-?\d+(\.\d+)?$/;

  // Split a flow-mapping body on commas that are not inside a quoted value.
  // A quote only opens a string at the start of a value (right after `key:`),
  // so apostrophes inside bare words are left alone.
  const splitPairs = (body) => {
    const parts = [];
    let current = "";
    let quote = null;
    let escaped = false;
    for (const ch of body) {
      if (quote) {
        current += ch;
        if (escaped) escaped = false;
        else if (ch === "\\" && quote === '"') escaped = true;
        else if (ch === quote) quote = null;
        continue;
      }
      if ((ch === '"' || ch === "'") && /:\s*$/.test(current)) {
        quote = ch;
        current += ch;
        continue;
      }
      if (ch === ",") {
        parts.push(current);
        current = "";
        continue;
      }
      current += ch;
    }
    parts.push(current);
    return parts;
  };

  // Convert one flow-mapping value: number, unquoted string, or the bare text.
  const coerce = (rawValue) => {
    const v = rawValue.trim();
    if (NUMERIC.test(v)) return Number(v);
    const first = v[0];
    if (v.length >= 2 && (first === '"' || first === "'") && v[v.length - 1] === first) return v.slice(1, -1);
    // Unterminated double quote: drop the outer characters, as earlier versions did.
    if (first === '"') return v.slice(1, -1);
    return v;
  };

  const out = { ground_truth: [] };
  let inGT = false;
  for (const raw of text.split(/\r?\n/)) {
    // Blank and comment-only lines carry no data and must not close the list.
    if (/^\s*(#.*)?$/.test(raw)) continue;
    if (/^ground_truth:\s*(#.*)?$/.test(raw)) { inGT = true; continue; }
    if (inGT && /^\s*-\s*\{/.test(raw)) {
      const body = raw.replace(/^\s*-\s*\{/, "").replace(/\}\s*$/, "");
      const kv = {};
      for (const pair of splitPairs(body)) {
        const m = pair.trim().match(/^(\w+):\s*(.*)$/);
        if (!m) continue;
        kv[m[1]] = coerce(m[2]);
      }
      out.ground_truth.push(kv);
      continue;
    }
    // Any top-level (non-indented) key exits the ground_truth block.
    if (/^\S/.test(raw)) inGT = false;
    const m = raw.match(/^(\w+):\s*(.+?)\s*$/);
    if (m && !inGT) out[m[1]] = NUMERIC.test(m[2]) ? Number(m[2]) : m[2].replace(/^["']|["']$/g, "");
  }
  return out;
}
|
|
411
|
+
|
|
186
412
|
// --- helpers ---------------------------------------------------------------
|
|
187
413
|
function round(n) {
|
|
188
414
|
return Math.round(n * 10000) / 10000;
|
|
@@ -191,67 +417,167 @@ function round(n) {
|
|
|
191
417
|
/**
 * Read every `*.json` file in the results directory and return the usable records.
 *
 * A file is dropped, with a warning on stderr, when it is not valid JSON or when
 * `validateRecord` reports a problem with its shape. A summary line with the
 * number of dropped files is printed at the end if any were dropped.
 *
 * @returns {Array<object>} Parsed records that passed validation; an empty array
 *   when the results directory does not exist.
 */
function loadRecords() {
  if (!existsSync(resultsDir)) return [];

  const records = [];
  let rejected = 0;
  const jsonFiles = readdirSync(resultsDir).filter((name) => name.endsWith(".json"));

  for (const name of jsonFiles) {
    let parsed;
    try {
      parsed = JSON.parse(readFileSync(join(resultsDir, name), "utf8"));
    } catch {
      console.error(`warning: skipping corrupt result file ${name}`);
      rejected += 1;
      continue;
    }

    const problem = validateRecord(parsed);
    if (!problem) {
      records.push(parsed);
      continue;
    }
    console.error(`warning: skipping ${name} (${problem}) — move to results/_legacy/ to silence`);
    rejected += 1;
  }

  if (rejected) {
    console.error(`warning: ${rejected} result file(s) skipped due to invalid shape.`);
  }
  return records;
}
|
|
203
441
|
|
|
442
|
+
/**
 * Minimal shape check for v2 result records.
 *
 * Besides the schema version, type, scenario and axes, this also checks the two
 * fields the reporting code dereferences without guarding: `timestamp` (sorted
 * with `localeCompare`) and `aggregate` (formatted with `toFixed`). A record
 * missing either is rejected here rather than crashing a report later.
 *
 * @param {*} r Parsed JSON value to check.
 * @returns {string|null} A human-readable reason when the record is invalid,
 *   `null` when it is usable.
 */
function validateRecord(r) {
  if (!r || typeof r !== "object") return "not an object";
  if (r.schemaVersion !== 2) return `schemaVersion=${r.schemaVersion ?? "missing"} (expected 2)`;
  if (typeof r.type !== "string") return "missing type";
  if (typeof r.scenario !== "string") return "missing scenario";
  if (typeof r.timestamp !== "string") return "missing timestamp";
  if (!Number.isFinite(r.aggregate)) return "missing numeric aggregate";
  if (!Array.isArray(r.axes)) return "axes is not an array";
  for (const a of r.axes) {
    if (!a || typeof a !== "object") return "axis is not an object";
    if (typeof a.name !== "string") return "axis missing name";
    // Number.isFinite also rejects non-numbers, NaN and ±Infinity.
    if (!Number.isFinite(a.score)) return `axis "${a.name}" missing numeric score`;
    if (!Number.isFinite(a.confidence)) return `axis "${a.name}" missing numeric confidence`;
  }
  return null;
}
|
|
457
|
+
|
|
204
458
|
/**
 * Compare the variants recorded for one scenario and, when a baseline variant is
 * named, estimate the lift of every other variant over it. Prints a table (or
 * JSON when `asJson` is set) and then exits the process with status 0.
 *
 * All trials of a variant are kept, ordered by timestamp. Per variant the summary
 * holds the trial count, mean and sample standard deviation of the aggregate, and
 * the pass rate. For the lift, trial i of a variant is paired with trial i of the
 * baseline (up to the shorter of the two). The verdict combines a two-sided
 * binomial sign test over the paired deltas with the median delta.
 *
 * @param {Array<object>} records Result records (all scenarios; filtered here).
 * @param {string} scenario Scenario id to compare; required.
 * @param {boolean} asJson Emit machine-readable JSON instead of the table.
 * @param {string} [baseline] Variant to treat as the baseline for the lift section.
 * @returns {never} Always terminates via `process.exit(0)`.
 */
function compare(records, scenario, asJson, baseline) {
  if (!scenario) fail("compare requires --scenario <id>");

  // True median: the mean of the two middle values for an even count. Taking
  // sorted[floor(n/2)] alone picks the upper-middle element and biases upward.
  const medianOf = (values) => {
    const sorted = values.slice().sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
  };

  const forScenario = records.filter((r) => r.scenario === scenario);

  // Group by variant; keep ALL trials, not just the latest. This is the spine of P4.
  const trialsByVariant = new Map();
  // String(... ?? "") keeps a record without a timestamp from throwing in the comparator.
  const chronological = forScenario.sort((a, b) => String(a.timestamp ?? "").localeCompare(String(b.timestamp ?? "")));
  for (const r of chronological) {
    const v = r.variant || "base";
    if (!trialsByVariant.has(v)) trialsByVariant.set(v, []);
    trialsByVariant.get(v).push(r);
  }

  // Per-variant summary: n trials, mean ± stdev of aggregate, pass-rate.
  const variantSummaries = [];
  for (const [variant, trials] of trialsByVariant) {
    const aggs = trials.map((t) => t.aggregate);
    const mean = aggs.reduce((s, x) => s + x, 0) / aggs.length;
    // Sample variance (n - 1 denominator); defined as 0 for a single trial.
    const variance = aggs.length > 1 ? aggs.reduce((s, x) => s + (x - mean) ** 2, 0) / (aggs.length - 1) : 0;
    const stdev = Math.sqrt(variance);
    const passRate = trials.filter((t) => t.pass).length / trials.length;
    variantSummaries.push({
      variant,
      n: trials.length,
      meanAggregate: round(mean),
      stdevAggregate: round(stdev),
      passRate: round(passRate),
      judge: trials[trials.length - 1].judge,
    });
  }

  // Harness-lift mode: paired sign test of every other variant vs the baseline.
  let lift = null;
  if (baseline) {
    const baseTrials = trialsByVariant.get(baseline);
    if (!baseTrials) fail(`no results for baseline variant "${baseline}" on scenario "${scenario}"`);
    lift = [];
    for (const [variant, trials] of trialsByVariant) {
      if (variant === baseline) continue;
      // Pair trial i of variant with trial i of baseline. If trial counts differ, pair what we can.
      const paired = Math.min(trials.length, baseTrials.length);
      if (paired === 0) continue;
      const deltas = [];
      for (let i = 0; i < paired; i++) deltas.push(trials[i].aggregate - baseTrials[i].aggregate);
      const median = medianOf(deltas);

      // Binomial sign test: under H0 (no effect), wins ~ Binomial(n, 0.5).
      // Ties carry no sign information and are excluded from n.
      const wins = deltas.filter((d) => d > 0).length;
      const losses = deltas.filter((d) => d < 0).length;
      const ties = deltas.filter((d) => d === 0).length;
      const nNonTie = wins + losses;
      const pValue = signTestPValue(Math.max(wins, losses), nNonTie);
      const verdict = nNonTie < 3
        ? "INCONCLUSIVE (n<3, need more trials)"
        : pValue >= 0.05
          ? "INCONCLUSIVE (p>=0.05)"
          : Math.abs(median) < 0.05
            ? "INCONCLUSIVE (effect <0.05)"
            : median > 0 ? "HELPS" : "HURTS";

      // Per-axis median delta across paired trials. Candidate names come from every
      // paired baseline trial; a pair contributes only when both sides observed the axis.
      const axisDelta = {};
      const baselineAxisNames = new Set(
        baseTrials.slice(0, paired).flatMap((t) => t.axes.map((a) => a.name)),
      );
      for (const axis of baselineAxisNames) {
        const ds = [];
        for (let i = 0; i < paired; i++) {
          const ba = baseTrials[i].axes.find((a) => a.name === axis && a.confidence > 0);
          const va = trials[i].axes.find((a) => a.name === axis && a.confidence > 0);
          if (ba && va) ds.push(va.score - ba.score);
        }
        if (ds.length === 0) continue;
        axisDelta[axis] = round(medianOf(ds));
      }
      lift.push({ variant, n: paired, medianDelta: round(median), wins, losses, ties, pValue: round(pValue), verdict, axisDelta });
    }
  }

  if (asJson) {
    console.log(JSON.stringify({
      scenario,
      variants: variantSummaries,
      ...(lift ? { baseline, lift } : {}),
    }, null, 2));
    process.exit(0);
  }

  console.log(`AgentRig — variant comparison for "${scenario}"\n`);
  if (variantSummaries.length === 0) {
    console.log(" No results for that scenario.");
  } else {
    console.log(` ${"variant".padEnd(12)} ${"n".padStart(3)} ${"mean".padStart(6)} ${"stdev".padStart(6)} ${"pass%".padStart(6)} judge`);
    for (const s of variantSummaries) {
      console.log(` ${s.variant.padEnd(12)} ${String(s.n).padStart(3)} ${s.meanAggregate.toFixed(3).padStart(6)} ${s.stdevAggregate.toFixed(3).padStart(6)} ${(s.passRate * 100).toFixed(0).padStart(5)}% ${s.judge}`);
    }
  }
  if (lift) {
    console.log(`\n Harness lift vs baseline "${baseline}" (paired sign test):`);
    for (const l of lift) {
      const sign = l.medianDelta > 0 ? "+" : "";
      console.log(` ${l.variant.padEnd(12)} n=${l.n} median Δ ${sign}${l.medianDelta.toFixed(3)} wins/losses/ties ${l.wins}/${l.losses}/${l.ties} p=${l.pValue.toFixed(3)} → ${l.verdict}`);
      for (const [name, d] of Object.entries(l.axisDelta)) {
        if (d !== 0) console.log(` ${name.padEnd(22)} median Δ ${d > 0 ? "+" : ""}${d.toFixed(3)}`);
      }
    }
  }
  process.exit(0);
}
|
|
565
|
+
|
|
566
|
+
/**
 * Two-sided p-value of a binomial sign test with success probability 0.5.
 *
 * The larger of `k` and `n - k` is taken as the start of the upper tail; the
 * binomial probabilities from there up to `n` are summed and doubled. The result
 * is capped at 1 because the doubled tail exceeds 1 when it starts at or below
 * the midpoint.
 *
 * @param {number} k Count on one side (e.g. number of wins).
 * @param {number} n Number of non-tied pairs.
 * @returns {number} The two-sided p-value; 1 when `n` is 0.
 */
function signTestPValue(k, n) {
  if (n === 0) {
    return 1;
  }
  const tailStart = Math.max(k, n - k);
  let tailMass = 0;
  for (let x = tailStart; x <= n; x += 1) {
    tailMass += binomCoeff(n, x) * Math.pow(0.5, n);
  }
  const doubled = tailMass * 2;
  return Math.min(1, doubled);
}
|
|
576
|
+
/**
 * Binomial coefficient "n choose k", built up multiplicatively.
 *
 * The loop runs over the smaller of `k` and `n - k`, multiplying by (n - step)
 * and dividing by (step + 1) at each step.
 *
 * @param {number} n Total number of items.
 * @param {number} k Number chosen.
 * @returns {number} C(n, k); 0 when `k` is negative or greater than `n`.
 */
function binomCoeff(n, k) {
  const outOfRange = k < 0 || k > n;
  if (outOfRange) {
    return 0;
  }
  if (k === 0 || k === n) {
    return 1;
  }
  const steps = Math.min(k, n - k);
  let coeff = 1;
  let step = 0;
  while (step < steps) {
    coeff = (coeff * (n - step)) / (step + 1);
    step += 1;
  }
  return coeff;
}
|