@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +62 -27
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +30 -5
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/doctor.js +53 -8
  7. package/dist/commands/doctor.js.map +1 -1
  8. package/dist/commands/eval-dynamic.js +316 -0
  9. package/dist/commands/eval-dynamic.js.map +1 -0
  10. package/dist/commands/eval-scaffold.js +173 -0
  11. package/dist/commands/eval-scaffold.js.map +1 -0
  12. package/dist/commands/eval.js +184 -55
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/core/audit.js +237 -9
  15. package/dist/core/audit.js.map +1 -1
  16. package/dist/core/model-family.js +31 -0
  17. package/dist/core/model-family.js.map +1 -0
  18. package/dist/core/scenario-runner.js +298 -0
  19. package/dist/core/scenario-runner.js.map +1 -0
  20. package/dist/prompts/index.js +121 -30
  21. package/dist/prompts/index.js.map +1 -1
  22. package/knowledge/PRINCIPLES.md +2 -2
  23. package/knowledge/manifest.json +16 -1
  24. package/knowledge/templates/AGENTS.md +7 -6
  25. package/knowledge/templates/agents/README.md +4 -4
  26. package/knowledge/templates/agents/developer.yml +1 -1
  27. package/knowledge/templates/agents/judge.yml +1 -1
  28. package/knowledge/templates/agents/reviewer.yml +1 -1
  29. package/knowledge/templates/agents/triager.yml +5 -4
  30. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  31. package/knowledge/templates/eval/RUBRIC.md +87 -64
  32. package/knowledge/templates/eval/axes.json +25 -25
  33. package/knowledge/templates/eval/calibration/README.md +54 -0
  34. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  35. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  36. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  37. package/knowledge/templates/eval/checks.json +88 -11
  38. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  39. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  40. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  41. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  42. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  43. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  44. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  45. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  46. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  47. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  48. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  49. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  50. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  51. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  52. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  53. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  54. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  55. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  56. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  57. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  58. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  59. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  60. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  61. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  62. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  63. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  64. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  65. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  66. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  67. package/knowledge/templates/eval/score.mjs +368 -42
  68. package/knowledge/templates/eval/static-audit.mjs +204 -17
  69. package/knowledge/templates/harness/state-machine.yml +18 -12
  70. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  71. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  72. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  73. package/package.json +4 -3
  74. package/knowledge/templates/eval/scenarios/README.md +0 -24
  75. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  76. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  77. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -22,6 +22,26 @@ const scriptDir = dirname(fileURLToPath(import.meta.url));
22
22
  const resultsDir = join(scriptDir, "results");
23
23
  const axesPath = join(scriptDir, "axes.json");
24
24
 
25
// Inline copy of the family table from src/core/model-family.ts. It is duplicated here so
// this script has no dependencies and still runs in target repos where `npm install` was
// never executed after `agentrig init`.
// Each entry is [family name, pattern tested against the model id]; first match wins.
const FAMILY_PATTERNS = [
  ["anthropic-claude", /^(anthropic[\.\/-])?claude([-_\.]|$)/i],
  ["openai-gpt", /^(openai[\.\/-])?(gpt|o[1-9]|codex|davinci|chatgpt)([-_\.]|$)/i],
  ["google-gemini", /^(google[\.\/-])?(gemini|palm|bard|flash)([-_\.]|$)/i],
  ["mistral", /^(mistral|mixtral|codestral|ministral)([-_\.]|$)/i],
  ["deepseek", /^deepseek([-_\.]|$)/i],
  ["meta-llama", /^(meta[\.\/-])?(llama|code-?llama)([-_\.]|$)/i],
  ["xai-grok", /^(xai[\.\/-])?grok([-_\.]|$)/i],
  ["cohere", /^(cohere[\.\/-])?(command|aya)([-_\.]|$)/i],
  ["qwen", /^qwen([-_\.]|$)/i],
];

/**
 * Map a model id to a coarse family name.
 *
 * @param {string} id - Model identifier, e.g. "claude-3-opus" or "openai/gpt-4o".
 * @returns {string} "" for a falsy id; the first matching family name from
 *   FAMILY_PATTERNS; otherwise "unknown:<leading alphanumeric run, lowercased>", or
 *   "unknown:<id>" when the id does not start with an alphanumeric character.
 */
function modelFamily(id) {
  if (!id) return "";
  const known = FAMILY_PATTERNS.find(([, pattern]) => pattern.test(id));
  if (known) return known[0];
  const leading = id.match(/^([a-z0-9]+)/i);
  if (leading === null) return `unknown:${id}`;
  return `unknown:${leading[1].toLowerCase()}`;
}
44
+
25
45
  function loadRegistry() {
26
46
  if (!existsSync(axesPath)) {
27
47
  console.error(`axes.json not found at ${axesPath}`);
@@ -30,7 +50,9 @@ function loadRegistry() {
30
50
  return JSON.parse(readFileSync(axesPath, "utf8"));
31
51
  }
32
52
 
33
- /** Build axis -> { category, codes } lookup for a rubric type. */
53
+ /** Build axis -> { category, codes, weight, veto } lookup for a rubric type. Supports
54
+ * both legacy schema (axis: [CODE,...]) and v2 schema (axis: { codes:[...], weight, veto }).
55
+ */
34
56
  function axisIndex(registry, type) {
35
57
  const def = registry.types?.[type];
36
58
  if (!def) {
@@ -39,7 +61,12 @@ function axisIndex(registry, type) {
39
61
  }
40
62
  const index = new Map();
41
63
  for (const [category, axes] of Object.entries(def.categories)) {
42
- for (const [axis, codes] of Object.entries(axes)) index.set(axis, { category, codes });
64
+ for (const [axis, spec] of Object.entries(axes)) {
65
+ const meta = Array.isArray(spec)
66
+ ? { category, codes: spec, weight: 1, veto: false }
67
+ : { category, codes: spec.codes || [], weight: spec.weight ?? 1, veto: Boolean(spec.veto) };
68
+ index.set(axis, meta);
69
+ }
43
70
  }
44
71
  return index;
45
72
  }
@@ -93,30 +120,70 @@ if (cmd === "save") {
93
120
  if (!meta.codes.includes(code)) fail(`issue code "${code}" is not valid for axis "${name}". Valid: ${meta.codes.join(", ")}`);
94
121
  if (!evidence) fail(`axis "${name}" scored ${score} < 1.0 but has no evidence — use name=score:CODE:evidence`);
95
122
  }
96
- return { name, category: meta.category, score, issue: code || null, evidence, confidence: 1 };
123
+ return { name, category: meta.category, weight: meta.weight, veto: meta.veto, score, issue: code || null, evidence, confidence: 1 };
97
124
  });
98
125
 
99
- // Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated.
126
+ // Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated + weighted.
100
127
  const observed = axes.filter((a) => a.confidence > 0);
101
128
  const categories = {};
102
- for (const a of observed) (categories[a.category] ||= []).push(a.score);
129
+ for (const a of observed) (categories[a.category] ||= []).push({ score: a.score, weight: a.weight });
103
130
  const categoryScores = Object.fromEntries(
104
- Object.entries(categories).map(([c, xs]) => [c, round(xs.reduce((s, x) => s + x, 0) / xs.length)]),
131
+ Object.entries(categories).map(([c, xs]) => {
132
+ const wSum = xs.reduce((s, x) => s + x.weight, 0);
133
+ const wScore = xs.reduce((s, x) => s + x.weight * x.score, 0);
134
+ return [c, round(wSum ? wScore / wSum : 0)];
135
+ }),
105
136
  );
106
- const aggregate = observed.length ? round(observed.reduce((s, a) => s + a.score, 0) / observed.length) : 0;
107
- const pass = observed.length > 0 && aggregate >= PASS && observed.every((a) => a.score > 0);
137
+ const wSum = observed.reduce((s, a) => s + a.weight, 0);
138
+ const wScore = observed.reduce((s, a) => s + a.weight * a.score, 0);
139
+ const aggregate = wSum ? round(wScore / wSum) : 0;
140
+
141
+ // Pass rule: aggregate clears threshold AND no observed axis is zero AND no veto axis < 1.0.
142
+ // veto axes encode "cosmetics cannot bail out a correctness/gate-compliance regression."
143
+ const vetoFails = observed.filter((a) => a.veto && a.score < 1).map((a) => a.name);
144
+ const hardZeros = observed.filter((a) => a.score === 0).map((a) => a.name);
145
+ let pass = observed.length > 0 && aggregate >= PASS && hardZeros.length === 0 && vetoFails.length === 0;
146
+ const failReason = !observed.length ? "no observed axes"
147
+ : vetoFails.length ? `veto axis fail: ${vetoFails.join(", ")}`
148
+ : hardZeros.length ? `zero score on: ${hardZeros.join(", ")}`
149
+ : aggregate < PASS ? `aggregate ${aggregate.toFixed(2)} < ${PASS}`
150
+ : null;
151
+
152
+ // Producer / judge metadata. Comes from --producer-model / --judge-model flags OR from
153
+ // env vars (the orchestrator sets AGENTRIG_PRODUCER_MODEL / AGENTRIG_JUDGE_MODEL so it
154
+ // doesn't have to thread two more positional args through). Family-divergence is enforced:
155
+ // a result where producer + judge share a family is rejected unless --allow-same-family
156
+ // (or AGENTRIG_ALLOW_SAME_FAMILY=1) is set, and the override gets recorded so reviewers
157
+ // can spot lazy single-model setups.
158
+ const producerModel = getOpt(args, "--producer-model") || process.env.AGENTRIG_PRODUCER_MODEL || "";
159
+ const judgeModel = getOpt(args, "--judge-model") || process.env.AGENTRIG_JUDGE_MODEL || judge;
160
+ const allowSameFamily = args.includes("--allow-same-family") || process.env.AGENTRIG_ALLOW_SAME_FAMILY === "1";
161
+ const trialIndex = getOpt(args, "--trial");
162
+ if (producerModel && judgeModel) {
163
+ if (modelFamily(producerModel) === modelFamily(judgeModel) && !allowSameFamily) {
164
+ fail(`producer "${producerModel}" and judge "${judgeModel}" share family "${modelFamily(producerModel)}". ` +
165
+ `Pass --allow-same-family (or set AGENTRIG_ALLOW_SAME_FAMILY=1) to override; the override will be recorded.`);
166
+ }
167
+ }
108
168
 
109
169
  const record = {
170
+ schemaVersion: 2,
110
171
  type, task, scenario, variant, run, judge,
172
+ producerModel: producerModel || null,
173
+ judgeModel: judgeModel || null,
174
+ producerFamily: producerModel ? modelFamily(producerModel) : null,
175
+ judgeFamily: judgeModel ? modelFamily(judgeModel) : null,
176
+ allowSameFamily,
177
+ trialIndex: trialIndex != null ? Number(trialIndex) : null,
111
178
  timestamp: new Date().toISOString(),
112
- aggregate, pass, categoryScores, axes,
179
+ aggregate, pass, failReason, categoryScores, axes,
113
180
  };
114
181
 
115
182
  if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });
116
183
  const safe = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
117
- const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}.${Date.now()}.json`);
184
+ const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}${trialIndex != null ? "." + safe("trial" + trialIndex) : ""}.${Date.now()}.json`);
118
185
  writeFileSync(file, JSON.stringify(record, null, 2));
119
- console.log(`Saved ${file}\n aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"} (${observed.length}/${axes.length} axes observed)`);
186
+ console.log(`Saved ${file}\n aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"}${failReason ? ` — ${failReason}` : ""} (${observed.length}/${axes.length} axes observed)`);
120
187
  process.exit(0);
121
188
  }
122
189
 
@@ -134,6 +201,8 @@ if (cmd === "report" || cmd === "compare") {
134
201
  let scoped = records;
135
202
  if (filterType) scoped = scoped.filter((r) => r.type === filterType);
136
203
  if (filterVariant) scoped = scoped.filter((r) => (r.variant || "base") === filterVariant);
204
+ const filterRun = getOpt(args, "--run");
205
+ if (filterRun) scoped = scoped.filter((r) => r.run === filterRun);
137
206
 
138
207
  // Latest record per (type, scenario, variant).
139
208
  const latest = new Map();
@@ -160,6 +229,13 @@ if (cmd === "report" || cmd === "compare") {
160
229
  if (rows.length === 0) {
161
230
  console.log(" No results yet. Run `score.mjs save ...` first.");
162
231
  } else {
232
+ const passed = rows.filter((r) => r.pass).length;
233
+ const failed = rows.length - passed;
234
+ console.log(` Summary: ${passed}/${rows.length} scenario${rows.length === 1 ? "" : "s"} PASS, ${failed} FAIL`);
235
+ console.log(` Overall aggregate (mean of per-scenario aggregates): ${overall.toFixed(2)}\n`);
236
+ console.log(" Pass rule: aggregate ≥ 0.8 AND no observed axis = 0 AND no veto axis < 1.0.");
237
+ console.log(" Veto axes per type: run → correctness/gate_compliance; review → finding_correctness/blocking_decision.\n");
238
+
163
239
  const byType = new Map();
164
240
  for (const r of rows) {
165
241
  if (!byType.has(r.type)) byType.set(r.type, []);
@@ -169,20 +245,170 @@ if (cmd === "report" || cmd === "compare") {
169
245
  console.log(` ${type.toUpperCase()}`);
170
246
  for (const r of group) {
171
247
  const v = r.variant ? ` [${r.variant}]` : "";
172
- console.log(` ${r.pass ? "PASS" : "FAIL"} ${(r.scenario + v).padEnd(30)} ${r.aggregate.toFixed(2)} (${r.judge})`);
248
+ const verdict = r.pass ? "PASS" : "FAIL";
249
+ console.log(` ${verdict} ${(r.scenario + v).padEnd(30)} ${r.aggregate.toFixed(2)} ${r.failReason ? `← ${r.failReason}` : ""}`);
250
+ // Print failing axes with their issue code + evidence so the operator can act.
251
+ if (!r.pass) {
252
+ const failing = (r.axes || []).filter((a) => a.confidence > 0 && a.score < 1);
253
+ for (const a of failing) {
254
+ const tag = a.score === 0 ? "0 " : "½ ";
255
+ const code = a.issue ? `[${a.issue}]` : "";
256
+ const ev = (a.evidence || "").slice(0, 110);
257
+ console.log(` ${tag} ${a.name.padEnd(22)} ${code} ${ev}`);
258
+ }
259
+ }
173
260
  }
174
261
  }
175
- console.log("\n Per-axis means (observed only):");
176
- for (const [name, v] of axisAgg) console.log(` ${name.padEnd(22)} ${round(v.sum / v.n).toFixed(2)}`);
177
- console.log(`\n Overall: ${overall.toFixed(2)} across ${rows.length} result(s)`);
262
+ // Per-axis means: explicitly label which are observed across multiple scenarios.
263
+ console.log("\n Per-axis means across all observed scenarios (lower = worse on average):");
264
+ const axisRows = [...axisAgg.entries()].map(([name, v]) => ({ name, mean: round(v.sum / v.n), n: v.n }))
265
+ .sort((a, b) => a.mean - b.mean);
266
+ for (const a of axisRows) {
267
+ const flag = a.mean < 0.5 ? " ← weakest" : a.mean < 0.8 ? " ← weak" : "";
268
+ console.log(` ${a.name.padEnd(22)} ${a.mean.toFixed(2)} (n=${a.n})${flag}`);
269
+ }
270
+
271
+ console.log(`\n How to read this:`);
272
+ console.log(` • A scenario PASS means the harness handled this task well per the rubric.`);
273
+ console.log(` • A scenario FAIL means at least one veto axis dropped below 1.0 OR an observed axis was 0.`);
274
+ console.log(` • The overall aggregate (${overall.toFixed(2)}) is NOT the harness lift — that requires`);
275
+ console.log(` a baseline comparison: \`agentrig eval --dynamic --variant baseline --n 5\` then`);
276
+ console.log(` \`node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline\`.`);
277
+ if (failed > 0) {
278
+ console.log(` • To investigate a FAIL: open \`.agentrig/eval/results/runs/<runId>/<scenario>.trial0.diff.patch\``);
279
+ console.log(` and \`<scenario>.trial0.judge.json\` to see exactly what the producer did and what the judge saw.`);
280
+ }
178
281
  }
179
282
  }
180
283
  process.exit(0);
181
284
  }
182
285
 
183
- console.error("Usage: score.mjs <save|report|compare> ...");
286
if (cmd === "calibrate") {
  // Scoring half of judge calibration against the hand-labeled set in eval/calibration/.
  // This branch never invokes a judge itself: it compares an already-captured
  // judge_scores.json (--judge-scores) with an instance's ground truth (--instance).
  // Per the original note, running the judge and capturing its output is done CLI-side
  // (`agentrig doctor`).
  //
  // Usage:
  //   node score.mjs calibrate --instance <path-to-instance.yml> --judge-scores <path.json>
  //   node score.mjs calibrate --report   # roll up cached results in calibration/results/
  const calibrationRoot = join(scriptDir, "calibration");
  const wantsReport = args.includes("--report");
  if (wantsReport) {
    runCalibrateReport(calibrationRoot);
    process.exit(0);
  }

  const instanceFile = getOpt(args, "--instance");
  const scoresFile = getOpt(args, "--judge-scores");
  const judgeName = getOpt(args, "--judge") || "unknown";
  const haveBothPaths = Boolean(instanceFile) && Boolean(scoresFile);
  if (!haveBothPaths) fail("calibrate requires --instance <path.yml> and --judge-scores <path.json> (or --report)");

  const outcome = runCalibrateOne(instanceFile, scoresFile, judgeName);

  // Persist one JSON file per (judge, instance, time) so `--report` can aggregate later.
  const outDir = join(calibrationRoot, "results");
  if (!existsSync(outDir)) mkdirSync(outDir, { recursive: true });
  const sanitize = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
  const outFile = join(outDir, `${sanitize(judgeName)}.${sanitize(outcome.instanceId)}.${Date.now()}.json`);
  writeFileSync(outFile, JSON.stringify(outcome, null, 2));

  console.log(`Saved ${outFile}`);
  console.log(` ${outcome.instanceId}: agreement=${(outcome.agreement * 100).toFixed(1)}% (${outcome.matches}/${outcome.compared}) bias=${outcome.bias.toFixed(3)}`);
  process.exit(0);
}
315
+
316
+ console.error("Usage: score.mjs <save|report|compare|calibrate> ...");
184
317
  process.exit(2);
185
318
 
319
+ // --- calibration helpers ---------------------------------------------------
320
/**
 * Score one judge output against one hand-labeled calibration instance.
 *
 * The instance file is read with the inline `parseCalibYaml` reader (no YAML dependency);
 * the judge scores file is plain JSON with an `axes` array of `{ name, score, confidence? }`.
 *
 * @param {string} instancePath - Path to the calibration instance (.yml) with `ground_truth`.
 * @param {string} judgeScoresPath - Path to the judge's scores (.json).
 * @param {string} judgeModel - Judge label stored in the result.
 * @returns {{instanceId: string, judgeModel: string, compared: number, matches: number,
 *   agreement: number, bias: number, axes: Array<object>}} Per-axis comparison plus rollups.
 *   `agreement` is the share of ground-truth axes where the judge is within 0.5 of the
 *   label; `bias` is the mean signed (judge - truth) difference over comparable axes.
 */
function runCalibrateOne(instancePath, judgeScoresPath, judgeModel) {
  if (!existsSync(instancePath)) fail(`instance not found: ${instancePath}`);
  if (!existsSync(judgeScoresPath)) fail(`judge scores not found: ${judgeScoresPath}`);

  const truth = parseCalibYaml(readFileSync(instancePath, "utf8"));

  // Report malformed judge output through the script's normal failure path instead of
  // letting a raw SyntaxError escape.
  let judge;
  try {
    judge = JSON.parse(readFileSync(judgeScoresPath, "utf8"));
  } catch (err) {
    fail(`judge scores are not valid JSON (${judgeScoresPath}): ${err.message}`);
  }
  const judgeAxisList = Array.isArray(judge?.axes) ? judge.axes : [];
  const judgeAxes = new Map(judgeAxisList.map((a) => [a.name, a]));

  const compared = [];
  for (const t of truth.ground_truth || []) {
    const j = judgeAxes.get(t.axis);
    if (!j) {
      // Judge did not report this axis at all: counts as a disagreement, excluded from bias.
      compared.push({ axis: t.axis, truth: t.score, judge: null, diff: null, within: false });
      continue;
    }
    if ((t.confidence ?? 1) === 0 && (j.confidence ?? 1) === 0) {
      // Both sides say "not observed": that is agreement regardless of the score fields.
      compared.push({ axis: t.axis, truth: 0, judge: 0, diff: 0, within: true });
      continue;
    }
    const diff = j.score - t.score;
    if (!Number.isFinite(diff)) {
      // A missing or non-numeric score on either side (e.g. a label that only carries
      // `confidence: 0`) cannot be compared. Record a disagreement with diff=null so the
      // NaN does not leak into `bias`.
      compared.push({ axis: t.axis, truth: t.score ?? null, judge: j.score ?? null, diff: null, within: false });
      continue;
    }
    compared.push({ axis: t.axis, truth: t.score, judge: j.score, diff, within: Math.abs(diff) <= 0.5 });
  }

  const matches = compared.filter((c) => c.within).length;
  const agreement = compared.length ? matches / compared.length : 0;
  const signedDiffs = compared.filter((c) => c.diff != null).map((c) => c.diff);
  const bias = signedDiffs.length ? signedDiffs.reduce((s, x) => s + x, 0) / signedDiffs.length : 0;
  return {
    instanceId: truth.id || "unknown",
    judgeModel,
    compared: compared.length,
    matches,
    agreement: round(agreement),
    bias: round(bias),
    axes: compared,
  };
}
356
+
357
/**
 * Print a per-judge rollup of the calibration results stored under `<calibDir>/results`.
 *
 * Every readable `*.json` file is grouped by its `judgeModel`; for each judge the mean
 * `agreement` and mean `bias` are printed, and judges whose mean agreement is under 0.8
 * are flagged. Files that fail to parse as JSON are skipped silently.
 *
 * @param {string} calibDir - Calibration directory that contains the `results` folder.
 * @returns {void}
 */
function runCalibrateReport(calibDir) {
  const dir = join(calibDir, "results");
  if (!existsSync(dir)) {
    console.log("No calibration results yet. Run `score.mjs calibrate --instance <path> --judge-scores <path>` first.");
    return;
  }

  const grouped = new Map();
  const jsonFiles = readdirSync(dir).filter((name) => name.endsWith(".json"));
  for (const name of jsonFiles) {
    let record;
    try {
      record = JSON.parse(readFileSync(join(dir, name), "utf8"));
    } catch {
      continue;
    }
    const bucket = grouped.get(record.judgeModel);
    if (bucket) bucket.push(record);
    else grouped.set(record.judgeModel, [record]);
  }

  console.log("AgentRig — judge calibration report\n");
  if (grouped.size === 0) {
    console.log(" No calibration results yet.");
    return;
  }

  console.log(` ${"judge".padEnd(28)} ${"n".padStart(3)} ${"agree%".padStart(7)} ${"bias".padStart(7)}`);
  grouped.forEach((recs, judge) => {
    const count = recs.length;
    const meanAgree = recs.reduce((s, r) => s + r.agreement, 0) / count;
    const meanBias = recs.reduce((s, r) => s + r.bias, 0) / count;
    const flag = meanAgree < 0.8 ? " (below 80% threshold)" : "";
    console.log(` ${(judge || "unknown").padEnd(28)} ${String(count).padStart(3)} ${(meanAgree * 100).toFixed(1).padStart(6)}% ${meanBias.toFixed(3).padStart(7)}${flag}`);
  });
}
381
+
382
/**
 * Minimal YAML reader for the calibration file shape: top-level `key: scalar` lines plus a
 * `ground_truth:` list whose items are one-line flow mappings such as
 * `- { axis: correctness, score: 0.5, code: X, evidence: "ran tests, skipped lint" }`.
 * Kept inline (no `yaml` dependency) so the installed score.mjs stays self-contained.
 *
 * Commas inside a quoted value do not split the mapping, and both single- and
 * double-quoted values are unquoted. Not supported: escape sequences inside quotes,
 * block-style list items, nested mappings, trailing `#` comments.
 *
 * @param {string} text - Raw file contents.
 * @returns {{ground_truth: Array<Object<string, string|number>>}} Parsed top-level scalars
 *   as own properties, plus `ground_truth` (empty array when the list is absent).
 */
function parseCalibYaml(text) {
  const NUMERIC = /^-?\d+(\.\d+)?$/;

  // Convert a raw scalar: plain numbers become Number, matching surrounding quotes are stripped.
  const toScalar = (rawValue) => {
    const v = rawValue.trim();
    if (NUMERIC.test(v)) return Number(v);
    const first = v[0];
    if ((first === '"' || first === "'") && v.length >= 2 && v.endsWith(first)) return v.slice(1, -1);
    return v;
  };

  // Split a flow-mapping body on commas that are outside quoted values. A quote only opens
  // when it starts a value (directly after `key:`), so an apostrophe inside an unquoted
  // value such as `didn't` does not swallow the rest of the line.
  const splitFlowPairs = (body) => {
    const parts = [];
    let current = "";
    let quote = null;
    for (const ch of body) {
      if (quote) {
        if (ch === quote) quote = null;
      } else if ((ch === '"' || ch === "'") && /:\s*$/.test(current)) {
        quote = ch;
      } else if (ch === ",") {
        parts.push(current);
        current = "";
        continue;
      }
      current += ch;
    }
    parts.push(current);
    return parts;
  };

  const out = { ground_truth: [] };
  let inGT = false;
  for (const raw of text.split(/\r?\n/)) {
    if (/^ground_truth:\s*$/.test(raw)) { inGT = true; continue; }
    if (inGT && /^\s*-\s*\{/.test(raw)) {
      const body = raw.replace(/^\s*-\s*\{/, "").replace(/\}\s*$/, "");
      const kv = {};
      for (const pair of splitFlowPairs(body)) {
        const m = pair.trim().match(/^(\w+):\s*(.*)$/);
        if (!m) continue;
        kv[m[1]] = toScalar(m[2]);
      }
      out.ground_truth.push(kv);
      continue;
    }
    // Any top-level (non-indented) key exits the ground_truth block.
    if (/^\S/.test(raw)) inGT = false;
    const m = raw.match(/^(\w+):\s*(.+?)\s*$/);
    if (m && !inGT) out[m[1]] = toScalar(m[2]);
  }
  return out;
}
411
+
186
412
  // --- helpers ---------------------------------------------------------------
187
413
  function round(n) {
188
414
  return Math.round(n * 10000) / 10000;
@@ -191,67 +417,167 @@ function round(n) {
191
417
/**
 * Load every usable result record from the results directory.
 *
 * A `*.json` file is dropped, with a warning on stderr, when it cannot be parsed as JSON or
 * when `validateRecord` returns a reason for it. A final warning reports how many files
 * were dropped in total.
 *
 * @returns {Array<object>} Accepted records; empty when the results directory is missing.
 */
function loadRecords() {
  const accepted = [];
  if (!existsSync(resultsDir)) return accepted;

  let rejected = 0;
  const jsonFiles = readdirSync(resultsDir).filter((name) => name.endsWith(".json"));
  for (const name of jsonFiles) {
    let parsed;
    try {
      parsed = JSON.parse(readFileSync(join(resultsDir, name), "utf8"));
    } catch {
      console.error(`warning: skipping corrupt result file ${name}`);
      rejected += 1;
      continue;
    }

    const problem = validateRecord(parsed);
    if (!problem) {
      accepted.push(parsed);
    } else {
      console.error(`warning: skipping ${name} (${problem}) — move to results/_legacy/ to silence`);
      rejected += 1;
    }
  }

  if (rejected) console.error(`warning: ${rejected} result file(s) skipped due to invalid shape.`);
  return accepted;
}
203
441
 
442
/**
 * Minimal shape check for v2 result records.
 *
 * Besides the identifying fields and the per-axis fields, this also requires `aggregate`
 * (finite number) and `timestamp` (string): the report calls `aggregate.toFixed(...)` and
 * `compare` sorts with `timestamp.localeCompare(...)`, so a record lacking either would
 * crash those commands instead of being skipped with a warning.
 *
 * @param {*} r - Parsed JSON value from a result file.
 * @returns {string|null} Human-readable reason when the record is invalid, null when OK.
 */
function validateRecord(r) {
  if (!r || typeof r !== "object") return "not an object";
  if (r.schemaVersion !== 2) return `schemaVersion=${r.schemaVersion ?? "missing"} (expected 2)`;
  if (typeof r.type !== "string") return "missing type";
  if (typeof r.scenario !== "string") return "missing scenario";
  if (typeof r.timestamp !== "string") return "missing timestamp";
  if (typeof r.aggregate !== "number" || !Number.isFinite(r.aggregate)) return "missing numeric aggregate";
  if (!Array.isArray(r.axes)) return "axes is not an array";
  for (const a of r.axes) {
    if (!a || typeof a !== "object") return "axis is not an object";
    if (typeof a.name !== "string") return "axis missing name";
    if (typeof a.score !== "number") return `axis "${a.name}" missing numeric score`;
    if (typeof a.confidence !== "number") return `axis "${a.name}" missing numeric confidence`;
  }
  return null;
}
457
+
204
458
/**
 * Compare all variants recorded for one scenario and, when a baseline variant is named,
 * estimate each other variant's lift over it with a paired sign test. Prints either JSON
 * or a text table, then exits the process with code 0.
 *
 * @param {Array<object>} records - Validated result records (see validateRecord).
 * @param {string} scenario - Scenario id to compare; required.
 * @param {boolean} asJson - Emit machine-readable JSON instead of the text table.
 * @param {string} [baseline] - Variant name to treat as the baseline for lift.
 */
function compare(records, scenario, asJson, baseline) {
  if (!scenario) fail("compare requires --scenario <id>");

  // A two-sided sign test cannot reach p < 0.05 with fewer than 6 non-tied pairs:
  // the best case is 2 * 0.5^n, which is 0.0625 at n=5 and 0.03125 at n=6.
  const MIN_DECISIVE_PAIRS = 6;

  // True median: the mean of the two middle values for an even-sized sample.
  const medianOf = (values) => {
    const sorted = [...values].sort((a, b) => a - b);
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
  };

  // The axis entry of a trial, only if it was actually observed (confidence > 0).
  const observedAxis = (trial, name) => (trial.axes || []).find((a) => a.name === name && a.confidence > 0);

  const forScenario = records.filter((r) => r.scenario === scenario);

  // Group by variant, oldest first; keep ALL trials, not just the latest.
  // String() keeps the sort from throwing on a record without a timestamp.
  const byTimestamp = (a, b) => String(a.timestamp ?? "").localeCompare(String(b.timestamp ?? ""));
  const trialsByVariant = new Map();
  for (const r of forScenario.sort(byTimestamp)) {
    const v = r.variant || "base";
    if (!trialsByVariant.has(v)) trialsByVariant.set(v, []);
    trialsByVariant.get(v).push(r);
  }

  // Per-variant summary: n trials, mean and sample stdev of aggregate, pass-rate.
  const variantSummaries = [];
  for (const [variant, trials] of trialsByVariant) {
    const aggs = trials.map((t) => t.aggregate);
    const mean = aggs.reduce((s, x) => s + x, 0) / aggs.length;
    const variance = aggs.length > 1 ? aggs.reduce((s, x) => s + (x - mean) ** 2, 0) / (aggs.length - 1) : 0;
    const passRate = trials.filter((t) => t.pass).length / trials.length;
    variantSummaries.push({
      variant,
      n: trials.length,
      meanAggregate: round(mean),
      stdevAggregate: round(Math.sqrt(variance)),
      passRate: round(passRate),
      judge: trials[trials.length - 1].judge,
    });
  }

  // Harness-lift mode: paired sign test of every other variant vs the baseline.
  let lift = null;
  if (baseline) {
    const baseTrials = trialsByVariant.get(baseline);
    if (!baseTrials) fail(`no results for baseline variant "${baseline}" on scenario "${scenario}"`);
    lift = [];
    for (const [variant, trials] of trialsByVariant) {
      if (variant === baseline) continue;
      // Pair the i-th oldest trial of the variant with the i-th oldest baseline trial.
      // If trial counts differ, pair what we can.
      const paired = Math.min(trials.length, baseTrials.length);
      if (paired === 0) continue;
      const deltas = [];
      for (let i = 0; i < paired; i++) deltas.push(trials[i].aggregate - baseTrials[i].aggregate);
      const medianDelta = medianOf(deltas);

      // Binomial sign test: under H0 (no effect), wins ~ Binomial(nNonTie, 0.5). Ties carry
      // no sign information and are excluded from the test.
      const wins = deltas.filter((d) => d > 0).length;
      const losses = deltas.filter((d) => d < 0).length;
      const ties = deltas.filter((d) => d === 0).length;
      const nNonTie = wins + losses;
      const pValue = signTestPValue(Math.max(wins, losses), nNonTie);
      const verdict = nNonTie < MIN_DECISIVE_PAIRS
        ? `INCONCLUSIVE (<${MIN_DECISIVE_PAIRS} non-tied pairs, need more trials)`
        : pValue >= 0.05
          ? "INCONCLUSIVE (p>=0.05)"
          : Math.abs(medianDelta) < 0.05
            ? "INCONCLUSIVE (effect <0.05)"
            : medianDelta > 0 ? "HELPS" : "HURTS";

      // Per-axis median delta across paired trials. Axis names come from the first baseline
      // trial; a pair contributes only when the axis was observed on both sides.
      const axisDelta = {};
      const axisNames = new Set((baseTrials[0].axes || []).map((a) => a.name));
      for (const axis of axisNames) {
        const ds = [];
        for (let i = 0; i < paired; i++) {
          const ba = observedAxis(baseTrials[i], axis);
          const va = observedAxis(trials[i], axis);
          if (ba && va) ds.push(va.score - ba.score);
        }
        if (ds.length === 0) continue;
        axisDelta[axis] = round(medianOf(ds));
      }
      lift.push({ variant, n: paired, medianDelta: round(medianDelta), wins, losses, ties, pValue: round(pValue), verdict, axisDelta });
    }
  }

  if (asJson) {
    console.log(JSON.stringify({
      scenario,
      variants: variantSummaries,
      ...(lift ? { baseline, lift } : {}),
    }, null, 2));
    process.exit(0);
  }

  console.log(`AgentRig — variant comparison for "${scenario}"\n`);
  if (variantSummaries.length === 0) {
    console.log(" No results for that scenario.");
  } else {
    console.log(` ${"variant".padEnd(12)} ${"n".padStart(3)} ${"mean".padStart(6)} ${"stdev".padStart(6)} ${"pass%".padStart(6)} judge`);
    for (const s of variantSummaries) {
      console.log(` ${s.variant.padEnd(12)} ${String(s.n).padStart(3)} ${s.meanAggregate.toFixed(3).padStart(6)} ${s.stdevAggregate.toFixed(3).padStart(6)} ${(s.passRate * 100).toFixed(0).padStart(5)}% ${s.judge}`);
    }
  }
  if (lift) {
    console.log(`\n Harness lift vs baseline "${baseline}" (paired sign test):`);
    for (const l of lift) {
      const sign = l.medianDelta > 0 ? "+" : "";
      console.log(` ${l.variant.padEnd(12)} n=${l.n} median Δ ${sign}${l.medianDelta.toFixed(3)} wins/losses/ties ${l.wins}/${l.losses}/${l.ties} p=${l.pValue.toFixed(3)} → ${l.verdict}`);
      for (const [name, d] of Object.entries(l.axisDelta)) {
        if (d !== 0) console.log(` ${name.padEnd(22)} median Δ ${d > 0 ? "+" : ""}${d.toFixed(3)}`);
      }
    }
  }
  process.exit(0);
}
565
+
566
/**
 * Two-sided binomial sign-test p-value under p = 0.5: P(X >= k or X <= n - k).
 *
 * @param {number} k - Count on one side (wins or losses).
 * @param {number} n - Number of non-tied pairs.
 * @returns {number} p-value in [0, 1]; 1 when n is 0.
 */
function signTestPValue(k, n) {
  if (n === 0) return 1;
  // Work from the larger side so the summed tail is the upper one.
  const start = Math.max(k, n - k);
  const cellProb = Math.pow(0.5, n);
  let tail = 0;
  let i = start;
  while (i <= n) {
    tail += binomCoeff(n, i) * cellProb;
    i += 1;
  }
  // Doubling gives the two-sided value; when k is exactly n/2 the tails overlap, so cap at 1.
  const doubled = tail * 2;
  return doubled > 1 ? 1 : doubled;
}

/**
 * Binomial coefficient C(n, k), built up multiplicatively.
 *
 * @param {number} n - Total count.
 * @param {number} k - Chosen count.
 * @returns {number} C(n, k); 0 when k is outside [0, n].
 */
function binomCoeff(n, k) {
  if (k < 0 || k > n) return 0;
  if (k === 0 || k === n) return 1;
  // C(n, k) === C(n, n - k); use the smaller index to shorten the loop.
  const steps = Math.min(k, n - k);
  let coeff = 1;
  for (let step = 0; step < steps; step += 1) {
    coeff = (coeff * (n - step)) / (step + 1);
  }
  return coeff;
}
+ }