@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +62 -27
  2. package/dist/agent/copilot.js +46 -5
  3. package/dist/agent/copilot.js.map +1 -1
  4. package/dist/cli.js +30 -5
  5. package/dist/cli.js.map +1 -1
  6. package/dist/commands/doctor.js +53 -8
  7. package/dist/commands/doctor.js.map +1 -1
  8. package/dist/commands/eval-dynamic.js +316 -0
  9. package/dist/commands/eval-dynamic.js.map +1 -0
  10. package/dist/commands/eval-scaffold.js +173 -0
  11. package/dist/commands/eval-scaffold.js.map +1 -0
  12. package/dist/commands/eval.js +184 -55
  13. package/dist/commands/eval.js.map +1 -1
  14. package/dist/core/audit.js +237 -9
  15. package/dist/core/audit.js.map +1 -1
  16. package/dist/core/model-family.js +31 -0
  17. package/dist/core/model-family.js.map +1 -0
  18. package/dist/core/scenario-runner.js +298 -0
  19. package/dist/core/scenario-runner.js.map +1 -0
  20. package/dist/prompts/index.js +121 -30
  21. package/dist/prompts/index.js.map +1 -1
  22. package/knowledge/PRINCIPLES.md +2 -2
  23. package/knowledge/manifest.json +16 -1
  24. package/knowledge/templates/AGENTS.md +7 -6
  25. package/knowledge/templates/agents/README.md +4 -4
  26. package/knowledge/templates/agents/developer.yml +1 -1
  27. package/knowledge/templates/agents/judge.yml +1 -1
  28. package/knowledge/templates/agents/reviewer.yml +1 -1
  29. package/knowledge/templates/agents/triager.yml +5 -4
  30. package/knowledge/templates/dashboard/dashboard.mjs +12 -5
  31. package/knowledge/templates/eval/RUBRIC.md +87 -64
  32. package/knowledge/templates/eval/axes.json +25 -25
  33. package/knowledge/templates/eval/calibration/README.md +54 -0
  34. package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
  35. package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
  36. package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
  37. package/knowledge/templates/eval/checks.json +88 -11
  38. package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
  39. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
  40. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
  41. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
  42. package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
  43. package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
  44. package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
  45. package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
  46. package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
  47. package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
  48. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
  49. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
  50. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
  51. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
  52. package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
  53. package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
  54. package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
  55. package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
  56. package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
  57. package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
  58. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
  59. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
  60. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
  61. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
  62. package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
  63. package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
  64. package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
  65. package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
  66. package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
  67. package/knowledge/templates/eval/score.mjs +368 -42
  68. package/knowledge/templates/eval/static-audit.mjs +204 -17
  69. package/knowledge/templates/harness/state-machine.yml +18 -12
  70. package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
  71. package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
  72. package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
  73. package/package.json +4 -3
  74. package/knowledge/templates/eval/scenarios/README.md +0 -24
  75. package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
  76. package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
  77. package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
@@ -1,10 +1,10 @@
1
1
  #!/usr/bin/env node
2
2
  // AgentRig static harness audit (principle 6) — deterministic, dependency-free, no model.
3
3
  // Interprets checks.json (the single source of truth, shared with `agentrig eval --static`)
4
- // against this repository and prints a Harness Score. Usage:
4
+ // against this repository and prints an Install Completeness + Quality Probes report. Usage:
5
5
  // node .agentrig/eval/static-audit.mjs human-readable report
6
6
  // node .agentrig/eval/static-audit.mjs --json machine-readable
7
- // node .agentrig/eval/static-audit.mjs --min 80 exit non-zero if score < 80%
7
+ // node .agentrig/eval/static-audit.mjs --min 80 exit non-zero if completeness < 80%
8
8
  import { readFileSync, existsSync, statSync, readdirSync } from "node:fs";
9
9
  import { fileURLToPath } from "node:url";
10
10
  import { dirname, join, resolve } from "node:path";
@@ -33,6 +33,66 @@ function extractValue(text, key) {
33
33
  return m ? m[1].trim() : null;
34
34
  }
35
35
 
36
+ // Mirror of src/core/model-family.ts. Kept inline to keep this script dep-free.
37
+ const FAMILY_PATTERNS = [
38
+ ["anthropic-claude", /^(anthropic[\.\/-])?claude([-_\.]|$)/i],
39
+ ["openai-gpt", /^(openai[\.\/-])?(gpt|o[1-9]|codex|davinci|chatgpt)([-_\.]|$)/i],
40
+ ["google-gemini", /^(google[\.\/-])?(gemini|palm|bard|flash)([-_\.]|$)/i],
41
+ ["mistral", /^(mistral|mixtral|codestral|ministral)([-_\.]|$)/i],
42
+ ["deepseek", /^deepseek([-_\.]|$)/i],
43
+ ["meta-llama", /^(meta[\.\/-])?(llama|code-?llama)([-_\.]|$)/i],
44
+ ["xai-grok", /^(xai[\.\/-])?grok([-_\.]|$)/i],
45
+ ["cohere", /^(cohere[\.\/-])?(command|aya)([-_\.]|$)/i],
46
+ ["qwen", /^qwen([-_\.]|$)/i],
47
+ ];
48
+ function modelFamily(id) {
49
+ if (!id) return "";
50
+ for (const [name, rx] of FAMILY_PATTERNS) if (rx.test(id)) return name;
51
+ const m = id.match(/^([a-z0-9]+)/i);
52
+ return m ? `unknown:${m[1].toLowerCase()}` : `unknown:${id}`;
53
+ }
54
+
55
+ // Line-oriented mini-YAML reader good enough for state-machine.yml.
56
+ function readStateMachine(text) {
57
+ if (!text) return { states: [], transitions: [] };
58
+ const states = [];
59
+ const transitions = [];
60
+ let inStates = false;
61
+ let inTransitions = false;
62
+ for (const raw of text.split(/\r?\n/)) {
63
+ if (/^states:\s*$/.test(raw)) { inStates = true; inTransitions = false; continue; }
64
+ if (/^transitions:\s*$/.test(raw)) { inTransitions = true; inStates = false; continue; }
65
+ if (/^\S/.test(raw)) { inStates = false; inTransitions = false; continue; }
66
+ const line = raw.replace(/#.*$/, "").trimEnd();
67
+ if (!line.trim().startsWith("-")) continue;
68
+ if (inStates) {
69
+ const m = line.match(/-\s*name:\s*(\S+)/);
70
+ if (m) states.push(m[1]);
71
+ } else if (inTransitions) {
72
+ // Accept both legacy ("- from: A to: B trigger: ...") and proper flow-mapping
73
+ // ("- { from: A, to: B, trigger: ... }") syntaxes.
74
+ const item = line.replace(/^\s*-\s*/, "").replace(/^\{|\}$/g, "");
75
+ const get = (k) => (item.match(new RegExp("\\b" + k + ":\\s*([^,\\s}]+)")) || [])[1];
76
+ const from = get("from"), to = get("to"), trigger = get("trigger");
77
+ if (from && to) transitions.push({ from, to, trigger });
78
+ }
79
+ }
80
+ return { states, transitions };
81
+ }
82
+ function hasPath(adj, src, dst) {
83
+ if (src === dst) return true;
84
+ const seen = new Set([src]);
85
+ const q = [src];
86
+ while (q.length) {
87
+ const cur = q.shift();
88
+ for (const n of adj.get(cur) || []) {
89
+ if (n === dst) return true;
90
+ if (!seen.has(n)) { seen.add(n); q.push(n); }
91
+ }
92
+ }
93
+ return false;
94
+ }
95
+
36
96
  function scoreCheck(c) {
37
97
  switch (c.type) {
38
98
  case "path-exists":
@@ -58,6 +118,25 @@ function scoreCheck(c) {
58
118
  if (missing.length === 0) return { score: 1, evidence: "" };
59
119
  return { score: 0.5, evidence: `missing keys: ${missing.join(", ")}` };
60
120
  }
121
+ case "frontmatter-keys-all": {
122
+ const dir = c.path, fileName = c.file || "SKILL.md";
123
+ const abs = rel(dir);
124
+ if (!existsSync(abs) || !statSync(abs).isDirectory()) return { score: 0, evidence: `missing dir ${dir}` };
125
+ const keys = c.keys || [];
126
+ const offenders = [];
127
+ for (const entry of readdirSync(abs)) {
128
+ if (entry.startsWith(".") || entry.startsWith("_")) continue;
129
+ const subAbs = join(abs, entry);
130
+ if (!statSync(subAbs).isDirectory()) continue;
131
+ const filePath = join(subAbs, fileName);
132
+ if (!existsSync(filePath)) { offenders.push(`${entry}/${fileName} missing`); continue; }
133
+ const fm = frontmatter(readFileSync(filePath, "utf8"));
134
+ if (fm == null) { offenders.push(`${entry} no frontmatter`); continue; }
135
+ const missing = keys.filter((k) => !new RegExp("^\\s*" + k + "\\s*:", "m").test(fm));
136
+ if (missing.length) offenders.push(`${entry} missing ${missing.join("/")}`);
137
+ }
138
+ return offenders.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: offenders.join("; ") };
139
+ }
61
140
  case "roles-distinct-models": {
62
141
  const dev = extractValue(read(c.developer), c.key || "model");
63
142
  const rev = extractValue(read(c.reviewer), c.key || "model");
@@ -65,6 +144,95 @@ function scoreCheck(c) {
65
144
  if (dev !== rev) return { score: 1, evidence: "" };
66
145
  return { score: 0.5, evidence: `developer and reviewer share model "${dev}"` };
67
146
  }
147
+ case "roles-distinct-families": {
148
+ const dev = extractValue(read(c.developer), c.key || "model");
149
+ const rev = extractValue(read(c.reviewer), c.key || "model");
150
+ if (!dev || !rev) return { score: 0, evidence: "developer/reviewer model not declared" };
151
+ const sameFamily = modelFamily(dev) === modelFamily(rev);
152
+ if (!sameFamily) return { score: 1, evidence: "" };
153
+ return { score: 0, evidence: `developer "${dev}" and reviewer "${rev}" share a model family` };
154
+ }
155
+ case "state-machine-dag": {
156
+ const text = read(c.path);
157
+ if (text == null) return { score: 0, evidence: `missing ${c.path}` };
158
+ const { states, transitions } = readStateMachine(text);
159
+ const minStates = c.minStates ?? 6;
160
+ const requirePath = c.requirePath || "queued->merged";
161
+ const problems = [];
162
+ if (states.length < minStates) problems.push(`${states.length} states, need ≥${minStates}`);
163
+ const stateSet = new Set(states);
164
+ const adj = new Map();
165
+ for (const t of transitions) {
166
+ if (t.from === "any") {
167
+ for (const s of stateSet) {
168
+ if (!adj.has(s)) adj.set(s, new Set());
169
+ adj.get(s).add(t.to);
170
+ }
171
+ } else {
172
+ if (!adj.has(t.from)) adj.set(t.from, new Set());
173
+ adj.get(t.from).add(t.to);
174
+ }
175
+ }
176
+ const [src, dst] = requirePath.split("->");
177
+ if (src && dst && !hasPath(adj, src, dst)) problems.push(`no path ${src}→${dst}`);
178
+ return problems.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: problems.join("; ") };
179
+ }
180
+ case "quality-probe": {
181
+ const probe = c.probe, p = c.path;
182
+ if (probe === "no-unfilled-placeholders") {
183
+ const text = read(p);
184
+ if (text == null) return { score: 0, evidence: `missing ${p}` };
185
+ // Strip code blocks + inline code so we don't false-positive on docs that *describe*
186
+ // placeholder syntax (e.g. "{{VAR}} substitution" in an architecture overview).
187
+ const stripped = text
188
+ .replace(/```[\s\S]*?```/g, "")
189
+ .replace(/`[^`\n]*`/g, "");
190
+ const tokens = stripped.match(/\{\{[A-Z_]+\}\}/g) || [];
191
+ return tokens.length === 0
192
+ ? { score: 1, evidence: "" }
193
+ : { score: 0, evidence: `unfilled tokens in ${p}: ${[...new Set(tokens)].join(", ")}` };
194
+ }
195
+ if (probe === "axes-json-coherent") {
196
+ const text = read(p);
197
+ if (text == null) return { score: 0, evidence: `missing ${p}` };
198
+ let j;
199
+ try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
200
+ if (!j.types) return { score: 0, evidence: `${p} missing "types"` };
201
+ const issues = [];
202
+ for (const [tname, t] of Object.entries(j.types)) {
203
+ if (!t.categories) { issues.push(`${tname}: no categories`); continue; }
204
+ for (const [cname, cat] of Object.entries(t.categories)) {
205
+ for (const [axis, spec] of Object.entries(cat)) {
206
+ // Both shapes: v1 = ["CODE",...]; v2 = { codes: [...], weight, veto }
207
+ const codes = Array.isArray(spec) ? spec : spec && spec.codes;
208
+ if (!Array.isArray(codes) || codes.length === 0) issues.push(`${tname}/${cname}/${axis}: no issue codes`);
209
+ }
210
+ }
211
+ }
212
+ return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
213
+ }
214
+ if (probe === "checks-json-coherent") {
215
+ const text = read(p);
216
+ if (text == null) return { score: 0, evidence: `missing ${p}` };
217
+ let j;
218
+ try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
219
+ const checks = j.checks || [];
220
+ const known = new Set(["path-exists","file-contains","dir-min","frontmatter-keys","frontmatter-keys-all","roles-distinct-models","roles-distinct-families","state-machine-dag","quality-probe"]);
221
+ const ids = checks.map((x) => x.id);
222
+ const dupIds = ids.filter((id, i) => id && ids.indexOf(id) !== i);
223
+ const badTypes = checks.filter((x) => !known.has(x.type));
224
+ const issues = [];
225
+ if (dupIds.length) issues.push(`duplicate ids: ${[...new Set(dupIds)].join(", ")}`);
226
+ if (badTypes.length) issues.push(`unknown check types: ${badTypes.map((x) => x.type).join(", ")}`);
227
+ return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
228
+ }
229
+ if (probe === "context-md-present") {
230
+ return existsSync(rel(p))
231
+ ? { score: 1, evidence: "" }
232
+ : { score: 0.5, evidence: `${p} missing — run \`agentrig init\` to investigate the repo` };
233
+ }
234
+ return { score: 0, evidence: `unknown quality probe "${probe}"` };
235
+ }
68
236
  default:
69
237
  return { score: 0, evidence: `unknown check type ${c.type}` };
70
238
  }
@@ -75,38 +243,57 @@ if (!existsSync(checksPath)) {
75
243
  process.exit(2);
76
244
  }
77
245
  const { checks } = JSON.parse(readFileSync(checksPath, "utf8"));
78
- const results = checks.map((c) => ({ ...c, ...scoreCheck(c) }));
246
+ const results = checks.map((c) => ({ ...c, ...scoreCheck(c), layer: c.layer === "quality" ? "quality" : "completeness" }));
79
247
 
80
- let wSum = 0, wScore = 0;
248
+ let cwSum = 0, cwScore = 0, qwSum = 0, qwScore = 0;
81
249
  const byPrinciple = new Map();
82
250
  for (const r of results) {
83
251
  const w = r.weight ?? 1;
84
- wSum += w;
85
- wScore += w * r.score;
252
+ if (r.layer === "quality") { qwSum += w; qwScore += w * r.score; }
253
+ else { cwSum += w; cwScore += w * r.score; }
86
254
  const p = byPrinciple.get(r.principle) || { sum: 0, n: 0 };
87
255
  p.sum += r.score; p.n += 1;
88
256
  byPrinciple.set(r.principle, p);
89
257
  }
90
- const aggregate = wSum ? wScore / wSum : 0;
91
- const pct = Math.round(aggregate * 1000) / 10;
258
+ const completenessAgg = cwSum ? cwScore / cwSum : 0;
259
+ const qualityAgg = qwSum ? qwScore / qwSum : 0;
260
+ const pct = Math.round(completenessAgg * 1000) / 10;
261
+ const qpct = Math.round(qualityAgg * 1000) / 10;
92
262
 
93
263
  if (asJson) {
94
264
  console.log(JSON.stringify({
95
- harnessScore: pct,
96
- aggregate,
265
+ installCompleteness: pct,
266
+ qualityProbes: qpct,
267
+ aggregate: completenessAgg,
268
+ qualityAggregate: qualityAgg,
97
269
  principles: [...byPrinciple.entries()].sort((a, b) => a[0] - b[0]).map(([principle, v]) => ({ principle, score: v.sum / v.n })),
98
- checks: results.map((r) => ({ id: r.id, principle: r.principle, title: r.title, score: r.score, evidence: r.evidence })),
270
+ checks: results.map((r) => ({ id: r.id, principle: r.principle, layer: r.layer, title: r.title, score: r.score, evidence: r.evidence })),
99
271
  }, null, 2));
100
272
  } else {
101
- console.log("AgentRig — harness audit\n");
102
- for (const r of results.sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id))) {
103
- const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
104
- console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
273
+ console.log("AgentRig — install completeness audit\n");
274
+ const completeness = results.filter((r) => r.layer === "completeness").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
275
+ const quality = results.filter((r) => r.layer === "quality").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
276
+ if (completeness.length) {
277
+ console.log(" Layer A1 — structural completeness");
278
+ for (const r of completeness) {
279
+ const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
280
+ console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
281
+ }
282
+ }
283
+ if (quality.length) {
284
+ console.log("\n Layer A2 — quality probes");
285
+ for (const r of quality) {
286
+ const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
287
+ console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
288
+ }
289
+ }
290
+ console.log(`\n Install Completeness: ${pct}% (${completeness.filter((r) => r.score === 1).length}/${completeness.length} checks full credit)`);
291
+ if (quality.length) {
292
+ console.log(` Quality Probes: ${qpct}% (${quality.filter((r) => r.score === 1).length}/${quality.length} checks full credit)`);
105
293
  }
106
- console.log(`\n Harness Score: ${pct}% (${results.filter((r) => r.score === 1).length}/${results.length} checks full credit)`);
107
294
  }
108
295
 
109
296
  if (minPct != null && pct < minPct) {
110
- if (!asJson) console.error(`\nHarness Score ${pct}% is below required ${minPct}%`);
297
+ if (!asJson) console.error(`\nInstall Completeness ${pct}% is below required ${minPct}%`);
111
298
  process.exit(1);
112
299
  }
@@ -16,16 +16,16 @@ states:
16
16
  - name: parked # self-parked: needs a human (low reversibility)
17
17
 
18
18
  transitions:
19
- - from: ingested to: queued trigger: agent role: triager gate: human_approval
20
- - from: queued to: implementing trigger: agent role: developer
21
- - from: implementing to: reviewing trigger: agent role: developer gate: self_verify_passed
22
- - from: reviewing to: judging trigger: agent role: reviewer
23
- - from: reviewing to: implementing trigger: agent role: reviewer reason: changes_requested
24
- - from: judging to: ready_to_merge trigger: agent role: judge gate: rubric_passed
25
- - from: judging to: implementing trigger: agent role: judge reason: below_threshold
26
- - from: ready_to_merge to: merged trigger: human gate: human_approval # principle 9
27
- - from: any to: parked trigger: auto reason: low_reversibility_or_stuck
28
- - from: any to: closed trigger: human
19
+ - { from: ingested, to: queued, trigger: agent, role: triager, gate: human_approval }
20
+ - { from: queued, to: implementing, trigger: agent, role: developer }
21
+ - { from: implementing, to: reviewing, trigger: agent, role: developer, gate: self_verify_passed }
22
+ - { from: reviewing, to: judging, trigger: agent, role: reviewer }
23
+ - { from: reviewing, to: implementing, trigger: agent, role: reviewer, reason: changes_requested }
24
+ - { from: judging, to: ready_to_merge, trigger: agent, role: judge, gate: rubric_passed }
25
+ - { from: judging, to: implementing, trigger: agent, role: judge, reason: below_threshold }
26
+ - { from: ready_to_merge, to: merged, trigger: human, gate: human_approval } # principle 9
27
+ - { from: any, to: parked, trigger: auto, reason: low_reversibility_or_stuck }
28
+ - { from: any, to: closed, trigger: human }
29
29
 
30
30
  # Principle 3: GitHub labels mirror DAG state. Human-only labels must never be applied by an agent.
31
31
  labels:
@@ -48,9 +48,15 @@ limits:
48
48
  requeue_if_stuck_hours: 4
49
49
 
50
50
  hooks:
51
+ # Cheap, deterministic, no model — safe on every PR.
51
52
  pre_pr:
52
53
  - self-verify
53
54
  pre_merge:
55
+ - install-completeness # Layer A: structural audit + quality probes (no model)
56
+ # Layer B (dynamic behavioral eval) is expensive: it spawns the harness end-to-end
57
+ # under a separate judge model with N>=5 trials per scenario. Run nightly on main,
58
+ # NOT per-PR. See .agentrig/eval/RUBRIC.md for cost notes.
59
+ nightly:
54
60
  - harness-eval
55
61
 
56
62
  # --- Trigger taxonomy (principle 1) ------------------------------------------
@@ -101,5 +107,5 @@ issue_comments:
101
107
  # Keep adjacent pipeline roles on DIFFERENT model families (single-model-bias mitigation).
102
108
  model_tiers:
103
109
  cheap: { models: [claude-haiku-4.5, gpt-5-mini], use: "triage, high-volume analysis" }
104
- standard: { models: [claude-sonnet-4.5, gpt-5.4], use: "implementation" }
105
- premium: { models: [claude-opus-4.5, gpt-5], use: "review, judging, auditing" }
110
+ standard: { models: [claude-sonnet-4.6, gpt-5.4], use: "implementation" }
111
+ premium: { models: [claude-opus-4.8, gpt-5.5], use: "review, judging, auditing" }
@@ -1,83 +1,88 @@
1
1
  ---
2
2
  name: harness-eval
3
- description: Evaluate THIS repository's agent harness — a deterministic structure audit plus an independent, rubric-driven dynamic eval (run/spec/review) with A/B variant comparison.
4
- triggers:
5
- - "evaluate the harness"
6
- - pre_merge hook
7
- - "did my harness change make things better or worse?"
3
+ description: Evaluate THIS repository's agent harness — a deterministic structure audit (A1) plus content quality probes (A2), plus an isolated producer/judge dynamic eval (B) with paired sign-test A/B variant comparison.
8
4
  allowed-tools: Bash Read Grep Glob
9
- argument-hint: "[--static|--dynamic] [--scenario id] [--variant v]"
5
+ argument-hint: "[--static|--dynamic] [--scenario id] [--variant v] [--n trials]"
10
6
  ---
11
7
 
12
8
  # harness-eval (principle 6 — evaluate the harness itself)
13
9
 
14
- A harness you cannot measure is a harness you cannot improve. This skill scores the harness on two
15
- complementary layers and writes results to `.agentrig/eval/results/` (validated, never hand-edited).
10
+ A harness you cannot measure is a harness you cannot improve. This skill scores the harness on
11
+ three complementary layers and writes results to `.agentrig/eval/results/` (validated on write
12
+ *and* on read; never hand-edit JSON).
16
13
 
17
- ## Layer A — static audit (deterministic, no model)
18
- Each of the 12 principles maps to concrete checks in `.agentrig/eval/checks.json`, scored 0/0.5/1.0.
14
+ ## Layer A1 — install completeness (deterministic, no model)
15
+ Every canonical artifact present at the path the manifest declares.
19
16
 
20
17
  ```bash
21
- node .agentrig/eval/static-audit.mjs # human-readable report + aggregate score
22
- node .agentrig/eval/static-audit.mjs --json # machine-readable, for CI gates
18
+ node .agentrig/eval/static-audit.mjs --json # Install Completeness %
23
19
  ```
24
20
 
25
- Use this in CI and as a fast pre-merge gate. It needs no model and no network.
21
+ ## Layer A2 — quality probes (deterministic, no model)
22
+ Cheap content sanity: YAML parseable, no unfilled `{{PLACEHOLDER}}` in `AGENTS.md`, every skill has
23
+ the required frontmatter, axes.json has an issue code per axis, developer/reviewer **model
24
+ families** differ (not just the model id strings).
26
25
 
27
- ## Layer B — dynamic behavioral eval (agentic, independent judge)
28
- Run scenarios in `.agentrig/eval/scenarios/*.md` through the harness, then score as an **independent
29
- judge** (a different model than the producer) against `.agentrig/eval/RUBRIC.md` and the registry in
30
- `.agentrig/eval/axes.json`.
26
+ A1 + A2 are what CI gates on. Both surface in the same `--static` report under "Layer A1" and
27
+ "Layer A2" sections.
31
28
 
32
- **Sandbox:** obey `.agentrig/eval/sandbox/eval-rules.md` — work in a throwaway worktree; never push,
33
- open PRs, or merge.
29
+ ## Layer B — dynamic behavioral eval (agentic, independent judge, fixture-based)
34
30
 
35
- **Lifecycle:** score the whole lifecycle, not just the patch. Use the rubric `--type` that matches
36
- the scenario: `spec` (task quality), `run` (implementation), `review` (the reviewer's behavior).
37
- Link them with a shared `--task` id.
31
+ For each scenario in `.agentrig/eval/scenarios/*/`:
38
32
 
39
- **Rules (enforced by score.mjs):** strict 0/0.5/1.0 tiers; any axis < 1.0 needs an issue code from
40
- that axis's registry **plus** an evidence string; unobserved axes are `=na`; rollups are recomputed
41
- from axis data.
33
+ 1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/` (or `baseline/`+`change/` for
34
+ review scenarios).
35
+ 2. **Producer** model runs in that worktree against `scenarios/<id>/prompt.md`. For
36
+ `--variant harness`, the AgentRig harness is staged into the worktree first; for
37
+ `--variant baseline`, the agent runs bare.
38
+ 3. **Oracle** (`scenarios/<id>/oracle.yml`) deterministically scores the hard axes (correctness,
39
+ tests, scope, regression_risk, …) by running commands / inspecting the diff. **No LLM.**
40
+ 4. **Judge** model — explicitly a **different family** from the producer — runs in a separate
41
+ `provider.startConversation()` call in its own cwd containing only `prompt.md`, `diff.patch`,
42
+ `transcript.md`, `oracle.json`, and `judge_brief.md`. It does NOT see the producer worktree or
43
+ reasoning trace. It writes `<artifactsDir>/<scenario>.trial<N>.judge.json`; the orchestrator
44
+ reads, validates, and persists via `score.mjs save`.
45
+
46
+ **Family-divergence is enforced.** `score.mjs save` rejects a producer/judge pair in the same
47
+ family unless `--allow-same-family` is set (and records the override). Bare CLI:
42
48
 
43
49
  ```bash
44
- node .agentrig/eval/score.mjs save --type run --task <id> --scenario <id> --judge <model> \
45
- --axis 'correctness=1.0' \
46
- --axis 'scope=0.5:OQ-SCOPE-CHURN:left build artifacts in the diff' \
47
- --axis 'tests=na'
48
- node .agentrig/eval/score.mjs report
50
+ agentrig eval --dynamic --variant harness --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
51
+ agentrig eval --dynamic --variant baseline --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
52
+ node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
49
53
  ```
50
54
 
51
- **Artifacts:** for each run, save `diff.patch`, a short `output` transcript, and `meta.json`
52
- (scenario, base_commit, variant, model, duration) next to the score so regressions are inspectable.
55
+ **Aggregation: weighted + veto.** axes.json declares `weight` and `veto: true` per axis.
56
+ A veto axis < 1.0 fails the scenario regardless of aggregate (e.g. correctness can never be
57
+ papered over by clarity).
53
58
 
54
- ## Comparing harness changes (A/B)
55
- To know whether a prompt/skill/rule change helped, run the **same** scenario before and after under
56
- different `--variant`s, then:
59
+ ## Statistical lift
57
60
 
58
- ```bash
59
- node .agentrig/eval/score.mjs compare --scenario <id>
60
- ```
61
+ Single-trial deltas are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
62
+ other than **INCONCLUSIVE**. `score.mjs compare` runs a paired binomial sign test and reports
63
+ median delta + p-value:
61
64
 
62
- A change that lowers the aggregate is a regression even if it "feels" better. A static score < 1.0
63
- on a principle points at a missing/weak artifact — fix the artifact, then re-audit.
65
+ - **HELPS** — p < 0.05 and median > 0.05
66
+ - **HURTS** — p < 0.05 and median < -0.05
67
+ - **INCONCLUSIVE** — n < 3, p ≥ 0.05, or |median| < 0.05
64
68
 
65
- ## Does the harness actually help? (with vs without)
66
- The most important question for a consumer: *does installing AgentRig's harness make agents better
67
- in THIS repo?* Measure it by running the same scenarios twice and comparing:
69
+ A change that doesn't clear `HELPS` is a regression risk even if individual trials looked good.
68
70
 
69
- ```bash
70
- # 1) Harness ON (the agent uses AGENTS.md + rules + skills as installed)
71
- agentrig eval --dynamic --scenario <id> --variant harness
71
+ ## Sandbox
72
+ Obey `.agentrig/eval/sandbox/eval-rules.md`: throwaway worktree under `$TMPDIR/agentrig-eval/`,
73
+ never push / open PRs / merge / mutate real labels. The eval measures behavior; it must not
74
+ mutate real branches.
72
75
 
73
- # 2) Baseline — harness OFF (a bare agent; ignore AGENTS.md/.agents/instructions surfaces)
74
- agentrig eval --dynamic --scenario <id> --variant baseline
76
+ ## Calibrate the judge before trusting it
75
77
 
76
- # 3) Report the lift (per-axis + aggregate delta + a HELPS/HURTS verdict)
77
- node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
78
+ A lazy judge that returns 1.0 everywhere passes every `score.mjs save` validation. Run the judge
79
+ over the hand-labeled `calibration/` instances and require ≥ 80% agreement before publishing
80
+ results:
81
+
82
+ ```bash
83
+ node .agentrig/eval/score.mjs calibrate --judge <model> --instance .agentrig/eval/calibration/run/seed-correct.yml --judge-scores /tmp/judge-out.json
84
+ node .agentrig/eval/score.mjs calibrate --report
85
+ agentrig doctor # flags any judge below the 80% threshold
78
86
  ```
79
87
 
80
- For a rigorous baseline, run the harness-off trial in a sandbox/worktree with the harness + compiled
81
- surfaces moved aside (`AGENTS.md`, `.agents/`, `.github/instructions/`, `CLAUDE.md`, `.cursor/`), so
82
- the agent genuinely has no harness guidance. A positive aggregate delta means the harness helps in
83
- this repo; track it over time as you tune rules/skills/prompts.
88
+ See `.agentrig/eval/calibration/README.md` for the instance format.
@@ -0,0 +1,68 @@
1
+ ---
2
+ name: log-gotcha
3
+ description: Record a newly-discovered gotcha to `.agents/wiki/` BEFORE handoff — the harness's feedback loop. The wiki is how the next agent doesn't repeat your discovery.
4
+ triggers:
5
+ - hit something non-obvious during the task
6
+ - silent failure / suspicious default / quirk in a library or runtime
7
+ - before handoff if anything surprised you
8
+ allowed-tools: Bash Read Write Edit Grep Glob
9
+ argument-hint: "[--topic <area>]"
10
+ ---
11
+
12
+ # log-gotcha (principle 8)
13
+
14
+ Every mistake is a prompt bug. The wiki is **how the harness learns**: every entry there is one
15
+ agent-turn the next agent skips because they already know what you discovered. Logging is part of
16
+ the task, not a separate "good-to-have" step.
17
+
18
+ ## When to log
19
+
20
+ You should log a gotcha if **any** of these apply to what you just did:
21
+
22
+ - A test, framework, or runtime did something surprising (e.g. `divide(1, 0)` returns `Infinity`
23
+ instead of throwing; `node --test some-dir` resolves the dir as a module; `console.log` after
24
+ `process.exit` silently truncates piped output).
25
+ - A library default bit you (silent overwrite, surprising coercion, hidden API contract).
26
+ - An AGENTS.md rule wasn't loud enough — you almost violated it, or did, until you caught yourself.
27
+ - A non-obvious cross-file dependency that someone touching one file would miss.
28
+ - A flaky test, an environment-specific assumption, a build-cache surprise.
29
+
30
+ **Do not log** taste opinions, style preferences, or things that are already in CONTRIBUTING.md.
31
+
32
+ ## How to log
33
+
34
+ 1. **Check the wiki first.** Run `ls .agents/wiki/` and `grep -ri "<keyword>" .agents/wiki/` for
35
+ the most natural keywords. **If an existing entry covers it, SHARPEN that entry instead of
36
+ adding a near-duplicate** (the wiki README has a strict admission test on duplication).
37
+ 2. **Pick a topic file.** Either an existing one (e.g. `troubleshooting.md`) or create
38
+ `.agents/wiki/<topic>.md` if the area is new (e.g. `node-test-runner.md`,
39
+ `html-templates.md`). Topic names are kebab-case nouns.
40
+ 3. **Write a stub entry** using the template below. Keep it terse — a title plus five one-line bullets.
41
+ 4. **Commit it as part of your fix's diff.** Wiki entries are not "after-the-fact paperwork" —
42
+ they go in the SAME commit/PR as the fix that revealed them, so reviewers can see them.
43
+
44
+ ## Entry template
45
+
46
+ ```markdown
47
+ ### <short noun-phrase title>
48
+ - **Symptom:** what went wrong / how it showed up
49
+ - **Cause:** the real root cause (not the symptom)
50
+ - **Fix:** the change you made (or wider remediation)
51
+ - **Prevention:** one-line rule that would have spared you this discovery
52
+ - **Discovered:** <date> in <scenario or task id>
53
+ ```
54
+
55
+ ## Skill failure modes (explicitly)
56
+
57
+ - **"I didn't really hit a gotcha."** Most fixes DO reveal one — you just didn't notice because the
58
+ fix took less than 5 minutes. The discriminating question is *"could the next agent have known
59
+ this from the existing instructions?"* — if no, log it.
60
+ - **"It's too small."** Small gotchas are exactly the ones that vanish from memory by tomorrow.
61
+ - **"I'll log later."** No — log it in the SAME commit. "Later" is how wikis die.
62
+
63
+ ## Verification
64
+
65
+ Before considering this skill complete, confirm with `git diff --cached --stat` that your wiki
66
+ entry shows in the staged diff. The `memory` axis in the harness eval explicitly checks for this:
67
+ "diff contains a `.agents/wiki/` entry" → 1.0; "mentioned in summary but not committed" → 0.5;
68
+ "silent" → 0.0.
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: self-verify
3
- description: Run the project's own build/test/lint and converge before handing work to a reviewer.
3
+ description: Run the project's own build/test/lint and converge before handing work to a reviewer. Requires explicit baseline → after evidence — the suite must be shown to change state, not just be "green at the end".
4
4
  triggers:
5
5
  - before requesting review
6
6
  - before opening a PR
@@ -13,13 +13,37 @@ argument-hint: "[--max-iterations N]"
13
13
  After producing changes, **verify your own work before handoff**. Do not invoke the reviewer until
14
14
  this loop converges.
15
15
 
16
- ## Steps
17
- 1. Run the install/build/test/lint commands recorded in `AGENTS.md` (the `commands` block).
18
- 2. If all green **continue** to review.
19
- 3. If red read the failure, fix, and re-run. Cap at **N=3** iterations (default).
20
- 4. If still red after N **self-park**: leave a precise note (what failed, what you tried) and
21
- move the task to `parked`. Never hand a red build to a reviewer.
16
+ ## Steps (do them in order; do not skip)
17
+
18
+ 1. **Baseline.** Run the install/build/test/lint commands from `AGENTS.md`'s `commands` block
19
+ **once before you make any edit for the task**. Capture the result:
20
+ - For a fix scenario: confirm the suite is RED in the expected way (the target test fails).
21
+ - For a feature scenario: confirm the suite is GREEN (so you know your changes are what break it
22
+ if it goes red later).
23
+ - Surface this baseline in your transcript — e.g. *"baseline: `npm test` → 1 fail (divide-by-zero)"*.
24
+
25
+ 2. **Iterate.** Make the change; re-run the commands. Cap at **N=3** iterations (the default; override with `--max-iterations N`).
26
+
27
+ 3. **After.** Re-run the full suite at the end and surface the new state explicitly —
28
+ e.g. *"after fix: `npm test` → 0 fails, all 4 tests pass"*. The transition from baseline → after
29
+ is the evidence that your work did what you claim. Reporting only "tests pass" without the
30
+ baseline is half a self-verification.
31
+
32
+ 4. **Self-park if still red after N iterations.** Leave a precise note (what failed, what you tried) and move the task
33
+ to `parked`. Never hand a red build to a reviewer.
34
+
35
+ ## Handoff checklist (run BEFORE you declare done)
36
+
37
+ - [ ] Baseline output captured + surfaced in transcript
38
+ - [ ] After output captured + surfaced in transcript
39
+ - [ ] Diff is on-target (no unrelated churn — check `git diff --stat`)
40
+ - [ ] **Did you hit any non-obvious behavior or surprise?** → run the `log-gotcha` skill before
41
+ handing off. This includes silently-passing-yet-wrong APIs, JS-floating-point quirks, framework
42
+ defaults that bit you, environment surprises, etc. Wiki entries are how the next agent avoids
43
+ repeating your discovery.
22
44
 
23
45
  ## Notes
46
+
24
47
  - Pin verification to your own HEAD; do not trust stale CI from an earlier commit.
25
- - Record any new gotcha in `.agents/wiki/`.
48
+ - If the build is too expensive to run a full baseline (10+ min), at minimum run the **smallest
49
+ set of tests that demonstrates the symptom** before AND after your fix.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@doidor/agentrig",
3
- "version": "0.9.0",
3
+ "version": "0.10.0",
4
4
  "description": "AgentRig — an agentic meta-harness. A CLI that investigates a repository and installs (and evaluates) a best-practice agent harness.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -55,6 +55,7 @@
55
55
  "license": "MIT",
56
56
  "dependencies": {
57
57
  "@github/copilot-sdk": "^1.0.0",
58
+ "yaml": "^2.9.0",
58
59
  "zod": "^4.3.6"
59
60
  },
60
61
  "peerDependencies": {
@@ -68,8 +69,8 @@
68
69
  "devDependencies": {
69
70
  "@changesets/changelog-github": "^0.7.0",
70
71
  "@changesets/cli": "^2.31.0",
71
- "@doidor/markbook": "^0.1.2",
72
- "@doidor/markbook-core": "^0.1.2",
72
+ "@doidor/markbook": "^0.2.0",
73
+ "@doidor/markbook-core": "^0.2.0",
73
74
  "@types/node": "^22.0.0",
74
75
  "typescript": "^5.6.0"
75
76
  }