@doidor/agentrig 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +224 -0
  3. package/dist/agent/claude.js +125 -0
  4. package/dist/agent/claude.js.map +1 -0
  5. package/dist/agent/copilot.js +147 -0
  6. package/dist/agent/copilot.js.map +1 -0
  7. package/dist/agent/index.js +17 -0
  8. package/dist/agent/index.js.map +1 -0
  9. package/dist/agent/provider.js +10 -0
  10. package/dist/agent/provider.js.map +1 -0
  11. package/dist/cli.js +169 -0
  12. package/dist/cli.js.map +1 -0
  13. package/dist/commands/compile.js +42 -0
  14. package/dist/commands/compile.js.map +1 -0
  15. package/dist/commands/dashboard.js +35 -0
  16. package/dist/commands/dashboard.js.map +1 -0
  17. package/dist/commands/doctor.js +40 -0
  18. package/dist/commands/doctor.js.map +1 -0
  19. package/dist/commands/eval.js +178 -0
  20. package/dist/commands/eval.js.map +1 -0
  21. package/dist/commands/init.js +100 -0
  22. package/dist/commands/init.js.map +1 -0
  23. package/dist/commands/update.js +176 -0
  24. package/dist/commands/update.js.map +1 -0
  25. package/dist/core/activity.js +80 -0
  26. package/dist/core/activity.js.map +1 -0
  27. package/dist/core/audit.js +112 -0
  28. package/dist/core/audit.js.map +1 -0
  29. package/dist/core/compile.js +250 -0
  30. package/dist/core/compile.js.map +1 -0
  31. package/dist/core/fsutil.js +45 -0
  32. package/dist/core/fsutil.js.map +1 -0
  33. package/dist/core/install.js +97 -0
  34. package/dist/core/install.js.map +1 -0
  35. package/dist/core/knowledge.js +34 -0
  36. package/dist/core/knowledge.js.map +1 -0
  37. package/dist/core/logger.js +31 -0
  38. package/dist/core/logger.js.map +1 -0
  39. package/dist/core/paths.js +22 -0
  40. package/dist/core/paths.js.map +1 -0
  41. package/dist/core/setupsteps.js +72 -0
  42. package/dist/core/setupsteps.js.map +1 -0
  43. package/dist/core/state.js +19 -0
  44. package/dist/core/state.js.map +1 -0
  45. package/dist/core/surfaces.js +62 -0
  46. package/dist/core/surfaces.js.map +1 -0
  47. package/dist/prompts/index.js +117 -0
  48. package/dist/prompts/index.js.map +1 -0
  49. package/dist/version.js +26 -0
  50. package/dist/version.js.map +1 -0
  51. package/knowledge/PRINCIPLES.md +106 -0
  52. package/knowledge/manifest.json +247 -0
  53. package/knowledge/templates/AGENTS.md +66 -0
  54. package/knowledge/templates/AGENTS.package.example.md +19 -0
  55. package/knowledge/templates/agents/README.md +33 -0
  56. package/knowledge/templates/agents/developer.md +7 -0
  57. package/knowledge/templates/agents/developer.yml +7 -0
  58. package/knowledge/templates/agents/judge.md +6 -0
  59. package/knowledge/templates/agents/judge.yml +6 -0
  60. package/knowledge/templates/agents/reviewer.md +6 -0
  61. package/knowledge/templates/agents/reviewer.yml +7 -0
  62. package/knowledge/templates/agents/triager.md +8 -0
  63. package/knowledge/templates/agents/triager.yml +8 -0
  64. package/knowledge/templates/dashboard/dashboard.mjs +261 -0
  65. package/knowledge/templates/eval/RUBRIC.md +94 -0
  66. package/knowledge/templates/eval/axes.json +56 -0
  67. package/knowledge/templates/eval/checks.json +304 -0
  68. package/knowledge/templates/eval/sandbox/eval-rules.md +23 -0
  69. package/knowledge/templates/eval/scenarios/README.md +24 -0
  70. package/knowledge/templates/eval/scenarios/add-small-feature.md +28 -0
  71. package/knowledge/templates/eval/scenarios/fix-failing-test.md +27 -0
  72. package/knowledge/templates/eval/scenarios/review-catches-bug.md +30 -0
  73. package/knowledge/templates/eval/score.mjs +257 -0
  74. package/knowledge/templates/eval/static-audit.mjs +112 -0
  75. package/knowledge/templates/harness/ORCHESTRATION.md +53 -0
  76. package/knowledge/templates/harness/state-machine.yml +105 -0
  77. package/knowledge/templates/mcp/mcp.json +12 -0
  78. package/knowledge/templates/rules/README.md +32 -0
  79. package/knowledge/templates/rules/code-review.md +26 -0
  80. package/knowledge/templates/rules/coding-standards.md +15 -0
  81. package/knowledge/templates/rules/no-debug-logging.md +16 -0
  82. package/knowledge/templates/rules/security.md +23 -0
  83. package/knowledge/templates/scripts/repair-worktrees.sh +124 -0
  84. package/knowledge/templates/skills/fix-ci/SKILL.md +17 -0
  85. package/knowledge/templates/skills/harness-eval/SKILL.md +83 -0
  86. package/knowledge/templates/skills/self-verify/SKILL.md +25 -0
  87. package/knowledge/templates/skills/skill-authoring/SKILL.md +35 -0
  88. package/knowledge/templates/skills/skill-improver/SKILL.md +23 -0
  89. package/knowledge/templates/skills/verify-loop/SKILL.md +35 -0
  90. package/knowledge/templates/wiki/README.md +23 -0
  91. package/knowledge/templates/wiki/_TEMPLATE.md +16 -0
  92. package/knowledge/templates/wiki/index.md +29 -0
  93. package/knowledge/templates/wiki/troubleshooting.md +14 -0
  94. package/package.json +70 -0
@@ -0,0 +1,261 @@
1
+ #!/usr/bin/env node
2
+ // AgentRig harness dashboard. Dependency-free. Surfaces, in one place:
3
+ // • the agent roster (roles + models) (principle 2)
4
+ // • live GitHub tasks per harness label via `gh` (principle 3 — system of record)
5
+ // • the static Harness Score + per-principle (principle 6)
6
+ // • the latest dynamic eval summary (principle 6)
7
+ // • the harness hard limits (principle 10)
8
+ // Usage:
9
+ // node .agentrig/dashboard/dashboard.mjs terminal dashboard
10
+ // node .agentrig/dashboard/dashboard.mjs --json machine-readable
11
+ // node .agentrig/dashboard/dashboard.mjs --html [file] write a self-contained HTML page
12
+ // node .agentrig/dashboard/dashboard.mjs --no-tasks live GitHub lookups skipped (offline)
13
+ import { readFileSync, existsSync, readdirSync, writeFileSync } from "node:fs";
14
+ import { execFileSync } from "node:child_process";
15
+ import { fileURLToPath } from "node:url";
16
+ import { dirname, join, resolve } from "node:path";
17
+
18
// Location of this script on disk; the repository root is two directories above it.
const scriptDir = dirname(fileURLToPath(import.meta.url));
const repoRoot = resolve(scriptDir, "..", "..");

// Command-line switches (everything after `node dashboard.mjs`).
const args = process.argv.slice(2);
const asJson = args.includes("--json");
const noTasks = args.includes("--no-tasks");
const htmlIdx = args.indexOf("--html");
const asHtml = htmlIdx >= 0;

// `--html` may be followed by an output path; anything missing or starting with "-"
// is not treated as a path, and the page is written next to this script instead.
let htmlOut = null;
if (asHtml) {
  const candidate = args[htmlIdx + 1];
  const isPath = candidate && !candidate.startsWith("-");
  htmlOut = isPath ? candidate : join(scriptDir, "dashboard.html");
}

// Resolve a repo-relative path to an absolute one.
const rel = (p) => resolve(repoRoot, p);

// Read a repo-relative file as UTF-8 text, or return null when it does not exist.
const read = (p) => {
  if (!existsSync(rel(p))) return null;
  return readFileSync(rel(p), "utf8");
};
30
+
31
/**
 * Run a repo-relative Node script with the current Node executable and parse
 * its stdout as JSON.
 *
 * stdin and stderr of the child are ignored; only stdout is captured.
 *
 * @param {string} scriptRelPath script path relative to the repository root
 * @param {string[]} scriptArgs arguments passed to the script
 * @returns {any|null} the parsed JSON value, or null if the child could not be
 *   run, failed, or printed something that is not valid JSON
 */
function runNode(scriptRelPath, scriptArgs) {
  let parsed = null;
  try {
    const childArgv = [rel(scriptRelPath), ...scriptArgs];
    const spawnOptions = { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] };
    const stdout = execFileSync(process.execPath, childArgv, spawnOptions);
    parsed = JSON.parse(stdout);
  } catch {
    // Best effort: any failure is reported to the caller as "no data".
    parsed = null;
  }
  return parsed;
}
42
+
43
+ // --- Agent roster -----------------------------------------------------------
44
/**
 * Build the agent roster from the `.yml` files in `.agentrig/agents`.
 *
 * Each file is scanned with a line-anchored regex (not a YAML parser) for the
 * `role`, `model` and `model_tier` keys. When `role` is absent or empty the
 * file name without its `.yml` suffix is used instead.
 *
 * @returns {{role: string, model: string|null, tier: string|null}[]} roster
 *   entries sorted by role; an empty array when the directory does not exist
 */
function loadRoster() {
  const agentsDir = rel(".agentrig/agents");
  if (!existsSync(agentsDir)) return [];

  const entries = [];
  for (const file of readdirSync(agentsDir)) {
    if (!file.endsWith(".yml")) continue;

    const yaml = readFileSync(join(agentsDir, file), "utf8");
    // First `key: value` line for the given key; value is trimmed, null if not found.
    const field = (key) => {
      const pattern = new RegExp("^\\s*" + key + "\\s*:\\s*(.+)\\s*$", "m");
      const hit = yaml.match(pattern);
      return hit ? hit[1].trim() : null;
    };

    entries.push({
      role: field("role") || file.replace(/\.yml$/, ""),
      model: field("model"),
      tier: field("model_tier"),
    });
  }

  entries.sort((left, right) => left.role.localeCompare(right.role));
  return entries;
}
56
+
57
+ // --- State<->label map from the state machine -------------------------------
58
/**
 * Extract the `state_map:` block of `.agentrig/harness/state-machine.yml` as a
 * plain `{ state: label }` object, using a line scan rather than a YAML parser.
 *
 * The indentation of the first `key: value` line after the header defines the
 * block's entry indent; only pairs at exactly that indent are recorded.
 * Scanning stops at the first non-pair line that is either indented less than
 * the entries or is a bare `key:` header.
 *
 * @returns {Object<string, string>} state → label map; empty when the file is
 *   missing or has no `state_map:` block
 */
function loadStateLabels() {
  const yaml = read(".agentrig/harness/state-machine.yml");
  if (!yaml) return {};

  const labels = {};
  let collecting = false;
  let entryIndent = null;

  for (const raw of yaml.split("\n")) {
    // A `state_map:` header (re)starts collection and forgets the entry indent.
    if (/^\s*state_map:\s*$/.test(raw)) {
      collecting = true;
      entryIndent = null;
      continue;
    }
    if (!collecting || raw.trim() === "") continue;

    const indent = raw.length - raw.trimStart().length;
    const pair = raw.match(/^\s*([a-z_]+)\s*:\s*([A-Za-z0-9_-]+)\s*$/);

    if (pair) {
      if (entryIndent === null) entryIndent = indent;
      // Pairs at any other indent are skipped but do not end the scan.
      if (indent === entryIndent) labels[pair[1]] = pair[2];
      continue;
    }

    const dedented = indent <= (entryIndent ?? 0) - 1;
    const bareHeader = /^\s*[a-z_]+:\s*$/.test(raw);
    if (dedented || bareHeader) break;
  }

  return labels;
}
84
+
85
/**
 * Extract the integer entries of the `limits:` block from
 * `.agentrig/harness/state-machine.yml` using a line scan (no YAML parser).
 *
 * Only lines of the form `<2+ spaces>key: <digits>` are recorded. The block
 * ends at the first non-blank line indented no deeper than the `limits:`
 * header itself (or at another `limits:` header). Bounding the block by the
 * header's indent keeps numeric sibling keys of a nested `limits:` section
 * from being reported as limits; for a top-level `limits:` this is the same
 * as stopping at the first column-0 line.
 *
 * @returns {Object<string, number>} limit name → value; empty when the file
 *   or the block is missing
 */
function loadLimits() {
  const text = read(".agentrig/harness/state-machine.yml");
  if (!text) return {};

  const out = {};
  const header = /^\s*limits:\s*$/;
  const lines = text.split("\n");
  const headerIdx = lines.findIndex((line) => header.test(line));
  if (headerIdx < 0) return out;

  const indentOf = (line) => line.length - line.trimStart().length;
  const headerIndent = indentOf(lines[headerIdx]);

  for (const line of lines.slice(headerIdx + 1)) {
    if (line.trim() === "") continue;
    // Dedent back to (or past) the header's level, or a second header, closes the block.
    if (indentOf(line) <= headerIndent || header.test(line)) break;
    const m = line.match(/^\s{2,}([a-z_]+)\s*:\s*(\d+)\s*$/);
    if (m) out[m[1]] = Number(m[2]);
  }
  return out;
}
98
+
99
+ // --- Live GitHub tasks via gh ----------------------------------------------
100
/**
 * Check whether the GitHub CLI can be used by running `gh auth status` with
 * all of its output discarded.
 *
 * @returns {boolean} true when the command runs and exits successfully,
 *   false when it cannot be started or exits with an error
 */
function ghAvailable() {
  let usable = true;
  try {
    execFileSync("gh", ["auth", "status"], { stdio: "ignore" });
  } catch {
    usable = false;
  }
  return usable;
}
108
+
109
/**
 * List up to 30 open issues or pull requests carrying a label, via `gh`.
 *
 * @param {"issue"|"pr"} kind which `gh <kind> list` command to run
 * @param {string} label label to filter by
 * @returns {{kind: string, number: number, title: string, url: string, assignees: string[]}[]}
 *   one entry per item; an empty array if `gh` fails or its output cannot be parsed
 */
function ghList(kind, label) {
  // Flatten one `gh --json` record into the shape the dashboard renders.
  const toItem = (record) => {
    const people = record.assignees || [];
    return {
      kind,
      number: record.number,
      title: record.title,
      url: record.url,
      assignees: people.map((person) => person.login),
    };
  };

  try {
    const ghArgs = [kind, "list", "--label", label, "--state", "open", "--limit", "30", "--json", "number,title,url,assignees"];
    const stdout = execFileSync("gh", ghArgs, { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] });
    return JSON.parse(stdout).map(toItem);
  } catch {
    // Best effort: a failed lookup is shown as "no items" rather than aborting the dashboard.
    return [];
  }
}
128
+
129
/**
 * Collect open issues and pull requests for every harness state label.
 *
 * @param {Object<string, string>} stateLabels state → GitHub label map
 * @returns {{available: boolean, reason: string|null, byState: Object}}
 *   `available` is false (with a `reason`) when `--no-tasks` was given or `gh`
 *   is unusable; otherwise `byState[state]` holds `{ label, items }` with
 *   issues listed before pull requests
 */
function loadTasks(stateLabels) {
  const unavailable = (reason) => ({ available: false, reason, byState: {} });

  if (noTasks) return unavailable("skipped (--no-tasks)");
  if (!ghAvailable()) return unavailable("gh not installed or not authenticated");

  const byState = {};
  Object.entries(stateLabels).forEach(([state, label]) => {
    const issues = ghList("issue", label);
    const pulls = ghList("pr", label);
    byState[state] = { label, items: issues.concat(pulls) };
  });
  return { available: true, reason: null, byState };
}
138
+
139
+ // --- Gather everything ------------------------------------------------------
140
// Static audit and dynamic eval report come from the sibling eval scripts;
// a missing or failing report falls back to an empty one.
const audit = runNode(".agentrig/eval/static-audit.mjs", ["--json"]);
const evals = runNode(".agentrig/eval/score.mjs", ["report", "--json"]) || {
  overall: 0,
  scenarios: [],
  axes: [],
};

// Everything else is read from the repo (and from GitHub, unless skipped).
const roster = loadRoster();
const stateLabels = loadStateLabels();
const limits = loadLimits();
const tasks = loadTasks(stateLabels);

// Single snapshot handed to whichever renderer is selected.
const auditPresent = audit !== null && audit !== undefined;
const data = {
  generatedAt: new Date().toISOString(),
  repo: repoRoot,
  harnessScore: auditPresent && audit.harnessScore != null ? audit.harnessScore : null,
  principles: auditPresent && audit.principles != null ? audit.principles : [],
  roster,
  tasks,
  evals,
  limits,
};
157
+
158
+ // --- Render -----------------------------------------------------------------
159
// Pick exactly one output mode: --json wins over --html, and the terminal view
// is the default. The script ends by falling off the end of the module rather
// than calling process.exit(): a forced exit can cut off stdout writes that
// are still pending (e.g. a large --json payload written to a pipe), and
// nothing here keeps the event loop alive, so the exit code is still 0.
if (asJson) {
  console.log(JSON.stringify(data, null, 2));
} else if (asHtml) {
  writeFileSync(htmlOut, renderHtml(data));
  console.log(`Wrote ${htmlOut}`);
} else {
  renderTerminal(data);
}
171
+
172
/**
 * Print the dashboard to stdout as plain text, one section at a time:
 * header, Harness Score, Agents, Tasks, Evals and (when present) Limits.
 *
 * ANSI colors are used only when stdout is a TTY and NO_COLOR is not set.
 *
 * @param {object} d the gathered dashboard data (`repo`, `harnessScore`,
 *   `principles`, `roster`, `tasks`, `evals`, `limits`)
 * @returns {void}
 */
function renderTerminal(d) {
  const colorEnabled = process.stdout.isTTY && !process.env.NO_COLOR;
  // Build a styling function for one SGR code; it is the identity when color is off.
  const paint = (code) => (s) => (colorEnabled ? `\x1b[${code}m${s}\x1b[0m` : s);
  const bold = paint("1");
  const dim = paint("2");
  const red = paint("31");
  const green = paint("32");
  const yellow = paint("33");
  const cyan = paint("36");
  const say = (line) => console.log(line);

  // --- Header ---
  say(`\n${bold("AgentRig — harness dashboard")} ${dim(d.repo)}`);
  say(dim("─".repeat(64)));

  // --- Harness Score: green from 80, yellow from 50, red below, dim when unknown ---
  const score = d.harnessScore;
  let scorePaint;
  let scoreText;
  if (score == null) {
    scorePaint = dim;
    scoreText = "n/a";
  } else {
    scoreText = score + "%";
    if (score >= 80) scorePaint = green;
    else if (score >= 50) scorePaint = yellow;
    else scorePaint = red;
  }
  say(`${bold("Harness Score")} ${scorePaint(scoreText)}`);

  if (d.principles.length) {
    const weak = [];
    for (const p of d.principles) {
      if (p.score < 1) weak.push(`P${p.principle} ${(p.score * 100).toFixed(0)}%`);
    }
    const summary = weak.length ? weak.join(", ") : "none — all full credit";
    say(dim(` weak principles: ${summary}`));
  }

  // --- Agents ---
  say(`\n${bold("Agents")} ${dim(`(${d.roster.length} roles)`)}`);
  d.roster.forEach((agent) => {
    const role = cyan(agent.role.padEnd(11));
    const model = (agent.model || "?").padEnd(20);
    say(` ${role} ${model} ${dim(agent.tier || "")}`);
  });

  // --- Tasks: at most 8 items per state, titles cut to 48 characters ---
  say(`\n${bold("Tasks")}`);
  if (d.tasks.available) {
    let total = 0;
    for (const [state, info] of Object.entries(d.tasks.byState)) {
      const items = info.items;
      total += items.length;
      say(` ${state.padEnd(16)} ${dim(info.label)} ${bold(String(items.length))}`);
      for (const it of items.slice(0, 8)) {
        const prefix = it.kind === "pr" ? "PR" : "# ";
        const who = it.assignees.length ? dim(` @${it.assignees.join(", @")}`) : dim(" unassigned");
        say(` ${prefix}${it.number} ${it.title.slice(0, 48)}${who}`);
      }
    }
    if (total === 0) say(dim(" no open tasks carrying harness labels"));
  } else {
    say(dim(` unavailable — ${d.tasks.reason}`));
  }

  // --- Evals: the report may expose its rows as `results` or as `scenarios` ---
  say(`\n${bold("Evals")} ${dim("(dynamic)")}`);
  const evalRows = d.evals.results || d.evals.scenarios || [];
  if (evalRows.length) {
    say(` overall ${bold(d.evals.overall.toFixed(2))} across ${evalRows.length} result(s)`);
    for (const row of evalRows) {
      const typePart = row.type ? row.type + "/" : "";
      const variantPart = row.variant ? " [" + row.variant + "]" : "";
      const label = `${typePart}${row.scenario}${variantPart}`;
      const verdict = row.pass ? green("PASS") : red("FAIL");
      say(` ${verdict} ${label.padEnd(28)} ${row.aggregate.toFixed(2)} ${dim("(" + row.judge + ")")}`);
    }
  } else {
    say(dim(" no dynamic eval runs yet — `agentrig eval --dynamic`"));
  }

  // --- Limits ---
  if (Object.keys(d.limits).length) {
    say(`\n${bold("Limits")}`);
    const pairs = Object.entries(d.limits).map(([name, value]) => `${name}=${value}`);
    say(dim(" " + pairs.join(" ")));
  }
  say("");
}
227
+
228
/**
 * Render the dashboard as a single self-contained HTML document.
 *
 * Every interpolated value is passed through `esc`, which escapes the
 * characters significant in both HTML text and quoted attribute values
 * (`& < > " '`). Quotes must be escaped because task URLs are written inside
 * `href="…"`; escaping only `& < >` would let a quote in a URL terminate the
 * attribute.
 *
 * @param {object} d the gathered dashboard data (`repo`, `generatedAt`,
 *   `harnessScore`, `roster`, `tasks`, `evals`, `limits`)
 * @returns {string} the complete HTML page
 */
function renderHtml(d) {
  const HTML_ENTITIES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const esc = (s) => String(s).replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  // Same thresholds as the terminal view: good from 80, warn from 50, bad below.
  const scoreClass = d.harnessScore == null ? "na" : d.harnessScore >= 80 ? "good" : d.harnessScore >= 50 ? "warn" : "bad";
  const scoreText = d.harnessScore == null ? "n/a" : esc(d.harnessScore + "%");

  const rosterRows = d.roster.map((a) => `<tr><td>${esc(a.role)}</td><td>${esc(a.model || "?")}</td><td>${esc(a.tier || "")}</td></tr>`).join("");

  let tasksHtml;
  if (!d.tasks.available) {
    tasksHtml = `<p class="muted">Tasks unavailable — ${esc(d.tasks.reason)}</p>`;
  } else {
    tasksHtml = Object.entries(d.tasks.byState).map(([state, info]) => {
      const items = info.items.map((it) => `<li><span class="tag">${it.kind === "pr" ? "PR" : "#"}${esc(it.number)}</span> <a href="${esc(it.url)}">${esc(it.title)}</a> <span class="muted">${it.assignees.length ? "@" + it.assignees.map(esc).join(", @") : "unassigned"}</span></li>`).join("");
      return `<div class="state"><h4>${esc(state)} <span class="muted">${esc(info.label)} · ${info.items.length}</span></h4><ul>${items || '<li class="muted">none</li>'}</ul></div>`;
    }).join("");
  }

  // The eval report may expose its rows as `results` or as `scenarios`.
  const evalList = d.evals.results || d.evals.scenarios || [];
  const evalRows = evalList.map((s) => `<tr><td>${s.pass ? "✅" : "❌"}</td><td>${esc((s.type ? s.type + "/" : "") + s.scenario + (s.variant ? " [" + s.variant + "]" : ""))}</td><td>${s.aggregate.toFixed(2)}</td><td class="muted">${esc(s.judge)}</td></tr>`).join("");
  const limits = Object.entries(d.limits).map(([k, v]) => `<code>${esc(k)}=${esc(v)}</code>`).join(" ");

  return `<!doctype html><html><head><meta charset="utf-8"><title>AgentRig dashboard</title>
<style>
:root{color-scheme:light dark}body{font:14px/1.5 system-ui,sans-serif;margin:2rem auto;max-width:880px;padding:0 1rem}
h1{font-size:1.3rem}h2{font-size:1rem;border-bottom:1px solid #8884;padding-bottom:.2rem;margin-top:2rem}
.score{font-size:2rem;font-weight:700}.good{color:#1a7f37}.warn{color:#9a6700}.bad{color:#cf222e}.na{color:#888}
table{border-collapse:collapse;width:100%}td,th{text-align:left;padding:.25rem .5rem;border-bottom:1px solid #8882}
.muted{color:#888}.tag{display:inline-block;background:#8882;border-radius:4px;padding:0 .35rem;font-size:.8em}
.state h4{margin:.6rem 0 .2rem}code{background:#8882;border-radius:4px;padding:0 .3rem}
</style></head><body>
<h1>AgentRig — harness dashboard</h1>
<p class="muted">${esc(d.repo)} · generated ${esc(d.generatedAt)}</p>
<h2>Harness Score</h2><p class="score ${scoreClass}">${scoreText}</p>
<h2>Agents (${d.roster.length})</h2><table><tr><th>Role</th><th>Model</th><th>Tier</th></tr>${rosterRows}</table>
<h2>Tasks</h2>${tasksHtml}
<h2>Evals</h2>${evalRows ? `<table><tr><th></th><th>Scenario</th><th>Score</th><th>Judge</th></tr>${evalRows}</table><p class="muted">overall ${d.evals.overall.toFixed(2)}</p>` : '<p class="muted">No dynamic eval runs yet.</p>'}
${limits ? `<h2>Limits</h2><p>${limits}</p>` : ""}
</body></html>`;
}
@@ -0,0 +1,94 @@
1
+ # Harness evaluation rubric (principle 6)
2
+
3
+ Two layers. **Layer A** is deterministic and model-free; **Layer B** is an independent,
4
+ model-judged behavioral eval. Both write to `.agentrig/eval/results/` via `score.mjs`
5
+ (never hand-edit JSON). The machine-readable rubric registry lives in
6
+ [`axes.json`](./axes.json) — `score.mjs` validates every score against it.
7
+
8
+ ---
9
+
10
+ ## Layer A — Static harness audit
11
+ Scored automatically by `static-audit.mjs` from the checks defined in `checks.json`. Each check maps to a principle and earns **0 / 0.5 / 1.0**.
12
+ The aggregate is the **Harness Score** (0–100%). Run:
13
+
14
+ ```bash
15
+ node .agentrig/eval/static-audit.mjs # or: agentrig eval --static
16
+ ```
17
+
18
+ Treat any principle scoring < 1.0 as a missing/weak artifact to fix.
19
+
20
+ ---
21
+
22
+ ## Layer B — Dynamic behavioral eval
23
+
24
+ For each scenario, run the task through the harness, then have an **independent judge model**
25
+ (different from the producer) score the result. Scoring is **strict 3-tier: 0 / 0.5 / 1.0**.
26
+
27
+ Three rules, enforced by `score.mjs` against `axes.json`:
28
+ 1. **Issue code required.** Any observed axis scoring < 1.0 must carry an issue code **from that
29
+ axis's bounded registry** plus a one-line **evidence** string. Invented codes are rejected.
30
+ 2. **Confidence-gated.** An axis you couldn't observe is scored `na` (confidence 0) and excluded
31
+ from rollups — partial observability never contaminates the total.
32
+ 3. **Rollups are recomputed from axes.** Category and aggregate scores come from the axis data, not
33
+ from anything the judge asserts.
34
+
35
+ ### Multi-rubric lifecycle
36
+ The eval covers the whole lifecycle, not just the final patch. Three rubric **types**, linked by the
37
+ same `--task` id so you get a spec → run → review view:
38
+
39
+ | `--type` | What it scores | Categories |
40
+ |----------|----------------|------------|
41
+ | `spec` | task/issue spec quality (before work) | spec_quality (clarity, acceptance_criteria, scope_bounded, testability, context) |
42
+ | `run` | the implementation run | output_quality, agent_behavior, long_term_impact |
43
+ | `review` | the reviewer's own behavior | review_quality (finding_correctness, severity_calibration, false_positive_rate, coverage, actionability, independence, blocking_decision) |
44
+
45
+ ### `run` axes (the most common)
46
+ - **Output Quality** — `correctness`, `scope`, `tests`, `clarity`
47
+ - **Agent Behavior** — `self_verification`, `gate_compliance`, `tool_discipline`, `escalation`
48
+ - **Long-Term Impact** — `memory`, `regression_risk`, `maintainability`
49
+
50
+ See `axes.json` for the full per-axis issue-code registries (e.g. `OQ-SCOPE-CHURN`,
51
+ `AB-VERIFY-REDHANDOFF`, `LT-REGRESS-LIKELY`).
52
+
53
+ ### Saving and reading scores
54
+ ```bash
55
+ # Save one rubric (any axis < 1.0 needs CODE:evidence; use `=na` for unobserved axes)
56
+ node .agentrig/eval/score.mjs save --type run --task add-small-feature \
57
+ --scenario add-small-feature --judge <model> [--variant v2] [--run RID] \
58
+ --axis 'correctness=1.0' \
59
+ --axis 'scope=0.5:OQ-SCOPE-CHURN:left package-lock churn in the diff' \
60
+ --axis 'tests=na'
61
+
62
+ node .agentrig/eval/score.mjs report # latest per scenario/variant + per-axis means
63
+ node .agentrig/eval/score.mjs compare --scenario <id> # A/B variants side by side
64
+ ```
65
+
66
+ ### A/B variant evaluation
67
+ Run the **same scenario** under different harness versions (a prompt/skill/rule change) and save each
68
+ under a `--variant`. `score.mjs compare` puts them side by side. **A change that lowers the score is
69
+ a regression even if it "feels" better.** For deeper diffing, keep each run's `diff.patch` /
70
+ `output` artifacts next to the score (see the `harness-eval` skill).
71
+
72
+ ### Harness lift — does it actually help? (with vs without)
73
+ Prove the harness earns its keep in *your* repo by comparing a harness-on run to a harness-off
74
+ baseline:
75
+
76
+ ```bash
77
+ agentrig eval --dynamic --scenario <id> --variant harness # harness ON
78
+ agentrig eval --dynamic --scenario <id> --variant baseline # bare agent, no AGENTS.md/rules/skills
79
+ node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
80
+ ```
81
+
82
+ `compare --baseline` prints the per-axis and aggregate **delta** and a `HELPS`/`HURTS` verdict. A
83
+ positive aggregate delta means installing AgentRig improved agent behavior here.
84
+
85
+ ### Threshold
86
+ A scenario passes if its aggregate ≥ **0.8** (`passThreshold` in `axes.json`) with no observed axis
87
+ at 0.
88
+
89
+ ---
90
+
91
+ ## Sandboxing
92
+ Run dynamic evals under the guardrails in [`sandbox/eval-rules.md`](./sandbox/eval-rules.md): the
93
+ agent works in a throwaway worktree and must **not push, open PRs, or merge** — the eval measures
94
+ behavior; it must not mutate real branches.
@@ -0,0 +1,56 @@
1
+ {
2
+ "$schema": "agentrig-eval-axes/1",
3
+ "description": "Rubric registry for the dynamic harness eval. Defines, per rubric TYPE, the categories, their axes, and a BOUNDED issue-code list per axis. score.mjs validates judge output against this: scores must be 0/0.5/1.0, and any axis < 1.0 (with confidence > 0) must carry an evidence string and an issue code drawn from that axis's list. Inspired by epichan's agent_scoring/issue_spec/review_scoring registries.",
4
+ "tiers": [0, 0.5, 1.0],
5
+ "passThreshold": 0.8,
6
+ "types": {
7
+ "run": {
8
+ "label": "Implementation run (the harness doing a task)",
9
+ "categories": {
10
+ "output_quality": {
11
+ "correctness": ["OQ-CORRECT-WRONG", "OQ-CORRECT-PARTIAL", "OQ-CORRECT-EDGE"],
12
+ "scope": ["OQ-SCOPE-CHURN", "OQ-SCOPE-UNRELATED", "OQ-SCOPE-INCOMPLETE"],
13
+ "tests": ["OQ-TESTS-MISSING", "OQ-TESTS-WEAK", "OQ-TESTS-BROKEN"],
14
+ "clarity": ["OQ-CLARITY-NAMING", "OQ-CLARITY-COMPLEXITY", "OQ-CLARITY-COMMENTS"]
15
+ },
16
+ "agent_behavior": {
17
+ "self_verification": ["AB-VERIFY-SKIPPED", "AB-VERIFY-REDHANDOFF", "AB-VERIFY-PARTIAL"],
18
+ "gate_compliance": ["AB-GATE-SKIPPED", "AB-GATE-HUMANLABEL", "AB-GATE-ORDER"],
19
+ "tool_discipline": ["AB-TOOLS-OVERLIMIT", "AB-TOOLS-UNSCOPED", "AB-TOOLS-NOISE"],
20
+ "escalation": ["AB-ESCALATE-LATE", "AB-ESCALATE-THRASH", "AB-ESCALATE-NONE"]
21
+ },
22
+ "long_term_impact": {
23
+ "memory": ["LT-MEMORY-NOLOG", "LT-MEMORY-REPEAT", "LT-MEMORY-DUP"],
24
+ "regression_risk": ["LT-REGRESS-LIKELY", "LT-REGRESS-UNTESTED"],
25
+ "maintainability": ["LT-MAINTAIN-DEBT", "LT-MAINTAIN-COUPLING"]
26
+ }
27
+ }
28
+ },
29
+ "spec": {
30
+ "label": "Task/issue spec quality (before implementation)",
31
+ "categories": {
32
+ "spec_quality": {
33
+ "clarity": ["SP-CLARITY-VAGUE", "SP-CLARITY-AMBIGUOUS"],
34
+ "acceptance_criteria": ["SP-AC-MISSING", "SP-AC-UNTESTABLE"],
35
+ "scope_bounded": ["SP-SCOPE-TOOBIG", "SP-SCOPE-UNBOUNDED"],
36
+ "testability": ["SP-TEST-NOORACLE", "SP-TEST-NOREPRO"],
37
+ "context": ["SP-CONTEXT-MISSING", "SP-CONTEXT-STALE"]
38
+ }
39
+ }
40
+ },
41
+ "review": {
42
+ "label": "Review process quality (the reviewer's behavior)",
43
+ "categories": {
44
+ "review_quality": {
45
+ "finding_correctness": ["RV-FIND-WRONG", "RV-FIND-UNSUPPORTED"],
46
+ "severity_calibration": ["RV-SEV-OVER", "RV-SEV-UNDER"],
47
+ "false_positive_rate": ["RV-FP-NOISE", "RV-FP-STYLE"],
48
+ "coverage": ["RV-COV-MISSEDBUG", "RV-COV-SHALLOW"],
49
+ "actionability": ["RV-ACT-VAGUE", "RV-ACT-NOREPRO"],
50
+ "independence": ["RV-IND-SAMEMODEL", "RV-IND-RUBBERSTAMP"],
51
+ "blocking_decision": ["RV-BLOCK-WRONGPASS", "RV-BLOCK-WRONGFAIL"]
52
+ }
53
+ }
54
+ }
55
+ }
56
+ }