@doidor/agentrig 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +224 -0
  3. package/dist/agent/claude.js +125 -0
  4. package/dist/agent/claude.js.map +1 -0
  5. package/dist/agent/copilot.js +147 -0
  6. package/dist/agent/copilot.js.map +1 -0
  7. package/dist/agent/index.js +17 -0
  8. package/dist/agent/index.js.map +1 -0
  9. package/dist/agent/provider.js +10 -0
  10. package/dist/agent/provider.js.map +1 -0
  11. package/dist/cli.js +169 -0
  12. package/dist/cli.js.map +1 -0
  13. package/dist/commands/compile.js +42 -0
  14. package/dist/commands/compile.js.map +1 -0
  15. package/dist/commands/dashboard.js +35 -0
  16. package/dist/commands/dashboard.js.map +1 -0
  17. package/dist/commands/doctor.js +40 -0
  18. package/dist/commands/doctor.js.map +1 -0
  19. package/dist/commands/eval.js +178 -0
  20. package/dist/commands/eval.js.map +1 -0
  21. package/dist/commands/init.js +100 -0
  22. package/dist/commands/init.js.map +1 -0
  23. package/dist/commands/update.js +176 -0
  24. package/dist/commands/update.js.map +1 -0
  25. package/dist/core/activity.js +80 -0
  26. package/dist/core/activity.js.map +1 -0
  27. package/dist/core/audit.js +112 -0
  28. package/dist/core/audit.js.map +1 -0
  29. package/dist/core/compile.js +250 -0
  30. package/dist/core/compile.js.map +1 -0
  31. package/dist/core/fsutil.js +45 -0
  32. package/dist/core/fsutil.js.map +1 -0
  33. package/dist/core/install.js +97 -0
  34. package/dist/core/install.js.map +1 -0
  35. package/dist/core/knowledge.js +34 -0
  36. package/dist/core/knowledge.js.map +1 -0
  37. package/dist/core/logger.js +31 -0
  38. package/dist/core/logger.js.map +1 -0
  39. package/dist/core/paths.js +22 -0
  40. package/dist/core/paths.js.map +1 -0
  41. package/dist/core/setupsteps.js +72 -0
  42. package/dist/core/setupsteps.js.map +1 -0
  43. package/dist/core/state.js +19 -0
  44. package/dist/core/state.js.map +1 -0
  45. package/dist/core/surfaces.js +62 -0
  46. package/dist/core/surfaces.js.map +1 -0
  47. package/dist/prompts/index.js +117 -0
  48. package/dist/prompts/index.js.map +1 -0
  49. package/dist/version.js +26 -0
  50. package/dist/version.js.map +1 -0
  51. package/knowledge/PRINCIPLES.md +106 -0
  52. package/knowledge/manifest.json +247 -0
  53. package/knowledge/templates/AGENTS.md +66 -0
  54. package/knowledge/templates/AGENTS.package.example.md +19 -0
  55. package/knowledge/templates/agents/README.md +33 -0
  56. package/knowledge/templates/agents/developer.md +7 -0
  57. package/knowledge/templates/agents/developer.yml +7 -0
  58. package/knowledge/templates/agents/judge.md +6 -0
  59. package/knowledge/templates/agents/judge.yml +6 -0
  60. package/knowledge/templates/agents/reviewer.md +6 -0
  61. package/knowledge/templates/agents/reviewer.yml +7 -0
  62. package/knowledge/templates/agents/triager.md +8 -0
  63. package/knowledge/templates/agents/triager.yml +8 -0
  64. package/knowledge/templates/dashboard/dashboard.mjs +261 -0
  65. package/knowledge/templates/eval/RUBRIC.md +94 -0
  66. package/knowledge/templates/eval/axes.json +56 -0
  67. package/knowledge/templates/eval/checks.json +304 -0
  68. package/knowledge/templates/eval/sandbox/eval-rules.md +23 -0
  69. package/knowledge/templates/eval/scenarios/README.md +24 -0
  70. package/knowledge/templates/eval/scenarios/add-small-feature.md +28 -0
  71. package/knowledge/templates/eval/scenarios/fix-failing-test.md +27 -0
  72. package/knowledge/templates/eval/scenarios/review-catches-bug.md +30 -0
  73. package/knowledge/templates/eval/score.mjs +257 -0
  74. package/knowledge/templates/eval/static-audit.mjs +112 -0
  75. package/knowledge/templates/harness/ORCHESTRATION.md +53 -0
  76. package/knowledge/templates/harness/state-machine.yml +105 -0
  77. package/knowledge/templates/mcp/mcp.json +12 -0
  78. package/knowledge/templates/rules/README.md +32 -0
  79. package/knowledge/templates/rules/code-review.md +26 -0
  80. package/knowledge/templates/rules/coding-standards.md +15 -0
  81. package/knowledge/templates/rules/no-debug-logging.md +16 -0
  82. package/knowledge/templates/rules/security.md +23 -0
  83. package/knowledge/templates/scripts/repair-worktrees.sh +124 -0
  84. package/knowledge/templates/skills/fix-ci/SKILL.md +17 -0
  85. package/knowledge/templates/skills/harness-eval/SKILL.md +83 -0
  86. package/knowledge/templates/skills/self-verify/SKILL.md +25 -0
  87. package/knowledge/templates/skills/skill-authoring/SKILL.md +35 -0
  88. package/knowledge/templates/skills/skill-improver/SKILL.md +23 -0
  89. package/knowledge/templates/skills/verify-loop/SKILL.md +35 -0
  90. package/knowledge/templates/wiki/README.md +23 -0
  91. package/knowledge/templates/wiki/_TEMPLATE.md +16 -0
  92. package/knowledge/templates/wiki/index.md +29 -0
  93. package/knowledge/templates/wiki/troubleshooting.md +14 -0
  94. package/package.json +70 -0
@@ -0,0 +1,257 @@
1
+ #!/usr/bin/env node
2
+ // AgentRig dynamic-eval aggregator (principle 6). Owns the results JSON shape and VALIDATES every
3
+ // score against the rubric registry in axes.json — so results are never hand-edited and a judge
4
+ // cannot invent axes, tiers, or issue codes. Inspired by epichan's pydantic-validated scoring.
5
+ //
6
+ // Usage:
7
+ // node score.mjs save --type run --task add-small-feature --scenario add-small-feature \
8
+ // --judge claude-opus-4.8 [--variant v1] [--run RID] \
9
+ // --axis 'correctness=1.0' \
10
+ // --axis 'scope=0.5:OQ-SCOPE-CHURN:left package-lock.json churn in the diff' \
11
+ // --axis 'tests=na' # na = unobserved (confidence 0, excluded from rollups)
12
+ // node score.mjs report [--type run] [--variant v1] [--json]
13
+ // node score.mjs compare --scenario add-small-feature # A/B variants side by side
14
+ //
15
+ // Score tiers: 0 / 0.5 / 1.0. Any axis < 1.0 (and observed) REQUIRES an issue code from that axis's
16
+ // registry plus an evidence string. Category and aggregate scores are recomputed from axis data.
17
+ import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "node:fs";
18
+ import { fileURLToPath } from "node:url";
19
+ import { dirname, join } from "node:path";
20
+
21
+ const scriptDir = dirname(fileURLToPath(import.meta.url));
22
+ const resultsDir = join(scriptDir, "results");
23
+ const axesPath = join(scriptDir, "axes.json");
24
+
25
/**
 * Read and parse the rubric registry (axes.json) that sits next to this script.
 * Prints a diagnostic and exits with status 2 when the file does not exist.
 *
 * @returns {object} The parsed contents of axes.json.
 */
function loadRegistry() {
  const registryMissing = !existsSync(axesPath);
  if (registryMissing) {
    console.error(`axes.json not found at ${axesPath}`);
    process.exit(2);
  }
  const rawJson = readFileSync(axesPath, "utf8");
  return JSON.parse(rawJson);
}
32
+
33
/**
 * Build an axis -> { category, codes } lookup for one rubric type.
 *
 * @param {object} registry - Parsed axes.json; rubric definitions are read from `registry.types`.
 * @param {string} type - Rubric type to index (e.g. "run").
 * @returns {Map<string, {category: string, codes: *}>} Axis name mapped to the category that owns
 *   it and the issue codes registered for it.
 *
 * Prints an error and exits with status 2 when `type` is not defined in the registry.
 */
function axisIndex(registry, type) {
  const def = registry.types?.[type];
  if (!def) {
    // `registry.types` itself may be missing; fall back to {} so the diagnostic is printed
    // instead of a TypeError being thrown while the message is built.
    const known = Object.keys(registry.types ?? {});
    console.error(`unknown rubric type "${type}". Valid: ${known.join(", ")}`);
    process.exit(2);
  }
  const index = new Map();
  for (const [category, axes] of Object.entries(def.categories)) {
    for (const [axis, codes] of Object.entries(axes)) index.set(axis, { category, codes });
  }
  return index;
}
46
+
47
/**
 * Look up the value(s) that follow a flag in an argument list.
 *
 * @param {string[]} args - Command-line tokens to scan.
 * @param {string} name - Flag to look for, e.g. "--axis".
 * @param {boolean} [repeat=false] - When true, return every value; otherwise only the first.
 * @returns {string[]|string|undefined} All values (repeat), the first value, or undefined when
 *   the flag is absent or has no value after it.
 */
function getOpt(args, name, repeat = false) {
  const out = [];
  // Stop one short of the end: a flag in the last position has no value, and pushing the
  // resulting `undefined` would hand callers a bogus entry (e.g. a trailing `--axis`).
  for (let i = 0; i < args.length - 1; i++) if (args[i] === name) out.push(args[i + 1]);
  return repeat ? out : out[0];
}
52
+
53
/**
 * Report a usage/validation error on stderr and terminate with exit status 2.
 *
 * @param {string} msg - Human-readable description of what went wrong.
 */
function fail(msg) {
  const line = `error: ${msg}`;
  console.error(line);
  process.exit(2);
}
57
+
58
// CLI entry: the first token after the script path selects the subcommand, the rest are options.
const argv = process.argv.slice(2);
const cmd = argv[0];
const args = argv.slice(1);

// Registry-driven settings; the literals are the defaults used when axes.json omits the key.
const registry = loadRegistry();
const allowedTiers = registry.tiers ?? [0, 0.5, 1.0];
const TIERS = new Set(allowedTiers);
const PASS = registry.passThreshold ?? 0.8;
62
+
63
// `save` — validate every --axis against the rubric registry, recompute the rollups from the
// axis data, and write one timestamped result file under results/.
if (cmd === "save") {
  const type = getOpt(args, "--type") || "run";
  const index = axisIndex(registry, type);
  // --scenario and --task default to each other, so supplying either one is enough.
  const scenario = getOpt(args, "--scenario") || getOpt(args, "--task");
  const task = getOpt(args, "--task") || scenario;
  const judge = getOpt(args, "--judge") || "unknown";
  const variant = getOpt(args, "--variant") || null;
  const run = getOpt(args, "--run") || null;
  if (!scenario) fail("save requires --scenario <id> (or --task <id>)");

  const rawAxes = getOpt(args, "--axis", true);
  if (rawAxes.length === 0) fail("save requires at least one --axis name=score[:CODE[:evidence]]");

  // Each spec is name=score[:CODE[:evidence]]; evidence may itself contain ":".
  const axes = rawAxes.map((spec) => {
    const eq = spec.indexOf("=");
    if (eq < 0) fail(`bad --axis "${spec}" (expected name=score[:CODE[:evidence]])`);
    const name = spec.slice(0, eq);
    const rest = spec.slice(eq + 1);
    const meta = index.get(name);
    if (!meta) fail(`unknown axis "${name}" for type "${type}". Valid: ${[...index.keys()].join(", ")}`);

    // "na" marks an unobserved axis (confidence 0) — excluded from rollups.
    if (rest === "na") return { name, category: meta.category, score: 0, issue: null, evidence: "", confidence: 0 };

    const [scoreStr, code, ...evidenceParts] = rest.split(":");
    // Number("") is 0, so without this guard `name=:CODE:evidence` would silently record a zero.
    if (scoreStr.trim() === "") fail(`axis "${name}" has an empty score — use name=score[:CODE[:evidence]]`);
    const score = Number(scoreStr);
    if (!TIERS.has(score)) fail(`axis "${name}" score must be one of ${[...TIERS].join("/")} — got "${scoreStr}"`);
    const evidence = evidenceParts.join(":").trim();
    // Validate the issue code whenever one is supplied, not only below 1.0 — otherwise a
    // full-credit axis could store a code that is not in the registry.
    if (code && !meta.codes.includes(code)) fail(`issue code "${code}" is not valid for axis "${name}". Valid: ${meta.codes.join(", ")}`);
    if (score < 1) {
      if (!code) fail(`axis "${name}" scored ${score} < 1.0 but has no issue code — use name=score:CODE[:evidence]`);
      if (!evidence) fail(`axis "${name}" scored ${score} < 1.0 but has no evidence — use name=score:CODE:evidence`);
    }
    return { name, category: meta.category, score, issue: code || null, evidence, confidence: 1 };
  });

  // Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated.
  const observed = axes.filter((a) => a.confidence > 0);
  const categories = {};
  for (const a of observed) (categories[a.category] ||= []).push(a.score);
  const categoryScores = Object.fromEntries(
    Object.entries(categories).map(([c, xs]) => [c, round(xs.reduce((s, x) => s + x, 0) / xs.length)]),
  );
  const aggregate = observed.length ? round(observed.reduce((s, a) => s + a.score, 0) / observed.length) : 0;
  // Passing needs at least one observed axis, an aggregate at or above the threshold, and no
  // observed axis scored 0.
  const pass = observed.length > 0 && aggregate >= PASS && observed.every((a) => a.score > 0);

  const record = {
    type, task, scenario, variant, run, judge,
    timestamp: new Date().toISOString(),
    aggregate, pass, categoryScores, axes,
  };

  if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });
  // Keep file names portable: anything outside [a-zA-Z0-9_.-] becomes "_".
  const safe = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
  const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}.${Date.now()}.json`);
  writeFileSync(file, JSON.stringify(record, null, 2));
  console.log(`Saved ${file}\n aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"} (${observed.length}/${axes.length} axes observed)`);
  process.exit(0);
}
122
+
123
// `report` / `compare` — read saved results and summarise them. `compare` is delegated to
// compare(); `report` keeps the latest record per (type, scenario, variant).
if (cmd === "report" || cmd === "compare") {
  const asJson = args.includes("--json");
  const filterType = getOpt(args, "--type");
  const filterVariant = getOpt(args, "--variant");
  const records = loadRecords();

  if (cmd === "compare") {
    compare(records, getOpt(args, "--scenario"), asJson, getOpt(args, "--baseline"));
    process.exit(0);
  }

  // Records saved without a variant are reported under the name "base".
  const variantOf = (r) => r.variant || "base";

  let scoped = records;
  if (filterType) scoped = scoped.filter((r) => r.type === filterType);
  if (filterVariant) scoped = scoped.filter((r) => variantOf(r) === filterVariant);

  // Latest record per (type, scenario, variant): walk oldest -> newest so that a later record
  // overwrites an earlier one stored under the same key.
  scoped.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
  const latest = new Map();
  for (const r of scoped) {
    latest.set(`${r.type}::${r.scenario}::${variantOf(r)}`, r);
  }
  const rows = [...latest.values()];

  // Per-axis running totals; axes with confidence <= 0 are left out.
  const axisAgg = new Map();
  for (const row of rows) {
    for (const axis of row.axes) {
      if (axis.confidence <= 0) continue;
      const tally = axisAgg.get(axis.name) || { sum: 0, n: 0 };
      tally.sum += axis.score;
      tally.n += 1;
      axisAgg.set(axis.name, tally);
    }
  }

  let aggregateTotal = 0;
  for (const row of rows) aggregateTotal += row.aggregate;
  const overall = rows.length ? round(aggregateTotal / rows.length) : 0;

  if (asJson) {
    const payload = {
      overall,
      results: rows.map((r) => ({ type: r.type, scenario: r.scenario, variant: r.variant, aggregate: r.aggregate, pass: r.pass, judge: r.judge })),
      axes: [...axisAgg.entries()].map(([name, v]) => ({ name, mean: round(v.sum / v.n) })),
    };
    console.log(JSON.stringify(payload, null, 2));
  } else {
    console.log("AgentRig — dynamic eval report\n");
    if (rows.length > 0) {
      // Group rows by rubric type, keeping first-seen order.
      const byType = new Map();
      for (const r of rows) {
        if (!byType.has(r.type)) byType.set(r.type, []);
        byType.get(r.type).push(r);
      }
      for (const [type, group] of byType) {
        console.log(` ${type.toUpperCase()}`);
        for (const r of group) {
          const v = r.variant ? ` [${r.variant}]` : "";
          console.log(` ${r.pass ? "PASS" : "FAIL"} ${(r.scenario + v).padEnd(30)} ${r.aggregate.toFixed(2)} (${r.judge})`);
        }
      }
      console.log("\n Per-axis means (observed only):");
      for (const [name, v] of axisAgg) console.log(` ${name.padEnd(22)} ${round(v.sum / v.n).toFixed(2)}`);
      console.log(`\n Overall: ${overall.toFixed(2)} across ${rows.length} result(s)`);
    } else {
      console.log(" No results yet. Run `score.mjs save ...` first.");
    }
  }
  process.exit(0);
}

// Reached only when no recognised subcommand was given.
console.error("Usage: score.mjs <save|report|compare> ...");
process.exit(2);
185
+
186
+ // --- helpers ---------------------------------------------------------------
187
/**
 * Round a number to four decimal places.
 *
 * @param {number} n - Value to round.
 * @returns {number} `n` rounded to the nearest 0.0001.
 */
function round(n) {
  const scale = 10000;
  return Math.round(n * scale) / scale;
}
190
+
191
/**
 * Load every saved result record from the results directory.
 *
 * Only `*.json` entries are read. A file that cannot be read or parsed is skipped with a warning
 * on stderr rather than aborting the whole load.
 *
 * @returns {Array<*>} Parsed records; empty when the results directory does not exist.
 */
function loadRecords() {
  const records = [];
  if (!existsSync(resultsDir)) return records;
  for (const entry of readdirSync(resultsDir)) {
    if (!entry.endsWith(".json")) continue;
    try {
      const text = readFileSync(join(resultsDir, entry), "utf8");
      records.push(JSON.parse(text));
    } catch {
      console.error(`warning: skipping corrupt result file ${entry}`);
    }
  }
  return records;
}
203
+
204
/**
 * Print a side-by-side comparison of the latest result per variant for one scenario, then exit.
 *
 * @param {Array<object>} records - All loaded result records.
 * @param {string} scenario - Scenario id to compare; required.
 * @param {boolean} asJson - Emit JSON instead of the text report.
 * @param {string} [baseline] - Variant name to diff every other variant against ("lift" mode).
 *
 * Always ends the process: status 0 on success, status 2 (via fail) when the scenario is missing
 * or the requested baseline variant has no results.
 */
function compare(records, scenario, asJson, baseline) {
  if (!scenario) fail("compare requires --scenario <id>");

  // Records saved without a variant are treated as the "base" variant.
  const variantOf = (r) => r.variant || "base";
  const isObserved = (a) => a.confidence > 0;

  // Oldest -> newest, so the last write per variant is the most recent record.
  const matching = records.filter((r) => r.scenario === scenario);
  matching.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
  const latestByVariant = new Map();
  for (const r of matching) {
    latestByVariant.set(variantOf(r), r);
  }
  const variants = [...latestByVariant.values()];

  // Harness-lift mode: delta of every other variant vs the baseline.
  let lift = null;
  if (baseline) {
    const base = latestByVariant.get(baseline);
    if (!base) fail(`no results for baseline variant "${baseline}" on scenario "${scenario}"`);
    lift = [];
    for (const r of variants) {
      if (variantOf(r) === baseline) continue;
      const baseAxes = Object.fromEntries((base.axes || []).filter(isObserved).map((a) => [a.name, a.score]));
      const axisDelta = {};
      // Only axes observed in both the baseline and this variant get a delta.
      for (const a of (r.axes || []).filter(isObserved)) {
        const baseScore = baseAxes[a.name];
        if (baseScore !== undefined) axisDelta[a.name] = round(a.score - baseScore);
      }
      lift.push({ variant: variantOf(r), aggregateDelta: round(r.aggregate - base.aggregate), axisDelta });
    }
  }

  if (asJson) {
    const payload = {
      scenario,
      variants: variants.map((r) => ({ variant: variantOf(r), aggregate: r.aggregate, pass: r.pass, judge: r.judge, categoryScores: r.categoryScores })),
      ...(lift ? { baseline, lift } : {}),
    };
    console.log(JSON.stringify(payload, null, 2));
    process.exit(0);
  }

  console.log(`AgentRig — variant comparison for "${scenario}"\n`);
  if (variants.length === 0) console.log(" No results for that scenario.");
  for (const r of variants) {
    console.log(` ${variantOf(r).padEnd(12)} ${r.aggregate.toFixed(2)} ${r.pass ? "PASS" : "FAIL"} (${r.judge})`);
    for (const [c, s] of Object.entries(r.categoryScores || {})) {
      console.log(` ${c.padEnd(20)} ${s.toFixed(2)}`);
    }
  }
  if (lift) {
    console.log(`\n Harness lift vs baseline "${baseline}":`);
    for (const l of lift) {
      const sign = l.aggregateDelta > 0 ? "+" : "";
      let verdict = "no change";
      if (l.aggregateDelta > 0) verdict = "HELPS";
      else if (l.aggregateDelta < 0) verdict = "HURTS";
      console.log(` ${l.variant.padEnd(12)} aggregate ${sign}${l.aggregateDelta.toFixed(2)} → harness ${verdict}`);
      // Unchanged axes are omitted to keep the listing short.
      for (const [name, d] of Object.entries(l.axisDelta)) {
        if (d !== 0) console.log(` ${name.padEnd(20)} ${d > 0 ? "+" : ""}${d.toFixed(2)}`);
      }
    }
  }
  process.exit(0);
}
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env node
2
+ // AgentRig static harness audit (principle 6) — deterministic, dependency-free, no model.
3
+ // Interprets checks.json (the single source of truth, shared with `agentrig eval --static`)
4
+ // against this repository and prints a Harness Score. Usage:
5
+ // node .agentrig/eval/static-audit.mjs human-readable report
6
+ // node .agentrig/eval/static-audit.mjs --json machine-readable
7
+ // node .agentrig/eval/static-audit.mjs --min 80 exit non-zero if score < 80%
8
+ import { readFileSync, existsSync, statSync, readdirSync } from "node:fs";
9
+ import { fileURLToPath } from "node:url";
10
+ import { dirname, join, resolve } from "node:path";
11
+
12
// Paths: this script is expected to live two directories below the repository root.
const scriptDir = dirname(fileURLToPath(import.meta.url));
const repoRoot = resolve(scriptDir, "..", "..");
const checksPath = join(scriptDir, "checks.json");

/**
 * Parse the optional `--min <percent>` threshold from the argument list.
 *
 * @param {string[]} argv - Command-line tokens.
 * @returns {number|null} The threshold, or null when `--min` was not given.
 *
 * Exits with status 2 when `--min` has no value or a non-numeric one. Without this check the
 * value becomes NaN, `pct < NaN` is always false, and the score gate would silently pass.
 */
function parseMinPct(argv) {
  const at = argv.indexOf("--min");
  if (at < 0) return null;
  const raw = argv[at + 1];
  const value = raw === undefined || raw.trim() === "" ? NaN : Number(raw);
  if (!Number.isFinite(value)) {
    console.error("error: --min requires a numeric percentage, e.g. --min 80");
    process.exit(2);
  }
  return value;
}

const args = process.argv.slice(2);
const asJson = args.includes("--json");
const minPct = parseMinPct(args);

// Resolve a repo-relative path, and read a repo-relative file (null when it does not exist).
const rel = (p) => resolve(repoRoot, p);
const read = (p) => (existsSync(rel(p)) ? readFileSync(rel(p), "utf8") : null);
23
+
24
/**
 * Extract the frontmatter block from a document that begins with "---".
 *
 * @param {string|null} text - Document text.
 * @returns {string|null} The text between the opening "---" and the next line starting with
 *   "---", or null when the text is empty, does not open with "---", or is never closed.
 */
function frontmatter(text) {
  const OPEN = "---";
  if (!text) return null;
  if (!text.startsWith(OPEN)) return null;
  const closeAt = text.indexOf("\n---", OPEN.length);
  return closeAt < 0 ? null : text.slice(OPEN.length, closeAt);
}
30
/**
 * Find the first `key: value` line in a block of text and return the trimmed value.
 *
 * @param {string|null} text - Text to search.
 * @param {string} key - Key to look for; matched literally.
 * @returns {string|null} The value after the colon, or null when the text is empty or no line
 *   starts with the key.
 */
function extractValue(text, key) {
  if (!text) return null;
  // Escape regex metacharacters so a key such as "a.b" cannot match "axb" and a key containing
  // "(" or "[" cannot produce an invalid pattern.
  const literalKey = String(key).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const m = text.match(new RegExp("^\\s*" + literalKey + "\\s*:\\s*(.+)\\s*$", "m"));
  return m ? m[1].trim() : null;
}
35
+
36
/** Escape regex metacharacters so a string can be embedded in a RegExp and matched literally. */
function escapeRegExp(s) {
  return String(s).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/**
 * Evaluate one check from checks.json against the repository.
 *
 * @param {object} c - Check definition; `c.type` selects the rule and the remaining fields
 *   (`path`, `patterns`, `min`, `keys`, `developer`, `reviewer`, `key`) parameterise it.
 * @returns {{score: number, evidence: string}} 1 for full credit, 0.5 for partial, 0 for a miss;
 *   `evidence` explains anything short of full credit. An unknown type scores 0.
 */
function scoreCheck(c) {
  switch (c.type) {
    case "path-exists": {
      // Probe the filesystem once and reuse the answer for both fields.
      const found = existsSync(rel(c.path));
      return { score: found ? 1 : 0, evidence: found ? "" : `missing ${c.path}` };
    }
    case "file-contains": {
      const text = read(c.path);
      if (text == null) return { score: 0, evidence: `missing ${c.path}` };
      const missing = (c.patterns || []).filter((p) => !text.includes(p));
      if (missing.length === 0) return { score: 1, evidence: "" };
      return { score: 0.5, evidence: `present but missing markers: ${missing.join(", ")}` };
    }
    case "dir-min": {
      const abs = rel(c.path);
      if (!existsSync(abs) || !statSync(abs).isDirectory()) return { score: 0, evidence: `missing dir ${c.path}` };
      // Resolve the default once so the message reports the threshold actually applied
      // (it previously printed "need undefined" when `min` was omitted).
      const required = c.min || 1;
      // Dot-entries are not counted.
      const n = readdirSync(abs).filter((e) => !e.startsWith(".")).length;
      if (n >= required) return { score: 1, evidence: "" };
      return { score: 0.5, evidence: `${n} entr${n === 1 ? "y" : "ies"}, need ${required}` };
    }
    case "frontmatter-keys": {
      const fm = frontmatter(read(c.path));
      if (fm == null) return { score: 0, evidence: `no frontmatter in ${c.path}` };
      // Keys are matched literally; without escaping, "a.b" would be satisfied by "axb".
      const missing = (c.keys || []).filter((k) => !new RegExp("^\\s*" + escapeRegExp(k) + "\\s*:", "m").test(fm));
      if (missing.length === 0) return { score: 1, evidence: "" };
      return { score: 0.5, evidence: `missing keys: ${missing.join(", ")}` };
    }
    case "roles-distinct-models": {
      const dev = extractValue(read(c.developer), c.key || "model");
      const rev = extractValue(read(c.reviewer), c.key || "model");
      if (!dev || !rev) return { score: 0, evidence: "developer/reviewer model not declared" };
      if (dev !== rev) return { score: 1, evidence: "" };
      return { score: 0.5, evidence: `developer and reviewer share model "${dev}"` };
    }
    default:
      return { score: 0, evidence: `unknown check type ${c.type}` };
  }
}
72
+
73
// --- main: load checks.json, score every check, print the report, enforce --min --------------
const checksFilePresent = existsSync(checksPath);
if (!checksFilePresent) {
  console.error(`checks.json not found at ${checksPath}`);
  process.exit(2);
}
const { checks } = JSON.parse(readFileSync(checksPath, "utf8"));
// Each result is the check definition overlaid with its { score, evidence }.
const results = checks.map((c) => ({ ...c, ...scoreCheck(c) }));

// Weighted totals drive the overall score; per-principle tallies are plain (unweighted) sums.
let wSum = 0;
let wScore = 0;
const byPrinciple = new Map();
for (const r of results) {
  const w = r.weight ?? 1;
  wSum += w;
  wScore += w * r.score;
  const tally = byPrinciple.get(r.principle) || { sum: 0, n: 0 };
  tally.sum += r.score;
  tally.n += 1;
  byPrinciple.set(r.principle, tally);
}
const aggregate = wSum ? wScore / wSum : 0;
// Percentage with one decimal place.
const pct = Math.round(aggregate * 1000) / 10;

if (asJson) {
  const principles = [...byPrinciple.entries()]
    .sort((a, b) => a[0] - b[0])
    .map(([principle, v]) => ({ principle, score: v.sum / v.n }));
  const checkRows = results.map((r) => ({ id: r.id, principle: r.principle, title: r.title, score: r.score, evidence: r.evidence }));
  console.log(JSON.stringify({
    harnessScore: pct,
    aggregate,
    principles,
    checks: checkRows,
  }, null, 2));
} else {
  console.log("AgentRig — harness audit\n");
  // Order by principle, then by check id.
  results.sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
  for (const r of results) {
    let tag = "FAIL";
    if (r.score === 1) tag = "PASS";
    else if (r.score === 0.5) tag = "PART";
    console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
  }
  const fullCredit = results.filter((r) => r.score === 1).length;
  console.log(`\n Harness Score: ${pct}% (${fullCredit}/${results.length} checks full credit)`);
}

// Gate: a score below the requested minimum exits with status 1.
if (minPct != null && pct < minPct) {
  if (!asJson) console.error(`\nHarness Score ${pct}% is below required ${minPct}%`);
  process.exit(1);
}
@@ -0,0 +1,53 @@
1
+ # Orchestration contract (principles 1, 3, 10)
2
+
3
+ How AgentRig expects a harness engine to drive `.agentrig/harness/state-machine.yml`. AgentRig
4
+ *installs* this contract as plain text; a runner (the epi-platform engine, a CI job, or your own
5
+ script) executes it. Synthesized from epichan's engine.
6
+
7
+ ## 1. Triggers: who may move state
8
+ Every transition declares a `trigger` kind (`triggers.kinds` in the state machine):
9
+ - **agent** — a role does the work (triager/developer/reviewer/judge).
10
+ - **script** — a poller/reconciler asserts state from GitHub on a cadence.
11
+ - **auto** — a deterministic immediate transition.
12
+ - **event** — a GitHub webhook maps straight to a state (`event_to_state`).
13
+ - **human** — a person performs a low-reversibility action (e.g. merge).
14
+
15
+ Agents may only drive `agent` transitions. They must never fabricate `script`/`event`/`human` ones.
16
+
17
+ ## 2. Hybrid event + polling reconciliation
18
+ - **Events** give low-latency reactions: `pull_request.synchronize → reviewing`,
19
+ `check_suite.completed.failure → implementing`, etc. (`event_to_state`).
20
+ - **Pollers** repair anything events missed, on per-state cadences (`reconciliation`): ingest ready
21
+ issues (60s), route task PRs (300s), confirm merges (120s).
22
+ - The engine never *assumes* a PR merged — it flips `ready_to_merge → merged` only after GitHub
23
+ reports it merged.
24
+
25
+ If the engine crashes, GitHub still holds the truth and the pollers re-derive engine state.
26
+
27
+ ## 3. Compare-and-set transitions (no double-work)
28
+ Every state change carries the **status it expects to replace** (`transitions_policy.require_expected_status`).
29
+ If the engine reports a conflict (409), another agent or poller already advanced the task — **skip,
30
+ don't clobber**. This is the core guard that lets a multi-agent pool run safely.
31
+
32
+ ## 4. Claim grace + stuck recovery
33
+ - A task isn't "reclaimable" until `recovery.claim_grace_seconds` (300s) after it's claimed — avoids
34
+ yanking work that just started.
35
+ - A recovery sweep every `recovery.scan_seconds` (120s) re-queues anything stuck past
36
+ `recovery.stuck_after_hours` (4h) back to `requeue_to` (`queued`).
37
+
38
+ ## 5. Hooks gate irreversible actions
39
+ `hooks.pre_pr` and `hooks.pre_merge` run before opening a PR and before merging. A failing hook
40
+ blocks the transition. Put `self-verify` on `pre_pr` and `harness-eval` on `pre_merge`.
41
+
42
+ ## 6. Hard limits and runaway protection
43
+ `limits` caps concurrency, review iterations, diff size, and total token usage (`runaway_token_cap`). These keep
44
+ an agent pool from melting the repo. Protected/human-only labels (`labels.human_only`) require a
45
+ person.
46
+
47
+ ## 7. Model tiers
48
+ Roles reference a `model_tier` (cheap/standard/premium), not a hardcoded model, so cost/quality is
49
+ re-routable in one place. Keep adjacent pipeline roles on different model families.
50
+
51
+ ## 8. Progress visibility
52
+ `issue_comments.on` posts a GitHub comment on task creation, each state transition, failure, and PR
53
+ open — so humans can follow the pool without watching the engine.
@@ -0,0 +1,105 @@
1
+ # AgentRig harness state machine
2
+ # Principle 1 (explicit state machine), 3 (system of record), 9 (human gates), 10 (hard limits).
3
+ # The DAG is the contract. Agents do not invent transitions; reviewers cannot skip gates.
4
+
5
+ version: 1
6
+
7
+ states:
8
+ - name: ingested # issue/task picked up
9
+ - name: queued # ready for an agent
10
+ - name: implementing # developer role is working
11
+ - name: reviewing # reviewer role is judging the diff
12
+ - name: judging # independent judge scores against the rubric
13
+ - name: ready_to_merge # all gates green
14
+ - name: merged
15
+ - name: closed
16
+ - name: parked # self-parked: needs a human (low reversibility)
17
+
18
+ transitions:
19
+ - { from: ingested, to: queued, trigger: agent, role: triager, gate: human_approval }
20
+ - { from: queued, to: implementing, trigger: agent, role: developer }
21
+ - { from: implementing, to: reviewing, trigger: agent, role: developer, gate: self_verify_passed }
22
+ - { from: reviewing, to: judging, trigger: agent, role: reviewer }
23
+ - { from: reviewing, to: implementing, trigger: agent, role: reviewer, reason: changes_requested }
24
+ - { from: judging, to: ready_to_merge, trigger: agent, role: judge, gate: rubric_passed }
25
+ - { from: judging, to: implementing, trigger: agent, role: judge, reason: below_threshold }
26
+ - { from: ready_to_merge, to: merged, trigger: human, gate: human_approval } # principle 9
27
+ - { from: any, to: parked, trigger: auto, reason: low_reversibility_or_stuck }
28
+ - { from: any, to: closed, trigger: human }
29
+
30
+ # Principle 3: GitHub labels mirror DAG state. Human-only labels must never be applied by an agent.
31
+ labels:
32
+ state_map:
33
+ queued: agentrig-ready
34
+ implementing: agentrig-started
35
+ reviewing: agentrig-in-review
36
+ ready_to_merge: agentrig-approved
37
+ human_only:
38
+ - acknowledge-breaking-change
39
+ - override-protected-files
40
+
41
+ # Principle 10: hard limits and safety nets.
42
+ limits:
43
+ max_concurrent_agents: 4
44
+ max_review_iterations: 5
45
+ max_diff_chars: 50000
46
+ runaway_token_cap: 5000000
47
+ recovery_scan_seconds: 120
48
+ requeue_if_stuck_hours: 4
49
+
50
+ hooks:
51
+ pre_pr:
52
+ - self-verify
53
+ pre_merge:
54
+ - harness-eval
55
+
56
+ # --- Trigger taxonomy (principle 1) ------------------------------------------
57
+ # Every transition above is driven by one of these trigger kinds. Making the kind explicit keeps
58
+ # orchestration debuggable: agents only drive `agent` transitions; everything else is automation.
59
+ triggers:
60
+ kinds:
61
+ agent: "An agent role performs the work and reports the result."
62
+ script: "A poller/reconciler script asserts state from GitHub on a cadence."
63
+ auto: "A deterministic, immediate transition (no work, e.g. legacy normalization)."
64
+ event: "A GitHub webhook event maps directly to a state (see event_to_state)."
65
+ human: "A person performs a low-reversibility action (e.g. merge approval)."
66
+
67
+ # Reactive transitions: GitHub webhook events map straight to a state (low latency), while the
68
+ # pollers below repair anything missed (self-healing). Adopted from epichan.
69
+ event_to_state:
70
+ "pull_request.review_comment": implementing # reviewer asked for changes -> back to dev
71
+ "check_suite.completed.failure": implementing # CI went red -> fix it
72
+ "pull_request.synchronize": reviewing # new commits pushed -> re-review
73
+ "conflict_detected": implementing
74
+
75
+ # Concurrency-safe transitions: a state change must declare the status it expects to replace
76
+ # (compare-and-set). A mismatch (409) means another agent/poller already moved it — skip, don't
77
+ # clobber. This is what prevents double-work in a multi-agent pool.
78
+ transitions_policy:
79
+ require_expected_status: true # every transition passes --expected-status; 409 => safe skip
80
+
81
+ # --- Reconciliation & recovery (principle 3, 10) -----------------------------
82
+ # GitHub is the system of record; pollers re-assert engine state from it on a cadence, and a
83
+ # recovery sweep re-queues abandoned work. Latency vs. cost is tuned per cadence.
84
+ reconciliation:
85
+ poll_ready_issues_seconds: 60 # ingest newly-ready issues
86
+ poll_task_prs_seconds: 300 # route open task PRs by live health
87
+ reconcile_merged_seconds: 120 # only mark merged once GitHub says merged
88
+ recovery:
89
+ enabled: true
90
+ scan_seconds: 120
91
+ claim_grace_seconds: 300 # don't reclaim a task until the claim has had time to start
92
+ stuck_after_hours: 4 # past this with no progress -> requeue
93
+ requeue_to: queued
94
+
95
+ # Progress visibility: post a GitHub comment on these lifecycle events so humans can follow along.
96
+ issue_comments:
97
+ on: [task_created, state_transition, task_failed, pr_opened]
98
+
99
+ # --- Model tiers (principle 2) -----------------------------------------------
100
+ # Roles reference a tier, not a hardcoded model, so you can re-route cost/quality in one place.
101
+ # Keep adjacent pipeline roles on DIFFERENT model families (single-model-bias mitigation).
102
+ model_tiers:
103
+ cheap: { models: [claude-haiku-4.5, gpt-5-mini], use: "triage, high-volume analysis" }
104
+ standard: { models: [claude-sonnet-4.5, gpt-5.4], use: "implementation" }
105
+ premium: { models: [claude-opus-4.5, gpt-5], use: "review, judging, auditing" }
@@ -0,0 +1,12 @@
1
+ {
2
+ "$comment": "AgentRig MCP config (principle 11). The same servers should be mirrored to .vscode/mcp.json and .github/copilot/mcp.json so every vendor CLI sees the same tools.",
3
+ "mcpServers": {
4
+ "github": {
5
+ "command": "npx",
6
+ "args": ["-y", "@modelcontextprotocol/server-github"],
7
+ "env": {
8
+ "GITHUB_PERSONAL_ACCESS_TOKEN": "${GITHUB_PERSONAL_ACCESS_TOKEN}"
9
+ }
10
+ }
11
+ }
12
+ }
@@ -0,0 +1,32 @@
1
+ # Rules (principle 4)
2
+
3
+ Rules are **reflexes**: short, glob-scoped instructions auto-loaded when a matching file is edited.
4
+ Unlike skills (which are procedures you invoke), rules apply passively to every edit in scope.
5
+
6
+ ## Priority order
7
+ Each rule declares a `priority` in its frontmatter. When multiple rules match, lower numbers win on
8
+ conflict:
9
+ 1. **Specialized / security** (`security.md`, framework- or area-specific rules) — `priority: 1`
10
+ 2. **Review & accessibility** (`code-review.md`, any a11y rules) — `priority: 2`
11
+ 3. **Baseline coding standards** (`coding-standards.md`, `no-debug-logging.md`) — `priority: 3`
12
+
13
+ ## Default rules installed
14
+ - `security.md` — secrets, input validation, injection, least privilege (priority 1).
15
+ - `code-review.md` — what a reviewer should/shouldn't flag, to keep review high-signal (priority 2).
16
+ - `coding-standards.md` — baseline change discipline (priority 3).
17
+ - `no-debug-logging.md` — no stray debug output/`debugger` in committed code (priority 3).
18
+
19
+ ## Authoring a rule
20
+ Start each rule with frontmatter declaring its glob scope, a one-line description, and a priority:
21
+
22
+ ```markdown
23
+ ---
24
+ globs: ["src/**/*.ts"]
25
+ description: One-line summary of the reflex.
26
+ priority: 1
27
+ ---
28
+ ```
29
+
30
+ Keep rules to a handful of imperative bullets. If a rule grows into a procedure, promote it to a
31
+ skill under `.agents/skills/`. Replace these generic defaults with repo-specific standards and add
32
+ specialized, glob-scoped rules alongside them.
@@ -0,0 +1,26 @@
1
+ ---
2
+ globs: ["**/*"]
3
+ description: What an AI reviewer should and should not flag. Keeps review high-signal.
4
+ priority: 2
5
+ ---
6
+
7
+ # Code review rules (reflex)
8
+
9
+ When reviewing a diff, keep signal high. **Do NOT:**
10
+ - Suggest adding comments/JSDoc/docstrings to self-explanatory code.
11
+ - Comment on formatting/style handled by a formatter or linter.
12
+ - Flag missing imports or type errors the compiler/build already catches.
13
+ - Suggest adding `console.log`/`print`/debug statements.
14
+ - Claim the build will fail when CI is green.
15
+ - Bikeshed naming when the existing name is clear enough.
16
+ - Re-review unchanged lines or restate what the diff obviously does.
17
+
18
+ **DO flag (these are the point of review):**
19
+ - Correctness bugs, logic errors, off-by-ones, unhandled edge cases.
20
+ - Security issues (see `security.md`) and unsafe input handling.
21
+ - Concurrency/race conditions and resource leaks.
22
+ - Missing or wrong tests for changed behavior.
23
+ - Public API/contract breaks and likely regressions.
24
+
25
+ Score findings by confidence and **only surface blocking issues plus genuinely useful suggestions**.
26
+ If you would request changes, give a concrete, testable reason.
@@ -0,0 +1,15 @@
1
+ ---
2
+ globs: ["**/*"]
3
+ description: Baseline coding standards applied to every change in this repo.
+ priority: 3
4
+ ---
5
+
6
+ # Coding standards (reflex)
7
+
8
+ - Make the smallest change that fully solves the task; do not refactor unrelated code.
9
+ - Match the surrounding style; do not introduce a new formatter/linter without being asked.
10
+ - No secrets in source. No disabling tests to make CI green.
11
+ - Add or update tests for behavior you change.
12
+ - Prefer clear names over comments; comment only non-obvious intent.
13
+
14
+ > AgentRig installs this as a generic baseline. Replace it with repo-specific standards and add
15
+ > specialized, glob-scoped rules alongside it.
@@ -0,0 +1,16 @@
1
+ ---
2
+ globs: ["**/*"]
3
+ description: No stray debug output or debugger statements in committed code.
4
+ priority: 3
5
+ ---
6
+
7
+ # No debug logging left behind (reflex)
8
+
9
+ - Don't commit `console.log`/`console.debug`, `print`, `dbg!`, `fmt.Println` debug spew, or
10
+ `debugger;` statements added while investigating.
11
+ - Use the project's existing logger/abstraction for intentional, structured logging — match what the
12
+ surrounding code already uses; don't introduce a new logging mechanism unasked.
13
+ - Temporary diagnostics are fine while iterating, but remove them before `self-verify`/handoff.
14
+
15
+ This is a baseline. If the repo has a specific logger convention, encode it as a specialized,
16
+ glob-scoped rule that takes priority over this one.
@@ -0,0 +1,23 @@
1
+ ---
2
+ globs: ["**/*"]
3
+ description: Security reflexes applied to every change. Specialized — highest priority.
4
+ priority: 1
5
+ ---
6
+
7
+ # Security rules (reflex)
8
+
9
+ Apply on every edit. When in doubt, stop and flag rather than guess.
10
+
11
+ - **No secrets in source.** Never commit tokens, keys, passwords, or connection strings. Read them
12
+ from environment/secret stores. If you spot a committed secret, stop and report it.
13
+ - **Validate and sanitize all external input** (request bodies, query params, CLI args, file
14
+ contents, env). Reject/normalize before use.
15
+ - **No injection.** Use parameterized queries; never string-concatenate SQL/shell/HTML from input.
16
+ Avoid `eval`, dynamic `require`, and shelling out with unsanitized input.
17
+ - **Escape on output** to prevent XSS; use the framework's escaping, not a hand-rolled routine.
18
+ - **Least privilege.** Don't broaden file, network, or token scopes to make something work.
19
+ - **Don't disable security controls** (auth checks, CSRF, TLS verification, lint security rules) to
20
+ pass a test or unblock a build.
21
+ - **Dependencies:** prefer maintained, pinned versions; don't add a dependency just to avoid writing a few lines of code.
22
+
23
+ If a change touches auth, crypto, or input boundaries, call it out explicitly for review.