@doidor/agentrig 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -27
- package/dist/agent/copilot.js +46 -5
- package/dist/agent/copilot.js.map +1 -1
- package/dist/cli.js +30 -5
- package/dist/cli.js.map +1 -1
- package/dist/commands/doctor.js +53 -8
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/eval-dynamic.js +316 -0
- package/dist/commands/eval-dynamic.js.map +1 -0
- package/dist/commands/eval-scaffold.js +173 -0
- package/dist/commands/eval-scaffold.js.map +1 -0
- package/dist/commands/eval.js +184 -55
- package/dist/commands/eval.js.map +1 -1
- package/dist/core/audit.js +237 -9
- package/dist/core/audit.js.map +1 -1
- package/dist/core/model-family.js +31 -0
- package/dist/core/model-family.js.map +1 -0
- package/dist/core/scenario-runner.js +298 -0
- package/dist/core/scenario-runner.js.map +1 -0
- package/dist/prompts/index.js +121 -30
- package/dist/prompts/index.js.map +1 -1
- package/knowledge/PRINCIPLES.md +2 -2
- package/knowledge/manifest.json +16 -1
- package/knowledge/templates/AGENTS.md +7 -6
- package/knowledge/templates/agents/README.md +4 -4
- package/knowledge/templates/agents/developer.yml +1 -1
- package/knowledge/templates/agents/judge.yml +1 -1
- package/knowledge/templates/agents/reviewer.yml +1 -1
- package/knowledge/templates/agents/triager.yml +5 -4
- package/knowledge/templates/dashboard/dashboard.mjs +12 -5
- package/knowledge/templates/eval/RUBRIC.md +87 -64
- package/knowledge/templates/eval/axes.json +25 -25
- package/knowledge/templates/eval/calibration/README.md +54 -0
- package/knowledge/templates/eval/calibration/review/seed-correct.yml +43 -0
- package/knowledge/templates/eval/calibration/run/seed-correct.yml +35 -0
- package/knowledge/templates/eval/calibration/run/seed-no-verify.yml +34 -0
- package/knowledge/templates/eval/checks.json +88 -11
- package/knowledge/templates/eval/scenarios/add-small-feature/README.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/SPEC.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/src/slugify.js +5 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/fixture/tests/feature.test.js +31 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/judge_brief.md +25 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/oracle.yml +41 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/prompt.md +17 -0
- package/knowledge/templates/eval/scenarios/add-small-feature/scenario.yml +22 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/README.md +18 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/package.json +9 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/src/math.js +13 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/add.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/divide.test.js +11 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/fixture/tests/multiply.test.js +7 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/judge_brief.md +20 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/oracle.yml +33 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/prompt.md +12 -0
- package/knowledge/templates/eval/scenarios/fix-failing-test/scenario.yml +23 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/README.md +17 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/package.json +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/format.js +4 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/baseline/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/format.js +6 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/fixture/change/src/pagination.js +7 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/judge_brief.md +38 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/oracle.yml +29 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/prompt.md +33 -0
- package/knowledge/templates/eval/scenarios/review-catches-bug/scenario.yml +23 -0
- package/knowledge/templates/eval/score.mjs +368 -42
- package/knowledge/templates/eval/static-audit.mjs +204 -17
- package/knowledge/templates/harness/state-machine.yml +18 -12
- package/knowledge/templates/skills/harness-eval/SKILL.md +59 -54
- package/knowledge/templates/skills/log-gotcha/SKILL.md +68 -0
- package/knowledge/templates/skills/self-verify/SKILL.md +32 -8
- package/package.json +4 -3
- package/knowledge/templates/eval/scenarios/README.md +0 -24
- package/knowledge/templates/eval/scenarios/add-small-feature.md +0 -28
- package/knowledge/templates/eval/scenarios/fix-failing-test.md +0 -27
- package/knowledge/templates/eval/scenarios/review-catches-bug.md +0 -30
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
// AgentRig static harness audit (principle 6) — deterministic, dependency-free, no model.
|
|
3
3
|
// Interprets checks.json (the single source of truth, shared with `agentrig eval --static`)
|
|
4
|
-
// against this repository and prints
|
|
4
|
+
// against this repository and prints an Install Completeness + Quality Probes report. Usage:
|
|
5
5
|
// node .agentrig/eval/static-audit.mjs human-readable report
|
|
6
6
|
// node .agentrig/eval/static-audit.mjs --json machine-readable
|
|
7
|
-
// node .agentrig/eval/static-audit.mjs --min 80 exit non-zero if
|
|
7
|
+
// node .agentrig/eval/static-audit.mjs --min 80 exit non-zero if completeness < 80%
|
|
8
8
|
import { readFileSync, existsSync, statSync, readdirSync } from "node:fs";
|
|
9
9
|
import { fileURLToPath } from "node:url";
|
|
10
10
|
import { dirname, join, resolve } from "node:path";
|
|
@@ -33,6 +33,66 @@ function extractValue(text, key) {
|
|
|
33
33
|
return m ? m[1].trim() : null;
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
+
// Mirror of src/core/model-family.ts. Kept inline to keep this script dep-free.
// Each entry is [family name, pattern]. Patterns are anchored at the start of the id and,
// where a vendor is listed, tolerate an optional "<vendor>." / "<vendor>/" / "<vendor>-" prefix.
// First match wins, so order matters only if two patterns could match the same id.
const FAMILY_PATTERNS = [
  ["anthropic-claude", /^(anthropic[\.\/-])?claude([-_\.]|$)/i],
  ["openai-gpt", /^(openai[\.\/-])?(gpt|o[1-9]|codex|davinci|chatgpt)([-_\.]|$)/i],
  ["google-gemini", /^(google[\.\/-])?(gemini|palm|bard|flash)([-_\.]|$)/i],
  ["mistral", /^(mistral|mixtral|codestral|ministral)([-_\.]|$)/i],
  ["deepseek", /^deepseek([-_\.]|$)/i],
  ["meta-llama", /^(meta[\.\/-])?(llama|code-?llama)([-_\.]|$)/i],
  ["xai-grok", /^(xai[\.\/-])?grok([-_\.]|$)/i],
  ["cohere", /^(cohere[\.\/-])?(command|aya)([-_\.]|$)/i],
  ["qwen", /^qwen([-_\.]|$)/i],
];

/**
 * Map a model id to a coarse family label.
 *
 * @param {string} id - Model id, e.g. "claude-sonnet-4.6" or "openai/gpt-5.5".
 * @returns {string} The matching family name from FAMILY_PATTERNS; "unknown:<prefix>" (the
 *   lower-cased leading alphanumeric run) when nothing matches; "" for a missing or empty id.
 */
function modelFamily(id) {
  if (!id) return "";
  // Normalise before matching: the patterns are start-anchored, so a value that still carries
  // YAML quotes or padding (`"claude-opus-4.8"`) would otherwise miss every pattern and fall
  // through to a per-id "unknown:" label, making two same-family ids look distinct.
  // NOTE(review): this normalisation is local to this script — confirm whether
  // src/core/model-family.ts needs the same treatment to stay in sync.
  const name = String(id).trim().replace(/^["']+|["']+$/g, "").trim();
  if (!name) return "";
  for (const [family, rx] of FAMILY_PATTERNS) {
    if (rx.test(name)) return family;
  }
  const m = name.match(/^([a-z0-9]+)/i);
  return m ? `unknown:${m[1].toLowerCase()}` : `unknown:${name}`;
}
|
|
54
|
+
|
|
55
|
+
// Line-oriented mini-YAML reader good enough for state-machine.yml.
/**
 * Extract state names and transitions from state-machine.yml text.
 *
 * Only two top-level sections are read: `states:` (items shaped `- name: X`) and
 * `transitions:` (items carrying `from:` and `to:`, optionally `trigger:`). Any other
 * unindented line ends the current section.
 *
 * @param {string|null|undefined} text - Raw file contents.
 * @returns {{states: string[], transitions: {from: string, to: string, trigger: (string|undefined)}[]}}
 *   Empty arrays when `text` is empty or missing.
 */
function readStateMachine(text) {
  const states = [];
  const transitions = [];
  if (!text) return { states, transitions };

  // Pull `key: value` out of a list item; the value ends at a comma, whitespace or "}".
  const field = (item, key) =>
    (item.match(new RegExp("\\b" + key + ":\\s*([^,\\s}]+)")) || [])[1];

  let section = null; // "states" | "transitions" | null
  for (const raw of text.split(/\r?\n/)) {
    // Strip comments BEFORE looking at structure, so that a column-0 comment inside a section
    // does not terminate it and a header such as `states:  # lifecycle` is still recognised.
    const line = raw.replace(/#.*$/, "").trimEnd();
    if (line.trim() === "") continue; // blank or comment-only line

    if (/^\S/.test(line)) {
      // Unindented content: either one of our two headers or the start of an unrelated section.
      const header = line.match(/^(states|transitions):$/);
      section = header ? header[1] : null;
      continue;
    }

    if (!line.trim().startsWith("-")) continue; // only list items carry data

    if (section === "states") {
      const m = line.match(/-\s*name:\s*(\S+)/);
      if (m) states.push(m[1]);
    } else if (section === "transitions") {
      // Accept both legacy ("- from: A to: B trigger: ...") and proper flow-mapping
      // ("- { from: A, to: B, trigger: ... }") syntaxes.
      const item = line.replace(/^\s*-\s*/, "").replace(/^\{|\}$/g, "");
      const from = field(item, "from");
      const to = field(item, "to");
      const trigger = field(item, "trigger");
      if (from && to) transitions.push({ from, to, trigger });
    }
  }
  return { states, transitions };
}
|
|
82
|
+
/**
 * Report whether `dst` can be reached from `src` by following edges in `adj`.
 *
 * @param {Map<string, Iterable<string>>} adj - Adjacency map: node -> iterable of successors.
 *   Nodes absent from the map are treated as having no outgoing edges.
 * @param {string} src - Start node.
 * @param {string} dst - Target node.
 * @returns {boolean} true when `src === dst` or some chain of edges leads from `src` to `dst`.
 */
function hasPath(adj, src, dst) {
  if (src === dst) return true;
  const seen = new Set([src]);
  // Reachability does not depend on visit order, so use the array as a stack: pop() is O(1),
  // whereas shift() re-indexes the whole array on every dequeue.
  const pending = [src];
  while (pending.length > 0) {
    const cur = pending.pop();
    for (const next of adj.get(cur) || []) {
      if (next === dst) return true;
      if (!seen.has(next)) {
        seen.add(next);
        pending.push(next);
      }
    }
  }
  return false;
}
|
|
95
|
+
|
|
36
96
|
function scoreCheck(c) {
|
|
37
97
|
switch (c.type) {
|
|
38
98
|
case "path-exists":
|
|
@@ -58,6 +118,25 @@ function scoreCheck(c) {
|
|
|
58
118
|
if (missing.length === 0) return { score: 1, evidence: "" };
|
|
59
119
|
return { score: 0.5, evidence: `missing keys: ${missing.join(", ")}` };
|
|
60
120
|
}
|
|
121
|
+
case "frontmatter-keys-all": {
|
|
122
|
+
const dir = c.path, fileName = c.file || "SKILL.md";
|
|
123
|
+
const abs = rel(dir);
|
|
124
|
+
if (!existsSync(abs) || !statSync(abs).isDirectory()) return { score: 0, evidence: `missing dir ${dir}` };
|
|
125
|
+
const keys = c.keys || [];
|
|
126
|
+
const offenders = [];
|
|
127
|
+
for (const entry of readdirSync(abs)) {
|
|
128
|
+
if (entry.startsWith(".") || entry.startsWith("_")) continue;
|
|
129
|
+
const subAbs = join(abs, entry);
|
|
130
|
+
if (!statSync(subAbs).isDirectory()) continue;
|
|
131
|
+
const filePath = join(subAbs, fileName);
|
|
132
|
+
if (!existsSync(filePath)) { offenders.push(`${entry}/${fileName} missing`); continue; }
|
|
133
|
+
const fm = frontmatter(readFileSync(filePath, "utf8"));
|
|
134
|
+
if (fm == null) { offenders.push(`${entry} no frontmatter`); continue; }
|
|
135
|
+
const missing = keys.filter((k) => !new RegExp("^\\s*" + k + "\\s*:", "m").test(fm));
|
|
136
|
+
if (missing.length) offenders.push(`${entry} missing ${missing.join("/")}`);
|
|
137
|
+
}
|
|
138
|
+
return offenders.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: offenders.join("; ") };
|
|
139
|
+
}
|
|
61
140
|
case "roles-distinct-models": {
|
|
62
141
|
const dev = extractValue(read(c.developer), c.key || "model");
|
|
63
142
|
const rev = extractValue(read(c.reviewer), c.key || "model");
|
|
@@ -65,6 +144,95 @@ function scoreCheck(c) {
|
|
|
65
144
|
if (dev !== rev) return { score: 1, evidence: "" };
|
|
66
145
|
return { score: 0.5, evidence: `developer and reviewer share model "${dev}"` };
|
|
67
146
|
}
|
|
147
|
+
case "roles-distinct-families": {
|
|
148
|
+
const dev = extractValue(read(c.developer), c.key || "model");
|
|
149
|
+
const rev = extractValue(read(c.reviewer), c.key || "model");
|
|
150
|
+
if (!dev || !rev) return { score: 0, evidence: "developer/reviewer model not declared" };
|
|
151
|
+
const sameFamily = modelFamily(dev) === modelFamily(rev);
|
|
152
|
+
if (!sameFamily) return { score: 1, evidence: "" };
|
|
153
|
+
return { score: 0, evidence: `developer "${dev}" and reviewer "${rev}" share a model family` };
|
|
154
|
+
}
|
|
155
|
+
case "state-machine-dag": {
|
|
156
|
+
const text = read(c.path);
|
|
157
|
+
if (text == null) return { score: 0, evidence: `missing ${c.path}` };
|
|
158
|
+
const { states, transitions } = readStateMachine(text);
|
|
159
|
+
const minStates = c.minStates ?? 6;
|
|
160
|
+
const requirePath = c.requirePath || "queued->merged";
|
|
161
|
+
const problems = [];
|
|
162
|
+
if (states.length < minStates) problems.push(`${states.length} states, need ≥${minStates}`);
|
|
163
|
+
const stateSet = new Set(states);
|
|
164
|
+
const adj = new Map();
|
|
165
|
+
for (const t of transitions) {
|
|
166
|
+
if (t.from === "any") {
|
|
167
|
+
for (const s of stateSet) {
|
|
168
|
+
if (!adj.has(s)) adj.set(s, new Set());
|
|
169
|
+
adj.get(s).add(t.to);
|
|
170
|
+
}
|
|
171
|
+
} else {
|
|
172
|
+
if (!adj.has(t.from)) adj.set(t.from, new Set());
|
|
173
|
+
adj.get(t.from).add(t.to);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
const [src, dst] = requirePath.split("->");
|
|
177
|
+
if (src && dst && !hasPath(adj, src, dst)) problems.push(`no path ${src}→${dst}`);
|
|
178
|
+
return problems.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: problems.join("; ") };
|
|
179
|
+
}
|
|
180
|
+
case "quality-probe": {
|
|
181
|
+
const probe = c.probe, p = c.path;
|
|
182
|
+
if (probe === "no-unfilled-placeholders") {
|
|
183
|
+
const text = read(p);
|
|
184
|
+
if (text == null) return { score: 0, evidence: `missing ${p}` };
|
|
185
|
+
// Strip code blocks + inline code so we don't false-positive on docs that *describe*
|
|
186
|
+
// placeholder syntax (e.g. "{{VAR}} substitution" in an architecture overview).
|
|
187
|
+
const stripped = text
|
|
188
|
+
.replace(/```[\s\S]*?```/g, "")
|
|
189
|
+
.replace(/`[^`\n]*`/g, "");
|
|
190
|
+
const tokens = stripped.match(/\{\{[A-Z_]+\}\}/g) || [];
|
|
191
|
+
return tokens.length === 0
|
|
192
|
+
? { score: 1, evidence: "" }
|
|
193
|
+
: { score: 0, evidence: `unfilled tokens in ${p}: ${[...new Set(tokens)].join(", ")}` };
|
|
194
|
+
}
|
|
195
|
+
if (probe === "axes-json-coherent") {
|
|
196
|
+
const text = read(p);
|
|
197
|
+
if (text == null) return { score: 0, evidence: `missing ${p}` };
|
|
198
|
+
let j;
|
|
199
|
+
try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
|
|
200
|
+
if (!j.types) return { score: 0, evidence: `${p} missing "types"` };
|
|
201
|
+
const issues = [];
|
|
202
|
+
for (const [tname, t] of Object.entries(j.types)) {
|
|
203
|
+
if (!t.categories) { issues.push(`${tname}: no categories`); continue; }
|
|
204
|
+
for (const [cname, cat] of Object.entries(t.categories)) {
|
|
205
|
+
for (const [axis, spec] of Object.entries(cat)) {
|
|
206
|
+
// Both shapes: v1 = ["CODE",...]; v2 = { codes: [...], weight, veto }
|
|
207
|
+
const codes = Array.isArray(spec) ? spec : spec && spec.codes;
|
|
208
|
+
if (!Array.isArray(codes) || codes.length === 0) issues.push(`${tname}/${cname}/${axis}: no issue codes`);
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
|
|
213
|
+
}
|
|
214
|
+
if (probe === "checks-json-coherent") {
|
|
215
|
+
const text = read(p);
|
|
216
|
+
if (text == null) return { score: 0, evidence: `missing ${p}` };
|
|
217
|
+
let j;
|
|
218
|
+
try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
|
|
219
|
+
const checks = j.checks || [];
|
|
220
|
+
const known = new Set(["path-exists","file-contains","dir-min","frontmatter-keys","frontmatter-keys-all","roles-distinct-models","roles-distinct-families","state-machine-dag","quality-probe"]);
|
|
221
|
+
const ids = checks.map((x) => x.id);
|
|
222
|
+
const dupIds = ids.filter((id, i) => id && ids.indexOf(id) !== i);
|
|
223
|
+
const badTypes = checks.filter((x) => !known.has(x.type));
|
|
224
|
+
const issues = [];
|
|
225
|
+
if (dupIds.length) issues.push(`duplicate ids: ${[...new Set(dupIds)].join(", ")}`);
|
|
226
|
+
if (badTypes.length) issues.push(`unknown check types: ${badTypes.map((x) => x.type).join(", ")}`);
|
|
227
|
+
return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
|
|
228
|
+
}
|
|
229
|
+
if (probe === "context-md-present") {
|
|
230
|
+
return existsSync(rel(p))
|
|
231
|
+
? { score: 1, evidence: "" }
|
|
232
|
+
: { score: 0.5, evidence: `${p} missing — run \`agentrig init\` to investigate the repo` };
|
|
233
|
+
}
|
|
234
|
+
return { score: 0, evidence: `unknown quality probe "${probe}"` };
|
|
235
|
+
}
|
|
68
236
|
default:
|
|
69
237
|
return { score: 0, evidence: `unknown check type ${c.type}` };
|
|
70
238
|
}
|
|
@@ -75,38 +243,57 @@ if (!existsSync(checksPath)) {
|
|
|
75
243
|
process.exit(2);
|
|
76
244
|
}
|
|
77
245
|
const { checks } = JSON.parse(readFileSync(checksPath, "utf8"));
|
|
78
|
-
const results = checks.map((c) => ({ ...c, ...scoreCheck(c) }));
|
|
246
|
+
const results = checks.map((c) => ({ ...c, ...scoreCheck(c), layer: c.layer === "quality" ? "quality" : "completeness" }));
|
|
79
247
|
|
|
80
|
-
let
|
|
248
|
+
let cwSum = 0, cwScore = 0, qwSum = 0, qwScore = 0;
|
|
81
249
|
const byPrinciple = new Map();
|
|
82
250
|
for (const r of results) {
|
|
83
251
|
const w = r.weight ?? 1;
|
|
84
|
-
|
|
85
|
-
|
|
252
|
+
if (r.layer === "quality") { qwSum += w; qwScore += w * r.score; }
|
|
253
|
+
else { cwSum += w; cwScore += w * r.score; }
|
|
86
254
|
const p = byPrinciple.get(r.principle) || { sum: 0, n: 0 };
|
|
87
255
|
p.sum += r.score; p.n += 1;
|
|
88
256
|
byPrinciple.set(r.principle, p);
|
|
89
257
|
}
|
|
90
|
-
const
|
|
91
|
-
const
|
|
258
|
+
const completenessAgg = cwSum ? cwScore / cwSum : 0;
|
|
259
|
+
const qualityAgg = qwSum ? qwScore / qwSum : 0;
|
|
260
|
+
const pct = Math.round(completenessAgg * 1000) / 10;
|
|
261
|
+
const qpct = Math.round(qualityAgg * 1000) / 10;
|
|
92
262
|
|
|
93
263
|
if (asJson) {
|
|
94
264
|
console.log(JSON.stringify({
|
|
95
|
-
|
|
96
|
-
|
|
265
|
+
installCompleteness: pct,
|
|
266
|
+
qualityProbes: qpct,
|
|
267
|
+
aggregate: completenessAgg,
|
|
268
|
+
qualityAggregate: qualityAgg,
|
|
97
269
|
principles: [...byPrinciple.entries()].sort((a, b) => a[0] - b[0]).map(([principle, v]) => ({ principle, score: v.sum / v.n })),
|
|
98
|
-
checks: results.map((r) => ({ id: r.id, principle: r.principle, title: r.title, score: r.score, evidence: r.evidence })),
|
|
270
|
+
checks: results.map((r) => ({ id: r.id, principle: r.principle, layer: r.layer, title: r.title, score: r.score, evidence: r.evidence })),
|
|
99
271
|
}, null, 2));
|
|
100
272
|
} else {
|
|
101
|
-
console.log("AgentRig —
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
273
|
+
console.log("AgentRig — install completeness audit\n");
|
|
274
|
+
const completeness = results.filter((r) => r.layer === "completeness").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
|
|
275
|
+
const quality = results.filter((r) => r.layer === "quality").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
|
|
276
|
+
if (completeness.length) {
|
|
277
|
+
console.log(" Layer A1 — structural completeness");
|
|
278
|
+
for (const r of completeness) {
|
|
279
|
+
const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
|
|
280
|
+
console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
if (quality.length) {
|
|
284
|
+
console.log("\n Layer A2 — quality probes");
|
|
285
|
+
for (const r of quality) {
|
|
286
|
+
const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
|
|
287
|
+
console.log(` [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n ↳ ${r.evidence}` : ""));
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
console.log(`\n Install Completeness: ${pct}% (${completeness.filter((r) => r.score === 1).length}/${completeness.length} checks full credit)`);
|
|
291
|
+
if (quality.length) {
|
|
292
|
+
console.log(` Quality Probes: ${qpct}% (${quality.filter((r) => r.score === 1).length}/${quality.length} checks full credit)`);
|
|
105
293
|
}
|
|
106
|
-
console.log(`\n Harness Score: ${pct}% (${results.filter((r) => r.score === 1).length}/${results.length} checks full credit)`);
|
|
107
294
|
}
|
|
108
295
|
|
|
109
296
|
if (minPct != null && pct < minPct) {
|
|
110
|
-
if (!asJson) console.error(`\
|
|
297
|
+
if (!asJson) console.error(`\nInstall Completeness ${pct}% is below required ${minPct}%`);
|
|
111
298
|
process.exit(1);
|
|
112
299
|
}
|
|
@@ -16,16 +16,16 @@ states:
|
|
|
16
16
|
- name: parked # self-parked: needs a human (low reversibility)
|
|
17
17
|
|
|
18
18
|
transitions:
|
|
19
|
-
- from: ingested to: queued trigger: agent
|
|
20
|
-
- from: queued to: implementing trigger: agent
|
|
21
|
-
- from: implementing to: reviewing trigger: agent
|
|
22
|
-
- from: reviewing to: judging trigger: agent
|
|
23
|
-
- from: reviewing to: implementing trigger: agent
|
|
24
|
-
- from: judging to: ready_to_merge trigger: agent
|
|
25
|
-
- from: judging to: implementing trigger: agent
|
|
26
|
-
- from: ready_to_merge to: merged trigger: human
|
|
27
|
-
- from: any to: parked trigger: auto
|
|
28
|
-
- from: any to: closed trigger: human
|
|
19
|
+
- { from: ingested, to: queued, trigger: agent, role: triager, gate: human_approval }
|
|
20
|
+
- { from: queued, to: implementing, trigger: agent, role: developer }
|
|
21
|
+
- { from: implementing, to: reviewing, trigger: agent, role: developer, gate: self_verify_passed }
|
|
22
|
+
- { from: reviewing, to: judging, trigger: agent, role: reviewer }
|
|
23
|
+
- { from: reviewing, to: implementing, trigger: agent, role: reviewer, reason: changes_requested }
|
|
24
|
+
- { from: judging, to: ready_to_merge, trigger: agent, role: judge, gate: rubric_passed }
|
|
25
|
+
- { from: judging, to: implementing, trigger: agent, role: judge, reason: below_threshold }
|
|
26
|
+
- { from: ready_to_merge, to: merged, trigger: human, gate: human_approval } # principle 9
|
|
27
|
+
- { from: any, to: parked, trigger: auto, reason: low_reversibility_or_stuck }
|
|
28
|
+
- { from: any, to: closed, trigger: human }
|
|
29
29
|
|
|
30
30
|
# Principle 3: GitHub labels mirror DAG state. Human-only labels must never be applied by an agent.
|
|
31
31
|
labels:
|
|
@@ -48,9 +48,15 @@ limits:
|
|
|
48
48
|
requeue_if_stuck_hours: 4
|
|
49
49
|
|
|
50
50
|
hooks:
|
|
51
|
+
# Cheap, deterministic, no model — safe on every PR.
|
|
51
52
|
pre_pr:
|
|
52
53
|
- self-verify
|
|
53
54
|
pre_merge:
|
|
55
|
+
- install-completeness # Layer A: structural audit + quality probes (no model)
|
|
56
|
+
# Layer B (dynamic behavioral eval) is expensive: it spawns the harness end-to-end
|
|
57
|
+
# under a separate judge model with N>=5 trials per scenario. Run nightly on main,
|
|
58
|
+
# NOT per-PR. See .agentrig/eval/RUBRIC.md for cost notes.
|
|
59
|
+
nightly:
|
|
54
60
|
- harness-eval
|
|
55
61
|
|
|
56
62
|
# --- Trigger taxonomy (principle 1) ------------------------------------------
|
|
@@ -101,5 +107,5 @@ issue_comments:
|
|
|
101
107
|
# Keep adjacent pipeline roles on DIFFERENT model families (single-model-bias mitigation).
|
|
102
108
|
model_tiers:
|
|
103
109
|
cheap: { models: [claude-haiku-4.5, gpt-5-mini], use: "triage, high-volume analysis" }
|
|
104
|
-
standard: { models: [claude-sonnet-4.
|
|
105
|
-
premium: { models: [claude-opus-4.
|
|
110
|
+
standard: { models: [claude-sonnet-4.6, gpt-5.4], use: "implementation" }
|
|
111
|
+
premium: { models: [claude-opus-4.8, gpt-5.5], use: "review, judging, auditing" }
|
|
@@ -1,83 +1,88 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: harness-eval
|
|
3
|
-
description: Evaluate THIS repository's agent harness — a deterministic structure audit plus
|
|
4
|
-
triggers:
|
|
5
|
-
- "evaluate the harness"
|
|
6
|
-
- pre_merge hook
|
|
7
|
-
- "did my harness change make things better or worse?"
|
|
3
|
+
description: Evaluate THIS repository's agent harness — a deterministic structure audit (A1) plus content quality probes (A2), plus an isolated producer/judge dynamic eval (B) with paired sign-test A/B variant comparison.
|
|
8
4
|
allowed-tools: Bash Read Grep Glob
|
|
9
|
-
argument-hint: "[--static|--dynamic] [--scenario id] [--variant v]"
|
|
5
|
+
argument-hint: "[--static|--dynamic] [--scenario id] [--variant v] [--n trials]"
|
|
10
6
|
---
|
|
11
7
|
|
|
12
8
|
# harness-eval (principle 6 — evaluate the harness itself)
|
|
13
9
|
|
|
14
|
-
A harness you cannot measure is a harness you cannot improve. This skill scores the harness on
|
|
15
|
-
complementary layers and writes results to `.agentrig/eval/results/` (validated
|
|
10
|
+
A harness you cannot measure is a harness you cannot improve. This skill scores the harness on
|
|
11
|
+
three complementary layers and writes results to `.agentrig/eval/results/` (validated on write
|
|
12
|
+
*and* on read; never hand-edit JSON).
|
|
16
13
|
|
|
17
|
-
## Layer
|
|
18
|
-
|
|
14
|
+
## Layer A1 — install completeness (deterministic, no model)
|
|
15
|
+
Every canonical artifact present at the path the manifest declares.
|
|
19
16
|
|
|
20
17
|
```bash
|
|
21
|
-
node .agentrig/eval/static-audit.mjs
|
|
22
|
-
node .agentrig/eval/static-audit.mjs --json # machine-readable, for CI gates
|
|
18
|
+
node .agentrig/eval/static-audit.mjs --json # Install Completeness %
|
|
23
19
|
```
|
|
24
20
|
|
|
25
|
-
|
|
21
|
+
## Layer A2 — quality probes (deterministic, no model)
|
|
22
|
+
Cheap content sanity: YAML parseable, no unfilled `{{PLACEHOLDER}}` in `AGENTS.md`, every skill has
|
|
23
|
+
the required frontmatter, axes.json has an issue code per axis, developer/reviewer **model
|
|
24
|
+
families** differ (not just the model id strings).
|
|
26
25
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
judge** (a different model than the producer) against `.agentrig/eval/RUBRIC.md` and the registry in
|
|
30
|
-
`.agentrig/eval/axes.json`.
|
|
26
|
+
A1 + A2 are what CI gates on. Both surface in the same `--static` report under "Layer A1" and
|
|
27
|
+
"Layer A2" sections.
|
|
31
28
|
|
|
32
|
-
|
|
33
|
-
open PRs, or merge.
|
|
29
|
+
## Layer B — dynamic behavioral eval (agentic, independent judge, fixture-based)
|
|
34
30
|
|
|
35
|
-
|
|
36
|
-
the scenario: `spec` (task quality), `run` (implementation), `review` (the reviewer's behavior).
|
|
37
|
-
Link them with a shared `--task` id.
|
|
31
|
+
For each scenario in `.agentrig/eval/scenarios/*/`:
|
|
38
32
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
33
|
+
1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/` (or `baseline/`+`change/` for
|
|
34
|
+
review scenarios).
|
|
35
|
+
2. **Producer** model runs in that worktree against `scenarios/<id>/prompt.md`. For
|
|
36
|
+
`--variant harness`, the AgentRig harness is staged into the worktree first; for
|
|
37
|
+
`--variant baseline`, the agent runs bare.
|
|
38
|
+
3. **Oracle** (`scenarios/<id>/oracle.yml`) deterministically scores the hard axes (correctness,
|
|
39
|
+
tests, scope, regression_risk, …) by running commands / inspecting the diff. **No LLM.**
|
|
40
|
+
4. **Judge** model — explicitly a **different family** from the producer — runs in a separate
|
|
41
|
+
`provider.startConversation()` call in its own cwd containing only `prompt.md`, `diff.patch`,
|
|
42
|
+
`transcript.md`, `oracle.json`, and `judge_brief.md`. It does NOT see the producer worktree or
|
|
43
|
+
reasoning trace. It writes `<artifactsDir>/<scenario>.trial<N>.judge.json`; the orchestrator
|
|
44
|
+
reads, validates, and persists via `score.mjs save`.
|
|
45
|
+
|
|
46
|
+
**Family-divergence is enforced.** `score.mjs save` rejects a producer/judge pair in the same
|
|
47
|
+
family unless `--allow-same-family` is set (and records the override). Bare CLI:
|
|
42
48
|
|
|
43
49
|
```bash
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
--axis 'tests=na'
|
|
48
|
-
node .agentrig/eval/score.mjs report
|
|
50
|
+
agentrig eval --dynamic --variant harness --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
|
|
51
|
+
agentrig eval --dynamic --variant baseline --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
|
|
52
|
+
node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
|
|
49
53
|
```
|
|
50
54
|
|
|
51
|
-
**
|
|
52
|
-
|
|
55
|
+
**Aggregation: weighted + veto.** axes.json declares `weight` and `veto: true` per axis.
|
|
56
|
+
A veto axis < 1.0 fails the scenario regardless of aggregate (e.g. correctness can never be
|
|
57
|
+
papered over by clarity).
|
|
53
58
|
|
|
54
|
-
##
|
|
55
|
-
To know whether a prompt/skill/rule change helped, run the **same** scenario before and after under
|
|
56
|
-
different `--variant`s, then:
|
|
59
|
+
## Statistical lift
|
|
57
60
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
+
Single-trial deltas are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
|
|
62
|
+
other than **INCONCLUSIVE**. `score.mjs compare` runs a paired binomial sign test and reports
|
|
63
|
+
median delta + p-value:
|
|
61
64
|
|
|
62
|
-
|
|
63
|
-
|
|
65
|
+
- **HELPS** — p < 0.05 and median > 0.05
|
|
66
|
+
- **HURTS** — p < 0.05 and median < -0.05
|
|
67
|
+
- **INCONCLUSIVE** — n < 3, p ≥ 0.05, or |median| ≤ 0.05
|
|
64
68
|
|
|
65
|
-
|
|
66
|
-
The most important question for a consumer: *does installing AgentRig's harness make agents better
|
|
67
|
-
in THIS repo?* Measure it by running the same scenarios twice and comparing:
|
|
69
|
+
A change that doesn't clear `HELPS` is a regression risk even if individual trials looked good.
|
|
68
70
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
71
|
+
## Sandbox
|
|
72
|
+
Obey `.agentrig/eval/sandbox/eval-rules.md`: throwaway worktree under `$TMPDIR/agentrig-eval/`,
|
|
73
|
+
never push / open PRs / merge / mutate real labels. The eval measures behavior; it must not
|
|
74
|
+
mutate real branches.
|
|
72
75
|
|
|
73
|
-
|
|
74
|
-
agentrig eval --dynamic --scenario <id> --variant baseline
|
|
76
|
+
## Calibrate the judge before trusting it
|
|
75
77
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
+
A lazy judge that returns 1.0 everywhere passes every `score.mjs save` validation. Run the judge
|
|
79
|
+
over the hand-labeled `calibration/` instances and require ≥ 80% agreement before publishing
|
|
80
|
+
results:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
node .agentrig/eval/score.mjs calibrate --judge <model> --instance .agentrig/eval/calibration/run/seed-correct.yml --judge-scores /tmp/judge-out.json
|
|
84
|
+
node .agentrig/eval/score.mjs calibrate --report
|
|
85
|
+
agentrig doctor # flags any judge below the 80% threshold
|
|
78
86
|
```
|
|
79
87
|
|
|
80
|
-
|
|
81
|
-
surfaces moved aside (`AGENTS.md`, `.agents/`, `.github/instructions/`, `CLAUDE.md`, `.cursor/`), so
|
|
82
|
-
the agent genuinely has no harness guidance. A positive aggregate delta means the harness helps in
|
|
83
|
-
this repo; track it over time as you tune rules/skills/prompts.
|
|
88
|
+
See `.agentrig/eval/calibration/README.md` for the instance format.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: log-gotcha
|
|
3
|
+
description: Record a newly-discovered gotcha to `.agents/wiki/` BEFORE handoff — the harness's feedback loop. The wiki is how the next agent doesn't repeat your discovery.
|
|
4
|
+
triggers:
|
|
5
|
+
- hit something non-obvious during the task
|
|
6
|
+
- silent failure / suspicious default / quirk in a library or runtime
|
|
7
|
+
- before handoff if anything surprised you
|
|
8
|
+
allowed-tools: Bash Read Write Edit Grep Glob
|
|
9
|
+
argument-hint: "[--topic <area>]"
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
# log-gotcha (principle 8)
|
|
13
|
+
|
|
14
|
+
Every mistake is a prompt bug. The wiki is **how the harness learns**: every entry there is one
|
|
15
|
+
agent-turn the next agent skips because they already know what you discovered. Logging is part of
|
|
16
|
+
the task, not a separate "good-to-have" step.
|
|
17
|
+
|
|
18
|
+
## When to log
|
|
19
|
+
|
|
20
|
+
You should log a gotcha if **any** of these apply to what you just did:
|
|
21
|
+
|
|
22
|
+
- A test, framework, or runtime did something surprising (e.g. `divide(1, 0)` returns `Infinity`
|
|
23
|
+
instead of throwing; `node --test some-dir` resolves the dir as a module; `console.log` after
|
|
24
|
+
`process.exit` silently truncates piped output).
|
|
25
|
+
- A library default bit you (silent overwrite, surprising coercion, hidden API contract).
|
|
26
|
+
- An AGENTS.md rule wasn't loud enough — you almost violated it, or did, until you caught yourself.
|
|
27
|
+
- A non-obvious cross-file dependency that someone touching one file would miss.
|
|
28
|
+
- A flaky test, an environment-specific assumption, a build-cache surprise.
|
|
29
|
+
|
|
30
|
+
**Do not log** taste opinions, style preferences, or things that are already in CONTRIBUTING.md.
|
|
31
|
+
|
|
32
|
+
## How to log
|
|
33
|
+
|
|
34
|
+
1. **Check the wiki first.** Run `ls .agents/wiki/` and `grep -ri "<keyword>" .agents/wiki/` for
|
|
35
|
+
the most natural keywords. **If an existing entry covers it, SHARPEN that entry instead of
|
|
36
|
+
adding a near-duplicate** (the wiki README has a strict admission test on duplication).
|
|
37
|
+
2. **Pick a topic file.** Either an existing one (e.g. `troubleshooting.md`) or create
|
|
38
|
+
`.agents/wiki/<topic>.md` if the area is new (e.g. `node-test-runner.md`,
|
|
39
|
+
`html-templates.md`). Topic names are kebab-case nouns.
|
|
40
|
+
3. **Write a stub entry** using the template below. Keep it terse — a title plus the five bullets, one line each.
|
|
41
|
+
4. **Commit it as part of your fix's diff.** Wiki entries are not "after-the-fact paperwork" —
|
|
42
|
+
they go in the SAME commit/PR as the fix that revealed them, so reviewers can see them.
|
|
43
|
+
|
|
44
|
+
## Entry template
|
|
45
|
+
|
|
46
|
+
```markdown
|
|
47
|
+
### <short noun-phrase title>
|
|
48
|
+
- **Symptom:** what went wrong / how it showed up
|
|
49
|
+
- **Cause:** the real root cause (not the symptom)
|
|
50
|
+
- **Fix:** the change you made (or wider remediation)
|
|
51
|
+
- **Prevention:** one-line rule that would have spared you this discovery
|
|
52
|
+
- **Discovered:** <date> in <scenario or task id>
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Skill failure modes (explicitly)
|
|
56
|
+
|
|
57
|
+
- **"I didn't really hit a gotcha."** Most fixes DO reveal one — you just didn't notice because the
|
|
58
|
+
fix took less than 5 minutes. The discriminating question is *"could the next agent have known
|
|
59
|
+
this from the existing instructions?"* — if no, log it.
|
|
60
|
+
- **"It's too small."** Small gotchas are exactly the ones that vanish from memory by tomorrow.
|
|
61
|
+
- **"I'll log later."** No — log it in the SAME commit. "Later" is how wikis die.
|
|
62
|
+
|
|
63
|
+
## Verification
|
|
64
|
+
|
|
65
|
+
Before considering this skill complete, confirm with `git diff --cached --stat` that your wiki
|
|
66
|
+
entry shows in the staged diff. The `memory` axis in the harness eval explicitly checks for this:
|
|
67
|
+
"diff contains a `.agents/wiki/` entry" → 1.0; "mentioned in summary but not committed" → 0.5;
|
|
68
|
+
"silent" → 0.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: self-verify
|
|
3
|
-
description: Run the project's own build/test/lint and converge before handing work to a reviewer.
|
|
3
|
+
description: Run the project's own build/test/lint and converge before handing work to a reviewer. Requires explicit baseline → after evidence — the suite must be shown to change state, not just be "green at the end".
|
|
4
4
|
triggers:
|
|
5
5
|
- before requesting review
|
|
6
6
|
- before opening a PR
|
|
@@ -13,13 +13,37 @@ argument-hint: "[--max-iterations N]"
|
|
|
13
13
|
After producing changes, **verify your own work before handoff**. Do not invoke the reviewer until
|
|
14
14
|
this loop converges.
|
|
15
15
|
|
|
16
|
-
## Steps
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
16
|
+
## Steps (do them in order; do not skip)
|
|
17
|
+
|
|
18
|
+
1. **Baseline.** Run the install/build/test/lint commands from `AGENTS.md`'s `commands` block
|
|
19
|
+
**once before you make any edit related to the failing symptom**. Capture the result:
|
|
20
|
+
- For a fix scenario: confirm the suite is RED in the expected way (the target test fails).
|
|
21
|
+
- For a feature scenario: confirm the suite is GREEN (so you know your changes are what break it
|
|
22
|
+
if it goes red later).
|
|
23
|
+
- Surface this baseline in your transcript — e.g. *"baseline: `npm test` → 1 fail (divide-by-zero)"*.
|
|
24
|
+
|
|
25
|
+
2. **Iterate.** Make the change; re-run the commands. Cap at **N=3** iterations unless `--max-iterations` sets a different N.
|
|
26
|
+
|
|
27
|
+
3. **After.** Re-run the full suite at the end and surface the new state explicitly —
|
|
28
|
+
e.g. *"after fix: `npm test` → 0 fails, all 4 tests pass"*. The transition from baseline → after
|
|
29
|
+
is the evidence that your work did what you claim. Reporting only "tests pass" without the
|
|
30
|
+
baseline is half a self-verification.
|
|
31
|
+
|
|
32
|
+
4. **Self-park if still red.** Leave a precise note (what failed, what you tried) and move the task
|
|
33
|
+
to `parked`. Never hand a red build to a reviewer.
|
|
34
|
+
|
|
35
|
+
## Handoff checklist (run BEFORE you declare done)
|
|
36
|
+
|
|
37
|
+
- [ ] Baseline output captured + surfaced in transcript
|
|
38
|
+
- [ ] After output captured + surfaced in transcript
|
|
39
|
+
- [ ] Diff is on-target (no unrelated churn — check `git diff --stat`)
|
|
40
|
+
- [ ] **Did you hit any non-obvious behavior or surprise?** → run the `log-gotcha` skill before
|
|
41
|
+
handing off. This includes APIs that pass silently yet are wrong, JS floating-point quirks, framework
|
|
42
|
+
defaults that bit you, environment surprises, etc. Wiki entries are how the next agent avoids
|
|
43
|
+
repeating your discovery.
|
|
22
44
|
|
|
23
45
|
## Notes
|
|
46
|
+
|
|
24
47
|
- Pin verification to your own HEAD; do not trust stale CI from an earlier commit.
|
|
25
|
-
-
|
|
48
|
+
- If the full suite is too expensive to run as a baseline (10+ min), at minimum run the **smallest
|
|
49
|
+
set of tests that demonstrates the symptom** before AND after your fix.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@doidor/agentrig",
|
|
3
|
-
"version": "0.9.0",
|
|
3
|
+
"version": "0.10.0",
|
|
4
4
|
"description": "AgentRig — an agentic meta-harness. A CLI that investigates a repository and installs (and evaluates) a best-practice agent harness.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -55,6 +55,7 @@
|
|
|
55
55
|
"license": "MIT",
|
|
56
56
|
"dependencies": {
|
|
57
57
|
"@github/copilot-sdk": "^1.0.0",
|
|
58
|
+
"yaml": "^2.9.0",
|
|
58
59
|
"zod": "^4.3.6"
|
|
59
60
|
},
|
|
60
61
|
"peerDependencies": {
|
|
@@ -68,8 +69,8 @@
|
|
|
68
69
|
"devDependencies": {
|
|
69
70
|
"@changesets/changelog-github": "^0.7.0",
|
|
70
71
|
"@changesets/cli": "^2.31.0",
|
|
71
|
-
"@doidor/markbook": "^0.
|
|
72
|
-
"@doidor/markbook-core": "^0.
|
|
72
|
+
"@doidor/markbook": "^0.2.0",
|
|
73
|
+
"@doidor/markbook-core": "^0.2.0",
|
|
73
74
|
"@types/node": "^22.0.0",
|
|
74
75
|
"typescript": "^5.6.0"
|
|
75
76
|
}
|