npm - @doidor/agentrig - Versions diffs - 0.9.0 → 0.10.0 - Mend

@doidor/agentrig 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77) hide show

package/knowledge/templates/eval/static-audit.mjs CHANGED Viewed

@@ -1,10 +1,10 @@
 #!/usr/bin/env node
 // AgentRig static harness audit (principle 6) — deterministic, dependency-free, no model.
 // Interprets checks.json (the single source of truth, shared with `agentrig eval --static`)
-// against this repository and prints a Harness Score. Usage:
+// against this repository and prints an Install Completeness + Quality Probes report. Usage:
 //   node .agentrig/eval/static-audit.mjs            human-readable report
 //   node .agentrig/eval/static-audit.mjs --json     machine-readable
-//   node .agentrig/eval/static-audit.mjs --min 80   exit non-zero if score < 80%
+//   node .agentrig/eval/static-audit.mjs --min 80   exit non-zero if completeness < 80%
 import { readFileSync, existsSync, statSync, readdirSync } from "node:fs";
 import { fileURLToPath } from "node:url";
 import { dirname, join, resolve } from "node:path";
@@ -33,6 +33,66 @@ function extractValue(text, key) {
   return m ? m[1].trim() : null;
 }
+// Mirror of src/core/model-family.ts. Kept inline to keep this script dep-free.
+const FAMILY_PATTERNS = [ // [family name, id regex] pairs; every regex is anchored at the start of the id and case-insensitive
+  ["anthropic-claude", /^(anthropic[\.\/-])?claude([-_\.]|$)/i], // optional vendor prefix, then a keyword that must be followed by "-", "_", "." or end of id
+  ["openai-gpt", /^(openai[\.\/-])?(gpt|o[1-9]|codex|davinci|chatgpt)([-_\.]|$)/i],
+  ["google-gemini", /^(google[\.\/-])?(gemini|palm|bard|flash)([-_\.]|$)/i],
+  ["mistral", /^(mistral|mixtral|codestral|ministral)([-_\.]|$)/i], // no vendor-prefix alternative for this family
+  ["deepseek", /^deepseek([-_\.]|$)/i],
+  ["meta-llama", /^(meta[\.\/-])?(llama|code-?llama)([-_\.]|$)/i],
+  ["xai-grok", /^(xai[\.\/-])?grok([-_\.]|$)/i],
+  ["cohere", /^(cohere[\.\/-])?(command|aya)([-_\.]|$)/i],
+  ["qwen", /^qwen([-_\.]|$)/i],
+];
+function modelFamily(id) { // Maps a model id string to a family name from FAMILY_PATTERNS, or to an "unknown:<prefix>" label when nothing matches.
+  if (!id) return ""; // falsy id (null, undefined, "") yields an empty family string
+  for (const [name, rx] of FAMILY_PATTERNS) if (rx.test(id)) return name; // first matching pattern wins
+  const m = id.match(/^([a-z0-9]+)/i); // leading alphanumeric run of the id, used as the fallback label
+  return m ? `unknown:${m[1].toLowerCase()}` : `unknown:${id}`; // NOTE(review): unrecognized ids sharing the same leading run get the same label — confirm that is intended
+}
+// Line-oriented mini-YAML reader good enough for state-machine.yml. Returns { states: [name, ...], transitions: [{ from, to, trigger }, ...] }; it is not a general YAML parser.
+function readStateMachine(text) {
+  if (!text) return { states: [], transitions: [] }; // missing or empty text yields empty lists
+  const states = [];
+  const transitions = [];
+  let inStates = false;
+  let inTransitions = false;
+  for (const raw of text.split(/\r?\n/)) {
+    if (/^states:\s*$/.test(raw)) { inStates = true; inTransitions = false; continue; } // column-0 "states:" key opens the states section
+    if (/^transitions:\s*$/.test(raw)) { inTransitions = true; inStates = false; continue; } // column-0 "transitions:" key opens the transitions section
+    if (/^\S/.test(raw)) { inStates = false; inTransitions = false; continue; } // any other non-indented line closes both sections. NOTE(review): this includes column-0 "- item" lines and column-0 "#" comments, so unindented lists are not read — confirm acceptable
+    const line = raw.replace(/#.*$/, "").trimEnd(); // drop everything from the first "#" and trailing whitespace. NOTE(review): a "#" inside a value is cut as well
+    if (!line.trim().startsWith("-")) continue; // only list items carry data
+    if (inStates) {
+      const m = line.match(/-\s*name:\s*(\S+)/); // state name is the first whitespace-free token after "name:"
+      if (m) states.push(m[1]);
+    } else if (inTransitions) {
+      // Accept both legacy ("- from: A to: B trigger: ...") and proper flow-mapping
+      // ("- { from: A, to: B, trigger: ... }") syntaxes.
+      const item = line.replace(/^\s*-\s*/, "").replace(/^\{|\}$/g, ""); // strip the list dash, then one leading "{" and one trailing "}"
+      const get = (k) => (item.match(new RegExp("\\b" + k + ":\\s*([^,\\s}]+)")) || [])[1]; // value of key k: characters up to the next comma, whitespace or "}"; undefined when absent
+      const from = get("from"), to = get("to"), trigger = get("trigger");
+      if (from && to) transitions.push({ from, to, trigger }); // items lacking from or to are ignored; trigger may be undefined
+    }
+  }
+  return { states, transitions };
+}
+function hasPath(adj, src, dst) { // Breadth-first search: returns true when dst can be reached from src, where adj.get(node) gives an iterable of successor nodes (or undefined).
+  if (src === dst) return true; // a node trivially reaches itself, even when it has no edges in adj
+  const seen = new Set([src]); // visited set, so cycles do not re-queue nodes
+  const q = [src]; // FIFO queue of nodes still to expand
+  while (q.length) {
+    const cur = q.shift();
+    for (const n of adj.get(cur) || []) { // nodes with no adj entry have no outgoing edges
+      if (n === dst) return true;
+      if (!seen.has(n)) { seen.add(n); q.push(n); }
+    }
+  }
+  return false; // queue exhausted without meeting dst
+}
 function scoreCheck(c) {
   switch (c.type) {
     case "path-exists":
@@ -58,6 +118,25 @@ function scoreCheck(c) {
       if (missing.length === 0) return { score: 1, evidence: "" };
       return { score: 0.5, evidence: `missing keys: ${missing.join(", ")}` };
     }
+    case "frontmatter-keys-all": {
+      const dir = c.path, fileName = c.file || "SKILL.md";
+      const abs = rel(dir);
+      if (!existsSync(abs) || !statSync(abs).isDirectory()) return { score: 0, evidence: `missing dir ${dir}` };
+      const keys = c.keys || [];
+      const offenders = [];
+      for (const entry of readdirSync(abs)) {
+        if (entry.startsWith(".") || entry.startsWith("_")) continue;
+        const subAbs = join(abs, entry);
+        if (!statSync(subAbs).isDirectory()) continue;
+        const filePath = join(subAbs, fileName);
+        if (!existsSync(filePath)) { offenders.push(`${entry}/${fileName} missing`); continue; }
+        const fm = frontmatter(readFileSync(filePath, "utf8"));
+        if (fm == null) { offenders.push(`${entry} no frontmatter`); continue; }
+        const missing = keys.filter((k) => !new RegExp("^\\s*" + k + "\\s*:", "m").test(fm));
+        if (missing.length) offenders.push(`${entry} missing ${missing.join("/")}`);
+      }
+      return offenders.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: offenders.join("; ") };
+    }
     case "roles-distinct-models": {
       const dev = extractValue(read(c.developer), c.key || "model");
       const rev = extractValue(read(c.reviewer), c.key || "model");
@@ -65,6 +144,95 @@ function scoreCheck(c) {
       if (dev !== rev) return { score: 1, evidence: "" };
       return { score: 0.5, evidence: `developer and reviewer share model "${dev}"` };
     }
+    case "roles-distinct-families": {
+      const dev = extractValue(read(c.developer), c.key || "model");
+      const rev = extractValue(read(c.reviewer), c.key || "model");
+      if (!dev || !rev) return { score: 0, evidence: "developer/reviewer model not declared" };
+      const sameFamily = modelFamily(dev) === modelFamily(rev);
+      if (!sameFamily) return { score: 1, evidence: "" };
+      return { score: 0, evidence: `developer "${dev}" and reviewer "${rev}" share a model family` };
+    }
+    case "state-machine-dag": {
+      const text = read(c.path);
+      if (text == null) return { score: 0, evidence: `missing ${c.path}` };
+      const { states, transitions } = readStateMachine(text);
+      const minStates = c.minStates ?? 6;
+      const requirePath = c.requirePath || "queued->merged";
+      const problems = [];
+      if (states.length < minStates) problems.push(`${states.length} states, need ≥${minStates}`);
+      const stateSet = new Set(states);
+      const adj = new Map();
+      for (const t of transitions) {
+        if (t.from === "any") {
+          for (const s of stateSet) {
+            if (!adj.has(s)) adj.set(s, new Set());
+            adj.get(s).add(t.to);
+          }
+        } else {
+          if (!adj.has(t.from)) adj.set(t.from, new Set());
+          adj.get(t.from).add(t.to);
+        }
+      }
+      const [src, dst] = requirePath.split("->");
+      if (src && dst && !hasPath(adj, src, dst)) problems.push(`no path ${src}→${dst}`);
+      return problems.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: problems.join("; ") };
+    }
+    case "quality-probe": {
+      const probe = c.probe, p = c.path;
+      if (probe === "no-unfilled-placeholders") {
+        const text = read(p);
+        if (text == null) return { score: 0, evidence: `missing ${p}` };
+        // Strip code blocks + inline code so we don't false-positive on docs that *describe*
+        // placeholder syntax (e.g. "{{VAR}} substitution" in an architecture overview).
+        const stripped = text
+          .replace(/```[\s\S]*?```/g, "")
+          .replace(/`[^`\n]*`/g, "");
+        const tokens = stripped.match(/\{\{[A-Z_]+\}\}/g) || [];
+        return tokens.length === 0
+          ? { score: 1, evidence: "" }
+          : { score: 0, evidence: `unfilled tokens in ${p}: ${[...new Set(tokens)].join(", ")}` };
+      }
+      if (probe === "axes-json-coherent") {
+        const text = read(p);
+        if (text == null) return { score: 0, evidence: `missing ${p}` };
+        let j;
+        try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
+        if (!j.types) return { score: 0, evidence: `${p} missing "types"` };
+        const issues = [];
+        for (const [tname, t] of Object.entries(j.types)) {
+          if (!t.categories) { issues.push(`${tname}: no categories`); continue; }
+          for (const [cname, cat] of Object.entries(t.categories)) {
+            for (const [axis, spec] of Object.entries(cat)) {
+              // Both shapes: v1 = ["CODE",...]; v2 = { codes: [...], weight, veto }
+              const codes = Array.isArray(spec) ? spec : spec && spec.codes;
+              if (!Array.isArray(codes) || codes.length === 0) issues.push(`${tname}/${cname}/${axis}: no issue codes`);
+            }
+          }
+        }
+        return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
+      }
+      if (probe === "checks-json-coherent") {
+        const text = read(p);
+        if (text == null) return { score: 0, evidence: `missing ${p}` };
+        let j;
+        try { j = JSON.parse(text); } catch (e) { return { score: 0, evidence: `${p} not valid JSON: ${e.message}` }; }
+        const checks = j.checks || [];
+        const known = new Set(["path-exists","file-contains","dir-min","frontmatter-keys","frontmatter-keys-all","roles-distinct-models","roles-distinct-families","state-machine-dag","quality-probe"]);
+        const ids = checks.map((x) => x.id);
+        const dupIds = ids.filter((id, i) => id && ids.indexOf(id) !== i);
+        const badTypes = checks.filter((x) => !known.has(x.type));
+        const issues = [];
+        if (dupIds.length) issues.push(`duplicate ids: ${[...new Set(dupIds)].join(", ")}`);
+        if (badTypes.length) issues.push(`unknown check types: ${badTypes.map((x) => x.type).join(", ")}`);
+        return issues.length === 0 ? { score: 1, evidence: "" } : { score: 0.5, evidence: issues.join("; ") };
+      }
+      if (probe === "context-md-present") {
+        return existsSync(rel(p))
+          ? { score: 1, evidence: "" }
+          : { score: 0.5, evidence: `${p} missing — run \`agentrig init\` to investigate the repo` };
+      }
+      return { score: 0, evidence: `unknown quality probe "${probe}"` };
+    }
     default:
       return { score: 0, evidence: `unknown check type ${c.type}` };
   }
@@ -75,38 +243,57 @@ if (!existsSync(checksPath)) {
   process.exit(2);
 }
 const { checks } = JSON.parse(readFileSync(checksPath, "utf8"));
-const results = checks.map((c) => ({ ...c, ...scoreCheck(c) }));
+const results = checks.map((c) => ({ ...c, ...scoreCheck(c), layer: c.layer === "quality" ? "quality" : "completeness" }));
-let wSum = 0, wScore = 0;
+let cwSum = 0, cwScore = 0, qwSum = 0, qwScore = 0;
 const byPrinciple = new Map();
 for (const r of results) {
   const w = r.weight ?? 1;
-  wSum += w;
-  wScore += w * r.score;
+  if (r.layer === "quality") { qwSum += w; qwScore += w * r.score; }
+  else { cwSum += w; cwScore += w * r.score; }
   const p = byPrinciple.get(r.principle) || { sum: 0, n: 0 };
   p.sum += r.score; p.n += 1;
   byPrinciple.set(r.principle, p);
 }
-const aggregate = wSum ? wScore / wSum : 0;
-const pct = Math.round(aggregate * 1000) / 10;
+const completenessAgg = cwSum ? cwScore / cwSum : 0;
+const qualityAgg = qwSum ? qwScore / qwSum : 0;
+const pct = Math.round(completenessAgg * 1000) / 10;
+const qpct = Math.round(qualityAgg * 1000) / 10;
 if (asJson) {
   console.log(JSON.stringify({
-    harnessScore: pct,
-    aggregate,
+    installCompleteness: pct,
+    qualityProbes: qpct,
+    aggregate: completenessAgg,
+    qualityAggregate: qualityAgg,
     principles: [...byPrinciple.entries()].sort((a, b) => a[0] - b[0]).map(([principle, v]) => ({ principle, score: v.sum / v.n })),
-    checks: results.map((r) => ({ id: r.id, principle: r.principle, title: r.title, score: r.score, evidence: r.evidence })),
+    checks: results.map((r) => ({ id: r.id, principle: r.principle, layer: r.layer, title: r.title, score: r.score, evidence: r.evidence })),
   }, null, 2));
 } else {
-  console.log("AgentRig — harness audit\n");
-  for (const r of results.sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id))) {
-    const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
-    console.log(`  [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n         ↳ ${r.evidence}` : ""));
+  console.log("AgentRig — install completeness audit\n");
+  const completeness = results.filter((r) => r.layer === "completeness").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
+  const quality = results.filter((r) => r.layer === "quality").sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id));
+  if (completeness.length) {
+    console.log("  Layer A1 — structural completeness");
+    for (const r of completeness) {
+      const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
+      console.log(`  [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n         ↳ ${r.evidence}` : ""));
+    }
+  }
+  if (quality.length) {
+    console.log("\n  Layer A2 — quality probes");
+    for (const r of quality) {
+      const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
+      console.log(`  [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n         ↳ ${r.evidence}` : ""));
+    }
+  }
+  console.log(`\n  Install Completeness: ${pct}%  (${completeness.filter((r) => r.score === 1).length}/${completeness.length} checks full credit)`);
+  if (quality.length) {
+    console.log(`  Quality Probes:       ${qpct}%  (${quality.filter((r) => r.score === 1).length}/${quality.length} checks full credit)`);
   }
-  console.log(`\n  Harness Score: ${pct}%  (${results.filter((r) => r.score === 1).length}/${results.length} checks full credit)`);
 }
 if (minPct != null && pct < minPct) {
-  if (!asJson) console.error(`\nHarness Score ${pct}% is below required ${minPct}%`);
+  if (!asJson) console.error(`\nInstall Completeness ${pct}% is below required ${minPct}%`);
   process.exit(1);
 }

package/knowledge/templates/harness/state-machine.yml CHANGED Viewed

@@ -16,16 +16,16 @@ states:
   - name: parked          # self-parked: needs a human (low reversibility)
 transitions:
-  - from: ingested        to: queued           trigger: agent      role: triager     gate: human_approval
-  - from: queued          to: implementing     trigger: agent      role: developer
-  - from: implementing    to: reviewing        trigger: agent      role: developer   gate: self_verify_passed
-  - from: reviewing       to: judging          trigger: agent      role: reviewer
-  - from: reviewing       to: implementing     trigger: agent      role: reviewer    reason: changes_requested
-  - from: judging         to: ready_to_merge   trigger: agent      role: judge       gate: rubric_passed
-  - from: judging         to: implementing     trigger: agent      role: judge       reason: below_threshold
-  - from: ready_to_merge  to: merged           trigger: human      gate: human_approval   # principle 9
-  - from: any             to: parked           trigger: auto       reason: low_reversibility_or_stuck
-  - from: any             to: closed           trigger: human
+  - { from: ingested,        to: queued,           trigger: agent,  role: triager,    gate: human_approval }
+  - { from: queued,          to: implementing,     trigger: agent,  role: developer }
+  - { from: implementing,    to: reviewing,        trigger: agent,  role: developer,  gate: self_verify_passed }
+  - { from: reviewing,       to: judging,          trigger: agent,  role: reviewer }
+  - { from: reviewing,       to: implementing,     trigger: agent,  role: reviewer,   reason: changes_requested }
+  - { from: judging,         to: ready_to_merge,   trigger: agent,  role: judge,      gate: rubric_passed }
+  - { from: judging,         to: implementing,     trigger: agent,  role: judge,      reason: below_threshold }
+  - { from: ready_to_merge,  to: merged,           trigger: human,  gate: human_approval }   # principle 9
+  - { from: any,             to: parked,           trigger: auto,   reason: low_reversibility_or_stuck }
+  - { from: any,             to: closed,           trigger: human }
 # Principle 3: GitHub labels mirror DAG state. Human-only labels must never be applied by an agent.
 labels:
@@ -48,9 +48,15 @@ limits:
   requeue_if_stuck_hours: 4
 hooks:
+  # Cheap, deterministic, no model — safe on every PR.
   pre_pr:
     - self-verify
   pre_merge:
+    - install-completeness   # Layer A: structural audit + quality probes (no model)
+  # Layer B (dynamic behavioral eval) is expensive: it spawns the harness end-to-end
+  # under a separate judge model with N>=5 trials per scenario. Run nightly on main,
+  # NOT per-PR. See .agentrig/eval/RUBRIC.md for cost notes.
+  nightly:
     - harness-eval
 # --- Trigger taxonomy (principle 1) ------------------------------------------
@@ -101,5 +107,5 @@ issue_comments:
 # Keep adjacent pipeline roles on DIFFERENT model families (single-model-bias mitigation).
 model_tiers:
   cheap:    { models: [claude-haiku-4.5, gpt-5-mini], use: "triage, high-volume analysis" }
-  standard: { models: [claude-sonnet-4.5, gpt-5.4],   use: "implementation" }
-  premium:  { models: [claude-opus-4.5, gpt-5],       use: "review, judging, auditing" }
+  standard: { models: [claude-sonnet-4.6, gpt-5.4],   use: "implementation" }
+  premium:  { models: [claude-opus-4.8, gpt-5.5],     use: "review, judging, auditing" }

package/knowledge/templates/skills/harness-eval/SKILL.md CHANGED Viewed

@@ -1,83 +1,88 @@
 ---
 name: harness-eval
-description: Evaluate THIS repository's agent harness — a deterministic structure audit plus an independent, rubric-driven dynamic eval (run/spec/review) with A/B variant comparison.
-triggers:
-  - "evaluate the harness"
-  - pre_merge hook
-  - "did my harness change make things better or worse?"
+description: Evaluate THIS repository's agent harness — a deterministic structure audit (A1) plus content quality probes (A2), plus an isolated producer/judge dynamic eval (B) with paired sign-test A/B variant comparison.
 allowed-tools: Bash Read Grep Glob
-argument-hint: "[--static|--dynamic] [--scenario id] [--variant v]"
+argument-hint: "[--static|--dynamic] [--scenario id] [--variant v] [--n trials]"
 ---
 # harness-eval (principle 6 — evaluate the harness itself)
-A harness you cannot measure is a harness you cannot improve. This skill scores the harness on two
-complementary layers and writes results to `.agentrig/eval/results/` (validated, never hand-edited).
+A harness you cannot measure is a harness you cannot improve. This skill scores the harness on
+three complementary layers and writes results to `.agentrig/eval/results/` (validated on write
+*and* on read; never hand-edit JSON).
-## Layer A — static audit (deterministic, no model)
-Each of the 12 principles maps to concrete checks in `.agentrig/eval/checks.json`, scored 0/0.5/1.0.
+## Layer A1 — install completeness (deterministic, no model)
+Checks that every canonical artifact is present at the path the manifest declares.
 ```bash
-node .agentrig/eval/static-audit.mjs            # human-readable report + aggregate score
-node .agentrig/eval/static-audit.mjs --json     # machine-readable, for CI gates
+node .agentrig/eval/static-audit.mjs --json   # Install Completeness %
 ```
-Use this in CI and as a fast pre-merge gate. It needs no model and no network.
+## Layer A2 — quality probes (deterministic, no model)
+Cheap content sanity: YAML parseable, no unfilled `{{PLACEHOLDER}}` in `AGENTS.md`, every skill has
+the required frontmatter, axes.json has an issue code per axis, developer/reviewer **model
+families** differ (not just the model id strings).
-## Layer B — dynamic behavioral eval (agentic, independent judge)
-Run scenarios in `.agentrig/eval/scenarios/*.md` through the harness, then score as an **independent
-judge** (a different model than the producer) against `.agentrig/eval/RUBRIC.md` and the registry in
-`.agentrig/eval/axes.json`.
+A1 + A2 are what CI gates on. Both surface in the same `--static` report under "Layer A1" and
+"Layer A2" sections.
-**Sandbox:** obey `.agentrig/eval/sandbox/eval-rules.md` — work in a throwaway worktree; never push,
-open PRs, or merge.
+## Layer B — dynamic behavioral eval (agentic, independent judge, fixture-based)
-**Lifecycle:** score the whole lifecycle, not just the patch. Use the rubric `--type` that matches
-the scenario: `spec` (task quality), `run` (implementation), `review` (the reviewer's behavior).
-Link them with a shared `--task` id.
+For each scenario in `.agentrig/eval/scenarios/*/`:
-**Rules (enforced by score.mjs):** strict 0/0.5/1.0 tiers; any axis < 1.0 needs an issue code from
-that axis's registry **plus** an evidence string; unobserved axes are `=na`; rollups are recomputed
-from axis data.
+1. **Seed** a throwaway worktree from `scenarios/<id>/fixture/` (or `baseline/`+`change/` for
+   review scenarios).
+2. **Producer** model runs in that worktree against `scenarios/<id>/prompt.md`. For
+   `--variant harness`, the AgentRig harness is staged into the worktree first; for
+   `--variant baseline`, the agent runs bare.
+3. **Oracle** (`scenarios/<id>/oracle.yml`) deterministically scores the hard axes (correctness,
+   tests, scope, regression_risk, …) by running commands / inspecting the diff. **No LLM.**
+4. **Judge** model — explicitly a **different family** from the producer — runs in a separate
+   `provider.startConversation()` call in its own cwd containing only `prompt.md`, `diff.patch`,
+   `transcript.md`, `oracle.json`, and `judge_brief.md`. It does NOT see the producer worktree or
+   reasoning trace. It writes `<artifactsDir>/<scenario>.trial<N>.judge.json`; the orchestrator
+   reads, validates, and persists via `score.mjs save`.
+**Family-divergence is enforced.** `score.mjs save` rejects a producer/judge pair in the same
+family unless `--allow-same-family` is set (and records the override). Bare CLI:
 ```bash
-node .agentrig/eval/score.mjs save --type run --task <id> --scenario <id> --judge <model> \
-  --axis 'correctness=1.0' \
-  --axis 'scope=0.5:OQ-SCOPE-CHURN:left build artifacts in the diff' \
-  --axis 'tests=na'
-node .agentrig/eval/score.mjs report
+agentrig eval --dynamic --variant harness  --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
+agentrig eval --dynamic --variant baseline --n 5 --producer-model claude-sonnet-4.6 --judge-model gpt-5.5
+node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
 ```
-**Artifacts:** for each run, save `diff.patch`, a short `output` transcript, and `meta.json`
-(scenario, base_commit, variant, model, duration) next to the score so regressions are inspectable.
+**Aggregation: weighted + veto.** axes.json declares `weight` and `veto: true` per axis.
+A veto axis < 1.0 fails the scenario regardless of aggregate (e.g. correctness can never be
+papered over by clarity).
-## Comparing harness changes (A/B)
-To know whether a prompt/skill/rule change helped, run the **same** scenario before and after under
-different `--variant`s, then:
+## Statistical lift
-```bash
-node .agentrig/eval/score.mjs compare --scenario <id>
-```
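+Single-trial deltas are coin flips. The eval requires `n ≥ 3` paired trials for any verdict
+other than **INCONCLUSIVE**. `score.mjs compare` runs a paired binomial sign test and reports
+the median delta and p-value. (A sign test cannot reach p < 0.05 with fewer than 5 non-tied
+pairs one-sided, or 6 two-sided, so very small `n` will still come back INCONCLUSIVE.)
+Verdicts: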
-A change that lowers the aggregate is a regression even if it "feels" better. A static score < 1.0
-on a principle points at a missing/weak artifact — fix the artifact, then re-audit.
+- **HELPS** — p < 0.05 and median > 0.05
+- **HURTS** — p < 0.05 and median < -0.05
+- **INCONCLUSIVE** — n < 3, p ≥ 0.05, or |median| ≤ 0.05
-## Does the harness actually help? (with vs without)
-The most important question for a consumer: *does installing AgentRig's harness make agents better
-in THIS repo?* Measure it by running the same scenarios twice and comparing:
+A change that doesn't clear `HELPS` is a regression risk even if individual trials looked good.
-```bash
-# 1) Harness ON (the agent uses AGENTS.md + rules + skills as installed)
-agentrig eval --dynamic --scenario <id> --variant harness
+## Sandbox
+Obey `.agentrig/eval/sandbox/eval-rules.md`: throwaway worktree under `$TMPDIR/agentrig-eval/`,
+never push / open PRs / merge / mutate real labels. The eval measures behavior; it must not
+mutate real branches.
-# 2) Baseline — harness OFF (a bare agent; ignore AGENTS.md/.agents/instructions surfaces)
-agentrig eval --dynamic --scenario <id> --variant baseline
+## Calibrate the judge before trusting it
-# 3) Report the lift (per-axis + aggregate delta + a HELPS/HURTS verdict)
-node .agentrig/eval/score.mjs compare --scenario <id> --baseline baseline
+A lazy judge that returns 1.0 everywhere passes every `score.mjs save` validation. Run the judge
+over the hand-labeled `calibration/` instances and require ≥ 80% agreement before publishing
+results:
+```bash
+node .agentrig/eval/score.mjs calibrate --judge <model> --instance .agentrig/eval/calibration/run/seed-correct.yml --judge-scores /tmp/judge-out.json
+node .agentrig/eval/score.mjs calibrate --report
+agentrig doctor   # flags any judge below the 80% threshold
 ```
-For a rigorous baseline, run the harness-off trial in a sandbox/worktree with the harness + compiled
-surfaces moved aside (`AGENTS.md`, `.agents/`, `.github/instructions/`, `CLAUDE.md`, `.cursor/`), so
-the agent genuinely has no harness guidance. A positive aggregate delta means the harness helps in
-this repo; track it over time as you tune rules/skills/prompts.
+See `.agentrig/eval/calibration/README.md` for the instance format.

package/knowledge/templates/skills/log-gotcha/SKILL.md ADDED Viewed

@@ -0,0 +1,68 @@
+---
+name: log-gotcha
+description: Record a newly-discovered gotcha to `.agents/wiki/` BEFORE handoff — the harness's feedback loop. The wiki is how the next agent doesn't repeat your discovery.
+triggers:
+  - hit something non-obvious during the task
+  - silent failure / suspicious default / quirk in a library or runtime
+  - before handoff if anything surprised you
+allowed-tools: Bash Read Write Edit Grep Glob
+argument-hint: "[--topic <area>]"
+---
+# log-gotcha (principle 8)
+Every mistake is a prompt bug. The wiki is **how the harness learns**: every entry there is one
+agent-turn the next agent skips because they already know what you discovered. Logging is part of
+the task, not a separate "good-to-have" step.
+## When to log
+You should log a gotcha if **any** of these apply to what you just did:
+- A test, framework, or runtime did something surprising (e.g. `divide(1, 0)` returns `Infinity`
+  instead of throwing; `node --test some-dir` resolves the dir as a module; calling
+  `process.exit` right after `console.log` silently truncates piped output).
+- A library default bit you (silent overwrite, surprising coercion, hidden API contract).
+- An AGENTS.md rule wasn't loud enough — you almost violated it, or did, until you caught yourself.
+- A non-obvious cross-file dependency that someone touching one file would miss.
+- A flaky test, an environment-specific assumption, a build-cache surprise.
+**Do not log** taste opinions, style preferences, or things that are already in CONTRIBUTING.md.
+## How to log
+1. **Check the wiki first.** Run `ls .agents/wiki/` and `grep -ri "<keyword>" .agents/wiki/` for
+   the most natural keywords. **If an existing entry covers it, SHARPEN that entry instead of
+   adding a near-duplicate** (the wiki README has a strict admission test on duplication).
+2. **Pick a topic file.** Either an existing one (e.g. `troubleshooting.md`) or create
+   `.agents/wiki/<topic>.md` if the area is new (e.g. `node-test-runner.md`,
+   `html-templates.md`). Topic names are kebab-case nouns.
+3. **Write a stub entry** using the template below. Keep it terse — 5 lines max.
+4. **Commit it as part of your fix's diff.** Wiki entries are not "after-the-fact paperwork" —
+   they go in the SAME commit/PR as the fix that revealed them, so reviewers can see them.
+## Entry template
+```markdown
+### <short noun-phrase title>
+- **Symptom:** what went wrong / how it showed up
+- **Cause:** the real root cause (not the symptom)
+- **Fix:** the change you made (or wider remediation)
+- **Prevention:** one-line rule that would have spared you this discovery
+- **Discovered:** <date> in <scenario or task id>
+```
+## Skill failure modes (explicitly)
+- **"I didn't really hit a gotcha."** Most fixes DO reveal one — you just didn't notice because the
+  fix took less than 5 minutes. The discriminating question is *"could the next agent have known
+  this from the existing instructions?"* — if no, log it.
+- **"It's too small."** Small gotchas are exactly the ones that vanish from memory by tomorrow.
+- **"I'll log later."** No — log it in the SAME commit. "Later" is how wikis die.
+## Verification
+Before considering this skill complete, confirm with `git diff --cached --stat` that your wiki
+entry shows in the staged diff. The `memory` axis in the harness eval explicitly checks for this:
+"diff contains a `.agents/wiki/` entry" → 1.0; "mentioned in summary but not committed" → 0.5;
+"silent" → 0.

package/knowledge/templates/skills/self-verify/SKILL.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 name: self-verify
-description: Run the project's own build/test/lint and converge before handing work to a reviewer.
+description: Run the project's own build/test/lint and converge before handing work to a reviewer. Requires explicit baseline → after evidence — the suite must be shown to change state, not just be "green at the end".
 triggers:
   - before requesting review
   - before opening a PR
@@ -13,13 +13,37 @@ argument-hint: "[--max-iterations N]"
 After producing changes, **verify your own work before handoff**. Do not invoke the reviewer until
 this loop converges.
-## Steps
-1. Run the install/build/test/lint commands recorded in `AGENTS.md` (the `commands` block).
-2. If all green → **continue** to review.
-3. If red → read the failure, fix, and re-run. Cap at **N=3** iterations (default).
-4. If still red after N → **self-park**: leave a precise note (what failed, what you tried) and
-   move the task to `parked`. Never hand a red build to a reviewer.
+## Steps (do them in order; do not skip)
+1. **Baseline.** Run the install/build/test/lint commands from `AGENTS.md`'s `commands` block
+   **once before you make any edit related to the failing symptom**. Capture the result:
+   - For a fix scenario: confirm the suite is RED in the expected way (the target test fails).
+   - For a feature scenario: confirm the suite is GREEN (so you know your changes are what break it
+     if it goes red later).
+   - Surface this baseline in your transcript — e.g. *"baseline: `npm test` → 1 fail (divide-by-zero)"*.
+2. **Iterate.** Make the change; re-run the commands. Cap at **N=3** iterations.
+3. **After.** Re-run the full suite at the end and surface the new state explicitly —
+   e.g. *"after fix: `npm test` → 0 fails, all 4 tests pass"*. The transition from baseline → after
+   is the evidence that your work did what you claim. Reporting only "tests pass" without the
+   baseline is half a self-verification.
+4. **Self-park if still red after N iterations.** Leave a precise note (what failed, what you tried) and move the task
+   to `parked`. Never hand a red build to a reviewer.
+## Handoff checklist (run BEFORE you declare done)
+- [ ] Baseline output captured + surfaced in transcript
+- [ ] After output captured + surfaced in transcript
+- [ ] Diff is on-target (no unrelated churn — check `git diff --stat`)
+- [ ] **Did you hit any non-obvious behavior or surprise?** → run the `log-gotcha` skill before
+  handing off. This includes silently-passing-yet-wrong APIs, JS-floating-point quirks, framework
+  defaults that bit you, environment surprises, etc. Wiki entries are how the next agent avoids
+  repeating your discovery.
 ## Notes
 - Pin verification to your own HEAD; do not trust stale CI from an earlier commit.
-- Record any new gotcha in `.agents/wiki/`.
+- If the build is too expensive to run a full baseline (10+ min), at minimum run the **smallest
+  set of tests that demonstrates the symptom** before AND after your fix.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@doidor/agentrig",
-  "version": "0.9.0",
+  "version": "0.10.0",
   "description": "AgentRig — an agentic meta-harness. A CLI that investigates a repository and installs (and evaluates) a best-practice agent harness.",
   "type": "module",
   "bin": {
@@ -55,6 +55,7 @@
   "license": "MIT",
   "dependencies": {
     "@github/copilot-sdk": "^1.0.0",
+    "yaml": "^2.9.0",
     "zod": "^4.3.6"
   },
   "peerDependencies": {
@@ -68,8 +69,8 @@
   "devDependencies": {
     "@changesets/changelog-github": "^0.7.0",
     "@changesets/cli": "^2.31.0",
-    "@doidor/markbook": "^0.1.2",
-    "@doidor/markbook-core": "^0.1.2",
+    "@doidor/markbook": "^0.2.0",
+    "@doidor/markbook-core": "^0.2.0",
     "@types/node": "^22.0.0",
     "typescript": "^5.6.0"
   }