npm - @doidor/agentrig - Versions diffs - 0.5.3 - Mend

@doidor/agentrig 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

package/LICENSE +21 -0
package/README.md +224 -0
package/dist/agent/claude.js +125 -0
package/dist/agent/claude.js.map +1 -0
package/dist/agent/copilot.js +147 -0
package/dist/agent/copilot.js.map +1 -0
package/dist/agent/index.js +17 -0
package/dist/agent/index.js.map +1 -0
package/dist/agent/provider.js +10 -0
package/dist/agent/provider.js.map +1 -0
package/dist/cli.js +169 -0
package/dist/cli.js.map +1 -0
package/dist/commands/compile.js +42 -0
package/dist/commands/compile.js.map +1 -0
package/dist/commands/dashboard.js +35 -0
package/dist/commands/dashboard.js.map +1 -0
package/dist/commands/doctor.js +40 -0
package/dist/commands/doctor.js.map +1 -0
package/dist/commands/eval.js +178 -0
package/dist/commands/eval.js.map +1 -0
package/dist/commands/init.js +100 -0
package/dist/commands/init.js.map +1 -0
package/dist/commands/update.js +176 -0
package/dist/commands/update.js.map +1 -0
package/dist/core/activity.js +80 -0
package/dist/core/activity.js.map +1 -0
package/dist/core/audit.js +112 -0
package/dist/core/audit.js.map +1 -0
package/dist/core/compile.js +250 -0
package/dist/core/compile.js.map +1 -0
package/dist/core/fsutil.js +45 -0
package/dist/core/fsutil.js.map +1 -0
package/dist/core/install.js +97 -0
package/dist/core/install.js.map +1 -0
package/dist/core/knowledge.js +34 -0
package/dist/core/knowledge.js.map +1 -0
package/dist/core/logger.js +31 -0
package/dist/core/logger.js.map +1 -0
package/dist/core/paths.js +22 -0
package/dist/core/paths.js.map +1 -0
package/dist/core/setupsteps.js +72 -0
package/dist/core/setupsteps.js.map +1 -0
package/dist/core/state.js +19 -0
package/dist/core/state.js.map +1 -0
package/dist/core/surfaces.js +62 -0
package/dist/core/surfaces.js.map +1 -0
package/dist/prompts/index.js +117 -0
package/dist/prompts/index.js.map +1 -0
package/dist/version.js +26 -0
package/dist/version.js.map +1 -0
package/knowledge/PRINCIPLES.md +106 -0
package/knowledge/manifest.json +247 -0
package/knowledge/templates/AGENTS.md +66 -0
package/knowledge/templates/AGENTS.package.example.md +19 -0
package/knowledge/templates/agents/README.md +33 -0
package/knowledge/templates/agents/developer.md +7 -0
package/knowledge/templates/agents/developer.yml +7 -0
package/knowledge/templates/agents/judge.md +6 -0
package/knowledge/templates/agents/judge.yml +6 -0
package/knowledge/templates/agents/reviewer.md +6 -0
package/knowledge/templates/agents/reviewer.yml +7 -0
package/knowledge/templates/agents/triager.md +8 -0
package/knowledge/templates/agents/triager.yml +8 -0
package/knowledge/templates/dashboard/dashboard.mjs +261 -0
package/knowledge/templates/eval/RUBRIC.md +94 -0
package/knowledge/templates/eval/axes.json +56 -0
package/knowledge/templates/eval/checks.json +304 -0
package/knowledge/templates/eval/sandbox/eval-rules.md +23 -0
package/knowledge/templates/eval/scenarios/README.md +24 -0
package/knowledge/templates/eval/scenarios/add-small-feature.md +28 -0
package/knowledge/templates/eval/scenarios/fix-failing-test.md +27 -0
package/knowledge/templates/eval/scenarios/review-catches-bug.md +30 -0
package/knowledge/templates/eval/score.mjs +257 -0
package/knowledge/templates/eval/static-audit.mjs +112 -0
package/knowledge/templates/harness/ORCHESTRATION.md +53 -0
package/knowledge/templates/harness/state-machine.yml +105 -0
package/knowledge/templates/mcp/mcp.json +12 -0
package/knowledge/templates/rules/README.md +32 -0
package/knowledge/templates/rules/code-review.md +26 -0
package/knowledge/templates/rules/coding-standards.md +15 -0
package/knowledge/templates/rules/no-debug-logging.md +16 -0
package/knowledge/templates/rules/security.md +23 -0
package/knowledge/templates/scripts/repair-worktrees.sh +124 -0
package/knowledge/templates/skills/fix-ci/SKILL.md +17 -0
package/knowledge/templates/skills/harness-eval/SKILL.md +83 -0
package/knowledge/templates/skills/self-verify/SKILL.md +25 -0
package/knowledge/templates/skills/skill-authoring/SKILL.md +35 -0
package/knowledge/templates/skills/skill-improver/SKILL.md +23 -0
package/knowledge/templates/skills/verify-loop/SKILL.md +35 -0
package/knowledge/templates/wiki/README.md +23 -0
package/knowledge/templates/wiki/_TEMPLATE.md +16 -0
package/knowledge/templates/wiki/index.md +29 -0
package/knowledge/templates/wiki/troubleshooting.md +14 -0
package/package.json +70 -0

package/knowledge/templates/eval/score.mjs ADDED Viewed

@@ -0,0 +1,257 @@
+#!/usr/bin/env node
+// AgentRig dynamic-eval aggregator (principle 6). Owns the results JSON shape and VALIDATES every
+// score against the rubric registry in axes.json — so results are never hand-edited and a judge
+// cannot invent axes, tiers, or issue codes. Inspired by epichan's pydantic-validated scoring.
+//
+// Usage:
+//   node score.mjs save --type run --task add-small-feature --scenario add-small-feature \
+//        --judge claude-opus-4.8 [--variant v1] [--run RID] \
+//        --axis 'correctness=1.0' \
+//        --axis 'scope=0.5:OQ-SCOPE-CHURN:left package-lock.json churn in the diff' \
+//        --axis 'tests=na'                 # na = unobserved (confidence 0, excluded from rollups)
+//   node score.mjs report [--type run] [--variant v1] [--json]
+//   node score.mjs compare --scenario add-small-feature [--baseline VARIANT] [--json]
+//        # A/B variants side by side; --baseline adds each other variant's delta vs that variant
+//
+// Score tiers: 0 / 0.5 / 1.0. Any axis < 1.0 (and observed) REQUIRES an issue code from that axis's
+// registry plus an evidence string. Category and aggregate scores are recomputed from axis data.
+import { readFileSync, writeFileSync, existsSync, mkdirSync, readdirSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { dirname, join } from "node:path";
+// Directory that contains this script; the results folder and axes.json are resolved relative to it.
+const scriptDir = dirname(fileURLToPath(import.meta.url));
+// Folder where `save` writes result records and `report`/`compare` read them back.
+const resultsDir = join(scriptDir, "results");
+// Rubric registry file read by loadRegistry().
+const axesPath = join(scriptDir, "axes.json");
+function loadRegistry() {
+  if (!existsSync(axesPath)) {
+    console.error(`axes.json not found at ${axesPath}`);
+    process.exit(2);
+  }
+  return JSON.parse(readFileSync(axesPath, "utf8"));
+}
+/** Build axis -> { category, codes } lookup for a rubric type. */
+function axisIndex(registry, type) {
+  const def = registry.types?.[type];
+  if (!def) {
+    console.error(`unknown rubric type "${type}". Valid: ${Object.keys(registry.types).join(", ")}`);
+    process.exit(2);
+  }
+  const index = new Map();
+  for (const [category, axes] of Object.entries(def.categories)) {
+    for (const [axis, codes] of Object.entries(axes)) index.set(axis, { category, codes });
+  }
+  return index;
+}
+function getOpt(args, name, repeat = false) {
+  const out = [];
+  for (let i = 0; i < args.length; i++) if (args[i] === name) out.push(args[i + 1]);
+  return repeat ? out : out[0];
+}
+function fail(msg) {
+  console.error(`error: ${msg}`);
+  process.exit(2);
+}
+// First CLI token is the subcommand (save | report | compare); the remaining tokens are its options.
+const [cmd, ...args] = process.argv.slice(2);
+// The registry is loaded before dispatch, so every subcommand requires axes.json to be present.
+const registry = loadRegistry();
+// Allowed score values; taken from the registry when it declares `tiers`, else 0 / 0.5 / 1.0.
+const TIERS = new Set(registry.tiers ?? [0, 0.5, 1.0]);
+// Minimum aggregate for a record to be marked as passing; registry `passThreshold`, else 0.8.
+const PASS = registry.passThreshold ?? 0.8;
+if (cmd === "save") {
+  const type = getOpt(args, "--type") || "run";
+  const index = axisIndex(registry, type);
+  const scenario = getOpt(args, "--scenario") || getOpt(args, "--task");
+  const task = getOpt(args, "--task") || scenario;
+  const judge = getOpt(args, "--judge") || "unknown";
+  const variant = getOpt(args, "--variant") || null;
+  const run = getOpt(args, "--run") || null;
+  if (!scenario) fail("save requires --scenario <id> (or --task <id>)");
+  const rawAxes = getOpt(args, "--axis", true);
+  if (rawAxes.length === 0) fail("save requires at least one --axis name=score[:CODE[:evidence]]");
+  const axes = rawAxes.map((spec) => {
+    const eq = spec.indexOf("=");
+    if (eq < 0) fail(`bad --axis "${spec}" (expected name=score[:CODE[:evidence]])`);
+    const name = spec.slice(0, eq);
+    const rest = spec.slice(eq + 1);
+    const meta = index.get(name);
+    if (!meta) fail(`unknown axis "${name}" for type "${type}". Valid: ${[...index.keys()].join(", ")}`);
+    // "na" marks an unobserved axis (confidence 0) — excluded from rollups.
+    if (rest === "na") return { name, category: meta.category, score: 0, issue: null, evidence: "", confidence: 0 };
+    const [scoreStr, code, ...evidenceParts] = rest.split(":");
+    const score = Number(scoreStr);
+    if (!TIERS.has(score)) fail(`axis "${name}" score must be one of ${[...TIERS].join("/")} — got "${scoreStr}"`);
+    const evidence = evidenceParts.join(":").trim();
+    if (score < 1) {
+      if (!code) fail(`axis "${name}" scored ${score} < 1.0 but has no issue code — use name=score:CODE[:evidence]`);
+      if (!meta.codes.includes(code)) fail(`issue code "${code}" is not valid for axis "${name}". Valid: ${meta.codes.join(", ")}`);
+      if (!evidence) fail(`axis "${name}" scored ${score} < 1.0 but has no evidence — use name=score:CODE:evidence`);
+    }
+    return { name, category: meta.category, score, issue: code || null, evidence, confidence: 1 };
+  });
+  // Recompute rollups from axis data (never trust hand-supplied totals). Confidence-gated.
+  const observed = axes.filter((a) => a.confidence > 0);
+  const categories = {};
+  for (const a of observed) (categories[a.category] ||= []).push(a.score);
+  const categoryScores = Object.fromEntries(
+    Object.entries(categories).map(([c, xs]) => [c, round(xs.reduce((s, x) => s + x, 0) / xs.length)]),
+  );
+  const aggregate = observed.length ? round(observed.reduce((s, a) => s + a.score, 0) / observed.length) : 0;
+  const pass = observed.length > 0 && aggregate >= PASS && observed.every((a) => a.score > 0);
+  const record = {
+    type, task, scenario, variant, run, judge,
+    timestamp: new Date().toISOString(),
+    aggregate, pass, categoryScores, axes,
+  };
+  if (!existsSync(resultsDir)) mkdirSync(resultsDir, { recursive: true });
+  const safe = (s) => String(s).replace(/[^a-zA-Z0-9_.-]/g, "_");
+  const file = join(resultsDir, `${safe(type)}.${safe(scenario)}.${safe(variant || "base")}.${Date.now()}.json`);
+  writeFileSync(file, JSON.stringify(record, null, 2));
+  console.log(`Saved ${file}\n  aggregate=${aggregate.toFixed(2)} ${pass ? "PASS" : "FAIL"} (${observed.length}/${axes.length} axes observed)`);
+  process.exit(0);
+}
+if (cmd === "report" || cmd === "compare") {
+  const asJson = args.includes("--json");
+  const filterType = getOpt(args, "--type");
+  const filterVariant = getOpt(args, "--variant");
+  const records = loadRecords();
+  if (cmd === "compare") {
+    compare(records, getOpt(args, "--scenario"), asJson, getOpt(args, "--baseline"));
+    process.exit(0);
+  }
+  let scoped = records;
+  if (filterType) scoped = scoped.filter((r) => r.type === filterType);
+  if (filterVariant) scoped = scoped.filter((r) => (r.variant || "base") === filterVariant);
+  // Latest record per (type, scenario, variant).
+  const latest = new Map();
+  for (const r of scoped.sort((a, b) => a.timestamp.localeCompare(b.timestamp))) {
+    latest.set(`${r.type}::${r.scenario}::${r.variant || "base"}`, r);
+  }
+  const rows = [...latest.values()];
+  const axisAgg = new Map();
+  for (const r of rows) for (const a of r.axes) {
+    if (a.confidence <= 0) continue;
+    const x = axisAgg.get(a.name) || { sum: 0, n: 0 };
+    x.sum += a.score; x.n += 1; axisAgg.set(a.name, x);
+  }
+  const overall = rows.length ? round(rows.reduce((s, r) => s + r.aggregate, 0) / rows.length) : 0;
+  if (asJson) {
+    console.log(JSON.stringify({
+      overall,
+      results: rows.map((r) => ({ type: r.type, scenario: r.scenario, variant: r.variant, aggregate: r.aggregate, pass: r.pass, judge: r.judge })),
+      axes: [...axisAgg.entries()].map(([name, v]) => ({ name, mean: round(v.sum / v.n) })),
+    }, null, 2));
+  } else {
+    console.log("AgentRig — dynamic eval report\n");
+    if (rows.length === 0) {
+      console.log("  No results yet. Run `score.mjs save ...` first.");
+    } else {
+      const byType = new Map();
+      for (const r of rows) {
+        if (!byType.has(r.type)) byType.set(r.type, []);
+        byType.get(r.type).push(r);
+      }
+      for (const [type, group] of byType) {
+        console.log(`  ${type.toUpperCase()}`);
+        for (const r of group) {
+          const v = r.variant ? ` [${r.variant}]` : "";
+          console.log(`    ${r.pass ? "PASS" : "FAIL"}  ${(r.scenario + v).padEnd(30)} ${r.aggregate.toFixed(2)}  (${r.judge})`);
+        }
+      }
+      console.log("\n  Per-axis means (observed only):");
+      for (const [name, v] of axisAgg) console.log(`    ${name.padEnd(22)} ${round(v.sum / v.n).toFixed(2)}`);
+      console.log(`\n  Overall: ${overall.toFixed(2)} across ${rows.length} result(s)`);
+    }
+  }
+  process.exit(0);
+}
+console.error("Usage: score.mjs <save|report|compare> ...");
+process.exit(2);
+// --- helpers ---------------------------------------------------------------
+function round(n) {
+  return Math.round(n * 10000) / 10000;
+}
+function loadRecords() {
+  if (!existsSync(resultsDir)) return [];
+  const out = [];
+  for (const f of readdirSync(resultsDir).filter((f) => f.endsWith(".json"))) {
+    try {
+      out.push(JSON.parse(readFileSync(join(resultsDir, f), "utf8")));
+    } catch {
+      console.error(`warning: skipping corrupt result file ${f}`);
+    }
+  }
+  return out;
+}
+function compare(records, scenario, asJson, baseline) {
+  if (!scenario) fail("compare requires --scenario <id>");
+  const forScenario = records.filter((r) => r.scenario === scenario);
+  const latestByVariant = new Map();
+  for (const r of forScenario.sort((a, b) => a.timestamp.localeCompare(b.timestamp))) {
+    latestByVariant.set(r.variant || "base", r);
+  }
+  const variants = [...latestByVariant.values()];
+  // Harness-lift mode: delta of every other variant vs the baseline.
+  let lift = null;
+  if (baseline) {
+    const base = latestByVariant.get(baseline);
+    if (!base) fail(`no results for baseline variant "${baseline}" on scenario "${scenario}"`);
+    lift = variants
+      .filter((r) => (r.variant || "base") !== baseline)
+      .map((r) => {
+        const axisDelta = {};
+        const baseAxes = Object.fromEntries((base.axes || []).filter((a) => a.confidence > 0).map((a) => [a.name, a.score]));
+        for (const a of (r.axes || []).filter((a) => a.confidence > 0)) {
+          if (baseAxes[a.name] !== undefined) axisDelta[a.name] = round(a.score - baseAxes[a.name]);
+        }
+        return { variant: r.variant || "base", aggregateDelta: round(r.aggregate - base.aggregate), axisDelta };
+      });
+  }
+  if (asJson) {
+    console.log(JSON.stringify({
+      scenario,
+      variants: variants.map((r) => ({ variant: r.variant || "base", aggregate: r.aggregate, pass: r.pass, judge: r.judge, categoryScores: r.categoryScores })),
+      ...(lift ? { baseline, lift } : {}),
+    }, null, 2));
+    process.exit(0);
+  }
+  console.log(`AgentRig — variant comparison for "${scenario}"\n`);
+  if (variants.length === 0) console.log("  No results for that scenario.");
+  for (const r of variants) {
+    console.log(`  ${(r.variant || "base").padEnd(12)} ${r.aggregate.toFixed(2)} ${r.pass ? "PASS" : "FAIL"}  (${r.judge})`);
+    for (const [c, s] of Object.entries(r.categoryScores || {})) console.log(`      ${c.padEnd(20)} ${s.toFixed(2)}`);
+  }
+  if (lift) {
+    console.log(`\n  Harness lift vs baseline "${baseline}":`);
+    for (const l of lift) {
+      const sign = l.aggregateDelta > 0 ? "+" : "";
+      const verdict = l.aggregateDelta > 0 ? "HELPS" : l.aggregateDelta < 0 ? "HURTS" : "no change";
+      console.log(`    ${l.variant.padEnd(12)} aggregate ${sign}${l.aggregateDelta.toFixed(2)}  → harness ${verdict}`);
+      for (const [name, d] of Object.entries(l.axisDelta)) {
+        if (d !== 0) console.log(`        ${name.padEnd(20)} ${d > 0 ? "+" : ""}${d.toFixed(2)}`);
+      }
+    }
+  }
+  process.exit(0);
+}

package/knowledge/templates/eval/static-audit.mjs ADDED Viewed

@@ -0,0 +1,112 @@
+#!/usr/bin/env node
+// AgentRig static harness audit (principle 6) — deterministic, dependency-free, no model.
+// Interprets checks.json (the single source of truth, shared with `agentrig eval --static`)
+// against this repository and prints a Harness Score. Usage:
+//   node .agentrig/eval/static-audit.mjs            human-readable report
+//   node .agentrig/eval/static-audit.mjs --json     machine-readable
+//   node .agentrig/eval/static-audit.mjs --min 80   exit non-zero if score < 80%
+import { readFileSync, existsSync, statSync, readdirSync } from "node:fs";
+import { fileURLToPath } from "node:url";
+import { dirname, join, resolve } from "node:path";
+const scriptDir = dirname(fileURLToPath(import.meta.url));
+const repoRoot = resolve(scriptDir, "..", "..");
+const checksPath = join(scriptDir, "checks.json");
+const args = process.argv.slice(2);
+const asJson = args.includes("--json");
+const minIdx = args.indexOf("--min");
+const minPct = minIdx >= 0 ? Number(args[minIdx + 1]) : null;
+const rel = (p) => resolve(repoRoot, p);
+const read = (p) => (existsSync(rel(p)) ? readFileSync(rel(p), "utf8") : null);
+function frontmatter(text) {
+  if (!text || !text.startsWith("---")) return null;
+  const end = text.indexOf("\n---", 3);
+  if (end < 0) return null;
+  return text.slice(3, end);
+}
+function extractValue(text, key) {
+  if (!text) return null;
+  const m = text.match(new RegExp("^\\s*" + key + "\\s*:\\s*(.+)\\s*$", "m"));
+  return m ? m[1].trim() : null;
+}
+function scoreCheck(c) {
+  switch (c.type) {
+    case "path-exists":
+      return { score: existsSync(rel(c.path)) ? 1 : 0, evidence: existsSync(rel(c.path)) ? "" : `missing ${c.path}` };
+    case "file-contains": {
+      const text = read(c.path);
+      if (text == null) return { score: 0, evidence: `missing ${c.path}` };
+      const missing = (c.patterns || []).filter((p) => !text.includes(p));
+      if (missing.length === 0) return { score: 1, evidence: "" };
+      return { score: 0.5, evidence: `present but missing markers: ${missing.join(", ")}` };
+    }
+    case "dir-min": {
+      const abs = rel(c.path);
+      if (!existsSync(abs) || !statSync(abs).isDirectory()) return { score: 0, evidence: `missing dir ${c.path}` };
+      const n = readdirSync(abs).filter((e) => !e.startsWith(".")).length;
+      if (n >= (c.min || 1)) return { score: 1, evidence: "" };
+      return { score: 0.5, evidence: `${n} entr${n === 1 ? "y" : "ies"}, need ${c.min}` };
+    }
+    case "frontmatter-keys": {
+      const fm = frontmatter(read(c.path));
+      if (fm == null) return { score: 0, evidence: `no frontmatter in ${c.path}` };
+      const missing = (c.keys || []).filter((k) => !new RegExp("^\\s*" + k + "\\s*:", "m").test(fm));
+      if (missing.length === 0) return { score: 1, evidence: "" };
+      return { score: 0.5, evidence: `missing keys: ${missing.join(", ")}` };
+    }
+    case "roles-distinct-models": {
+      const dev = extractValue(read(c.developer), c.key || "model");
+      const rev = extractValue(read(c.reviewer), c.key || "model");
+      if (!dev || !rev) return { score: 0, evidence: "developer/reviewer model not declared" };
+      if (dev !== rev) return { score: 1, evidence: "" };
+      return { score: 0.5, evidence: `developer and reviewer share model "${dev}"` };
+    }
+    default:
+      return { score: 0, evidence: `unknown check type ${c.type}` };
+  }
+}
+if (!existsSync(checksPath)) {
+  console.error(`checks.json not found at ${checksPath}`);
+  process.exit(2);
+}
+const { checks } = JSON.parse(readFileSync(checksPath, "utf8"));
+const results = checks.map((c) => ({ ...c, ...scoreCheck(c) }));
+let wSum = 0, wScore = 0;
+const byPrinciple = new Map();
+for (const r of results) {
+  const w = r.weight ?? 1;
+  wSum += w;
+  wScore += w * r.score;
+  const p = byPrinciple.get(r.principle) || { sum: 0, n: 0 };
+  p.sum += r.score; p.n += 1;
+  byPrinciple.set(r.principle, p);
+}
+const aggregate = wSum ? wScore / wSum : 0;
+const pct = Math.round(aggregate * 1000) / 10;
+if (asJson) {
+  console.log(JSON.stringify({
+    harnessScore: pct,
+    aggregate,
+    principles: [...byPrinciple.entries()].sort((a, b) => a[0] - b[0]).map(([principle, v]) => ({ principle, score: v.sum / v.n })),
+    checks: results.map((r) => ({ id: r.id, principle: r.principle, title: r.title, score: r.score, evidence: r.evidence })),
+  }, null, 2));
+} else {
+  console.log("AgentRig — harness audit\n");
+  for (const r of results.sort((a, b) => a.principle - b.principle || a.id.localeCompare(b.id))) {
+    const tag = r.score === 1 ? "PASS" : r.score === 0.5 ? "PART" : "FAIL";
+    console.log(`  [${tag}] P${r.principle} ${r.title}` + (r.evidence ? `\n         ↳ ${r.evidence}` : ""));
+  }
+  console.log(`\n  Harness Score: ${pct}%  (${results.filter((r) => r.score === 1).length}/${results.length} checks full credit)`);
+}
+if (minPct != null && pct < minPct) {
+  if (!asJson) console.error(`\nHarness Score ${pct}% is below required ${minPct}%`);
+  process.exit(1);
+}

package/knowledge/templates/harness/ORCHESTRATION.md ADDED Viewed

@@ -0,0 +1,53 @@
+# Orchestration contract (principles 1, 3, 10)
+How AgentRig expects a harness engine to drive `.agentrig/harness/state-machine.yml`. AgentRig
+*installs* this contract as plain text; a runner (the epi-platform engine, a CI job, or your own
+script) executes it. Synthesized from epichan's engine.
+## 1. Triggers: who may move state
+Every transition declares a `trigger` kind (`triggers.kinds` in the state machine):
+- **agent** — a role does the work (triager/developer/reviewer/judge).
+- **script** — a poller/reconciler asserts state from GitHub on a cadence.
+- **auto** — a deterministic immediate transition.
+- **event** — a GitHub webhook maps straight to a state (`event_to_state`).
+- **human** — a person performs a low-reversibility action (e.g. merge).
+Agents may only drive `agent` transitions. They must never fabricate `script`/`event`/`human` ones.
+## 2. Hybrid event + polling reconciliation
+- **Events** give low-latency reactions: `pull_request.synchronize → reviewing`,
+  `check_suite.completed.failure → implementing`, etc. (`event_to_state`).
+- **Pollers** repair anything events missed, on per-state cadences (`reconciliation`): ingest ready
+  issues (60s), route task PRs (300s), confirm merges (120s).
+- The engine never *assumes* a PR merged — it flips `ready_to_merge → merged` only after GitHub
+  reports it merged.
+If the engine crashes, GitHub still holds the truth and the pollers re-derive engine state.
+## 3. Compare-and-set transitions (no double-work)
+Every state change carries the **status it expects to replace** (`transitions_policy.require_expected_status`).
+If the engine reports a conflict (409), another agent or poller already advanced the task — **skip,
+don't clobber**. This is the core guard that lets a multi-agent pool run safely.
+## 4. Claim grace + stuck recovery
+- A task isn't "reclaimable" until `recovery.claim_grace_seconds` (300s) after it's claimed — avoids
+  yanking work that just started.
+- A recovery sweep every `recovery.scan_seconds` (120s) re-queues anything stuck past
+  `recovery.stuck_after_hours` (4h) back to `requeue_to` (`queued`).
+## 5. Hooks gate irreversible actions
+`hooks.pre_pr` and `hooks.pre_merge` run before opening a PR and before merging. A failing hook
+blocks the transition. Put `self-verify` on `pre_pr` and `harness-eval` on `pre_merge`.
+## 6. Hard limits and runaway protection
+`limits` caps concurrency, review iterations, diff size, and token usage (`runaway_token_cap`).
+These keep an agent pool from melting the repo. Protected/human-only labels (`labels.human_only`)
+require a person.
+## 7. Model tiers
+Roles reference a `model_tier` (cheap/standard/premium), not a hardcoded model, so cost/quality is
+re-routable in one place. Keep adjacent pipeline roles on different model families.
+## 8. Progress visibility
+`issue_comments.on` posts a GitHub comment on task creation, each state transition, failure, and PR
+open — so humans can follow the pool without watching the engine.

package/knowledge/templates/harness/state-machine.yml ADDED Viewed

@@ -0,0 +1,105 @@
+# AgentRig harness state machine
+# Principle 1 (explicit state machine), 3 (system of record), 9 (human gates), 10 (hard limits).
+# The DAG is the contract. Agents do not invent transitions; reviewers cannot skip gates.
+version: 1
+states:
+  - name: ingested        # issue/task picked up
+  - name: queued          # ready for an agent
+  - name: implementing    # developer role is working
+  - name: reviewing       # reviewer role is judging the diff
+  - name: judging         # independent judge scores against the rubric
+  - name: ready_to_merge  # all gates green
+  - name: merged
+  - name: closed
+  - name: parked          # self-parked: needs a human (low reversibility)
+transitions:
+  # Each transition is a flow mapping: several `key: value` pairs on one line are only valid YAML
+  # inside `{ ... }` with comma separators.
+  - { from: ingested,       to: queued,         trigger: agent, role: triager,   gate: human_approval }
+  - { from: queued,         to: implementing,   trigger: agent, role: developer }
+  - { from: implementing,   to: reviewing,      trigger: agent, role: developer, gate: self_verify_passed }
+  - { from: reviewing,      to: judging,        trigger: agent, role: reviewer }
+  - { from: reviewing,      to: implementing,   trigger: agent, role: reviewer,  reason: changes_requested }
+  - { from: judging,        to: ready_to_merge, trigger: agent, role: judge,     gate: rubric_passed }
+  - { from: judging,        to: implementing,   trigger: agent, role: judge,     reason: below_threshold }
+  - { from: ready_to_merge, to: merged,         trigger: human, gate: human_approval }   # principle 9
+  - { from: any,            to: parked,         trigger: auto,  reason: low_reversibility_or_stuck }
+  - { from: any,            to: closed,         trigger: human }
+# Principle 3: GitHub labels mirror DAG state. Human-only labels must never be applied by an agent.
+labels:
+  state_map:
+    queued: agentrig-ready
+    implementing: agentrig-started
+    reviewing: agentrig-in-review
+    ready_to_merge: agentrig-approved
+  human_only:
+    - acknowledge-breaking-change
+    - override-protected-files
+# Principle 10: hard limits and safety nets.
+limits:
+  max_concurrent_agents: 4
+  max_review_iterations: 5
+  max_diff_chars: 50000
+  runaway_token_cap: 5000000
+  recovery_scan_seconds: 120
+  requeue_if_stuck_hours: 4
+hooks:
+  pre_pr:
+    - self-verify
+  pre_merge:
+    - harness-eval
+# --- Trigger taxonomy (principle 1) ------------------------------------------
+# Every transition above is driven by one of these trigger kinds. Making the kind explicit keeps
+# orchestration debuggable: agents only drive `agent` transitions; everything else is automation.
+triggers:
+  kinds:
+    agent: "An agent role performs the work and reports the result."
+    script: "A poller/reconciler script asserts state from GitHub on a cadence."
+    auto: "A deterministic, immediate transition (no work, e.g. legacy normalization)."
+    event: "A GitHub webhook event maps directly to a state (see event_to_state)."
+    human: "A person performs a low-reversibility action (e.g. merge approval)."
+# Reactive transitions: GitHub webhook events map straight to a state (low latency), while the
+# pollers below repair anything missed (self-healing). Adopted from epichan.
+event_to_state:
+  "pull_request.review_comment": implementing   # reviewer asked for changes -> back to dev
+  "check_suite.completed.failure": implementing  # CI went red -> fix it
+  "pull_request.synchronize": reviewing          # new commits pushed -> re-review
+  "conflict_detected": implementing
+# Concurrency-safe transitions: a state change must declare the status it expects to replace
+# (compare-and-set). A mismatch (409) means another agent/poller already moved it — skip, don't
+# clobber. This is what prevents double-work in a multi-agent pool.
+transitions_policy:
+  require_expected_status: true   # every transition passes --expected-status; 409 => safe skip
+# --- Reconciliation & recovery (principle 3, 10) -----------------------------
+# GitHub is the system of record; pollers re-assert engine state from it on a cadence, and a
+# recovery sweep re-queues abandoned work. Latency vs. cost is tuned per cadence.
+reconciliation:
+  poll_ready_issues_seconds: 60     # ingest newly-ready issues
+  poll_task_prs_seconds: 300        # route open task PRs by live health
+  reconcile_merged_seconds: 120     # only mark merged once GitHub says merged
+recovery:
+  enabled: true
+  scan_seconds: 120
+  claim_grace_seconds: 300          # don't reclaim a task until the claim has had time to start
+  stuck_after_hours: 4              # past this with no progress -> requeue
+  requeue_to: queued
+# Progress visibility: post a GitHub comment on these lifecycle events so humans can follow along.
+issue_comments:
+  on: [task_created, state_transition, task_failed, pr_opened]
+# --- Model tiers (principle 2) -----------------------------------------------
+# Roles reference a tier, not a hardcoded model, so you can re-route cost/quality in one place.
+# Keep adjacent pipeline roles on DIFFERENT model families (single-model-bias mitigation).
+model_tiers:
+  cheap:    { models: [claude-haiku-4.5, gpt-5-mini], use: "triage, high-volume analysis" }
+  standard: { models: [claude-sonnet-4.5, gpt-5.4],   use: "implementation" }
+  premium:  { models: [claude-opus-4.5, gpt-5],       use: "review, judging, auditing" }

package/knowledge/templates/mcp/mcp.json ADDED Viewed

@@ -0,0 +1,12 @@
+{
+  "$comment": "AgentRig MCP config (principle 11). The same servers should be mirrored to .vscode/mcp.json and .github/copilot/mcp.json so every vendor CLI sees the same tools.",
+  "mcpServers": {
+    "github": {
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-github"],
+      "env": {
+        "GITHUB_PERSONAL_ACCESS_TOKEN": "${GITHUB_PERSONAL_ACCESS_TOKEN}"
+      }
+    }
+  }
+}

package/knowledge/templates/rules/README.md ADDED Viewed

@@ -0,0 +1,32 @@
+# Rules (principle 4)
+Rules are **reflexes**: short, glob-scoped instructions auto-loaded when a matching file is edited.
+Unlike skills (which are procedures you invoke), rules apply passively to every edit in scope.
+## Priority order
+Each rule declares a `priority` in its frontmatter. When multiple rules match, lower numbers win on
+conflict:
+1. **Specialized / security** (`security.md`, framework- or area-specific rules) — `priority: 1`
+2. **Review & accessibility** (`code-review.md`, any a11y rules) — `priority: 2`
+3. **Baseline coding standards** (`coding-standards.md`, `no-debug-logging.md`) — `priority: 3`
+## Default rules installed
+- `security.md` — secrets, input validation, injection, least privilege (priority 1).
+- `code-review.md` — what a reviewer should/shouldn't flag, to keep review high-signal (priority 2).
+- `coding-standards.md` — baseline change discipline (priority 3).
+- `no-debug-logging.md` — no stray debug output/`debugger` in committed code (priority 3).
+## Authoring a rule
+Start each rule with frontmatter declaring its glob scope, a one-line description, and a priority:
+```markdown
+---
+globs: ["src/**/*.ts"]
+description: One-line summary of the reflex.
+priority: 1
+---
+```
+Keep rules to a handful of imperative bullets. If a rule grows into a procedure, promote it to a
+skill under `.agents/skills/`. Replace these generic defaults with repo-specific standards and add
+specialized, glob-scoped rules alongside them.

package/knowledge/templates/rules/code-review.md ADDED Viewed

@@ -0,0 +1,26 @@
+---
+globs: ["**/*"]
+description: What an AI reviewer should and should not flag. Keeps review high-signal.
+priority: 2
+---
+# Code review rules (reflex)
+When reviewing a diff, keep signal high. **Do NOT:**
+- Suggest adding comments/JSDoc/docstrings to self-explanatory code.
+- Comment on formatting/style handled by a formatter or linter.
+- Flag missing imports or type errors the compiler/build already catches.
+- Suggest adding `console.log`/`print`/debug statements.
+- Claim the build will fail when CI is green.
+- Bikeshed naming when the existing name is clear enough.
+- Re-review unchanged lines or restate what the diff obviously does.
+**DO flag (these are the point of review):**
+- Correctness bugs, logic errors, off-by-ones, unhandled edge cases.
+- Security issues (see `security.md`) and unsafe input handling.
+- Concurrency/race conditions and resource leaks.
+- Missing or wrong tests for changed behavior.
+- Public API/contract breaks and likely regressions.
+Score findings by confidence and **only surface blocking issues plus genuinely useful suggestions**.
+If you would request changes, give a concrete, testable reason.

package/knowledge/templates/rules/coding-standards.md ADDED Viewed

@@ -0,0 +1,15 @@
+---
+globs: ["**/*"]
+description: Baseline coding standards applied to every change in this repo.
+priority: 3
+---
+# Coding standards (reflex)
+- Make the smallest change that fully solves the task; do not refactor unrelated code.
+- Match the surrounding style; do not introduce a new formatter/linter without being asked.
+- No secrets in source. No disabling tests to make CI green.
+- Add or update tests for behavior you change.
+- Prefer clear names over comments; comment only non-obvious intent.
+> AgentRig installs this as a generic baseline. Replace it with repo-specific standards and add
+> specialized, glob-scoped rules alongside it.

package/knowledge/templates/rules/no-debug-logging.md ADDED Viewed

@@ -0,0 +1,16 @@
+---
+globs: ["**/*"]
+description: No stray debug output or debugger statements in committed code.
+priority: 3
+---
+# No debug logging left behind (reflex)
+- Don't commit `console.log`/`console.debug`, `print`, `dbg!`, `fmt.Println` debug spew, or
+  `debugger;` statements added while investigating.
+- Use the project's existing logger/abstraction for intentional, structured logging — match what the
+  surrounding code already uses; don't introduce a new logging mechanism unasked.
+- Temporary diagnostics are fine while iterating, but remove them before `self-verify`/handoff.
+This is a baseline. If the repo has a specific logger convention, encode it as a specialized,
+glob-scoped rule that takes priority over this one.

package/knowledge/templates/rules/security.md ADDED Viewed

@@ -0,0 +1,23 @@
+---
+globs: ["**/*"]
+description: Security reflexes applied to every change. Specialized — highest priority.
+priority: 1
+---
+# Security rules (reflex)
+Apply on every edit. When in doubt, stop and flag rather than guess.
+- **No secrets in source.** Never commit tokens, keys, passwords, or connection strings. Read them
+  from environment/secret stores. If you spot a committed secret, stop and report it.
+- **Validate and sanitize all external input** (request bodies, query params, CLI args, file
+  contents, env). Reject/normalize before use.
+- **No injection.** Use parameterized queries; never string-concatenate SQL/shell/HTML from input.
+  Avoid `eval`, dynamic `require`, and shelling out with unsanitized input.
+- **Escape on output** to prevent XSS; use the framework's escaping, not hand-rolled.
+- **Least privilege.** Don't broaden file, network, or token scopes to make something work.
+- **Don't disable security controls** (auth checks, CSRF, TLS verification, lint security rules) to
+  pass a test or unblock a build.
+- **Dependencies:** prefer maintained, pinned versions; don't add a dependency just to avoid
+  writing a few lines of code.
+If a change touches auth, crypto, or input boundaries, call it out explicitly for review.