npm - runcap - Versions diffs - 0.3.0 → 0.5.0 - Mend

runcap 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/README.md +211 -9
package/bin/runcap.mjs +153 -0
package/examples/outcome-demo/agent-fixes.mjs +24 -0
package/examples/outcome-demo/agent-spins.mjs +20 -0
package/examples/outcome-demo/broken.mjs +5 -0
package/examples/outcome-demo/verify.mjs +7 -0
package/package.json +11 -2
package/scripts/guard-test.mjs +76 -0
package/scripts/loop-e2e.mjs +137 -0
package/scripts/loop-test.mjs +45 -1
package/scripts/make-demo-svg.mjs +20 -19
package/scripts/make-linkedin-loop-video.mjs +338 -0
package/scripts/mission-test.mjs +148 -0
package/scripts/outcome-test.mjs +48 -0
package/scripts/policy-test.mjs +121 -0
package/scripts/render-media-screenshots.mjs +37 -0
package/src/compressor.mjs +77 -9
package/src/mission-control.mjs +475 -8
package/src/policy.mjs +208 -0

package/scripts/make-linkedin-loop-video.mjs ADDED Viewed

@@ -0,0 +1,338 @@
+// Renders a LinkedIn-ready MP4 for the Runcap loop-detection post.
+// Narrative: a circling agent looks busy but burns money -> Runcap catches the
+// loop in real time -> proven 37.9% compression -> hard cap stops the run.
+// Output: docs/assets/media/runcap-linkedin-loop-demo.mp4
+// Requires: playwright + ffmpeg available on the machine.
+import { spawnSync } from "node:child_process";
+import { mkdirSync, readdirSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join, resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+import { chromium } from "playwright";
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const root = resolve(__dirname, "..");
+const outDir = resolve(root, "docs/assets/media");
+const framesDir = "/private/tmp/runcap-linkedin-loop-frames";
+const outFile = join(outDir, "runcap-linkedin-loop-demo.mp4");
+const width = 1080;
+const height = 1080;
+const fps = 30;
+const duration = 13;
+const frameCount = fps * duration;
+mkdirSync(outDir, { recursive: true });
+mkdirSync(framesDir, { recursive: true });
+for (const file of readdirSync(framesDir)) {
+  if (file.startsWith("frame-") && file.endsWith(".png")) {
+    rmSync(join(framesDir, file));
+  }
+}
+const html = `<!doctype html>
+<html>
+<head>
+  <meta charset="utf-8" />
+  <style>
+    * { box-sizing: border-box; }
+    html, body {
+      margin: 0;
+      width: ${width}px;
+      height: ${height}px;
+      overflow: hidden;
+      background: #f4f6fb;
+      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+      color: #f8fafc;
+    }
+    .stage {
+      width: ${width}px;
+      height: ${height}px;
+      padding: 58px;
+      display: grid;
+      place-items: center;
+      background:
+        radial-gradient(circle at 15% 10%, rgba(167, 139, 250, .2), transparent 32%),
+        radial-gradient(circle at 85% 12%, rgba(34, 211, 238, .16), transparent 34%),
+        linear-gradient(135deg, #eef2ff, #f8fafc);
+    }
+    .card {
+      width: 964px;
+      height: 964px;
+      border-radius: 42px;
+      padding: 42px;
+      background: #080b12;
+      box-shadow: 0 36px 90px rgba(15, 23, 42, .25);
+      position: relative;
+      overflow: hidden;
+    }
+    .card::before {
+      content: "";
+      position: absolute;
+      inset: 0;
+      background:
+        radial-gradient(circle at 50% -10%, rgba(167, 139, 250, .18), transparent 36%),
+        linear-gradient(180deg, rgba(255,255,255,.06), transparent 28%);
+      pointer-events: none;
+    }
+    .top {
+      position: relative;
+      display: flex;
+      justify-content: space-between;
+      align-items: center;
+      color: #94a3b8;
+      font-size: 23px;
+      letter-spacing: -0.02em;
+    }
+    .brand {
+      display: flex;
+      gap: 14px;
+      align-items: center;
+      font-weight: 800;
+      color: #fff;
+      font-size: 30px;
+    }
+    .logo {
+      width: 42px;
+      height: 42px;
+      border-radius: 13px;
+      display: grid;
+      place-items: center;
+      background: linear-gradient(135deg, #22d3ee, #34d399);
+      color: #021014;
+      font-weight: 900;
+    }
+    .pill {
+      border: 1px solid rgba(148, 163, 184, .28);
+      background: rgba(15, 23, 42, .68);
+      color: #cbd5e1;
+      border-radius: 999px;
+      padding: 10px 16px;
+      font-size: 18px;
+      font-weight: 650;
+    }
+    .content {
+      position: relative;
+      height: 818px;
+      padding-top: 44px;
+    }
+    .headline {
+      margin: 0;
+      color: #f8fafc;
+      font-size: 68px;
+      line-height: .98;
+      letter-spacing: -0.06em;
+      max-width: 840px;
+    }
+    .sub {
+      margin-top: 22px;
+      color: #cbd5e1;
+      font-size: 29px;
+      line-height: 1.28;
+      letter-spacing: -0.03em;
+      max-width: 820px;
+    }
+    .accent { color: #67e8f9; }
+    .green { color: #34d399; }
+    .red { color: #fb7185; }
+    .violet { color: #a78bfa; }
+    .mono {
+      font-family: "SF Mono", "JetBrains Mono", Menlo, Consolas, monospace;
+      letter-spacing: -0.04em;
+    }
+    .terminal {
+      margin-top: 38px;
+      border: 1px solid rgba(148, 163, 184, .22);
+      background: rgba(2, 6, 23, .82);
+      border-radius: 24px;
+      padding: 26px;
+      font-size: 24px;
+      line-height: 1.5;
+      color: #dbeafe;
+      box-shadow: inset 0 1px 0 rgba(255,255,255,.05);
+    }
+    .terminal .line { opacity: 1; }
+    .warning {
+      margin-top: 28px;
+      border: 1px solid rgba(167, 139, 250, .4);
+      background: rgba(167, 139, 250, .12);
+      color: #ddd6fe;
+      border-radius: 22px;
+      padding: 22px 26px;
+      font-size: 28px;
+      font-weight: 850;
+      letter-spacing: -0.04em;
+    }
+    .numbers {
+      margin-top: 46px;
+      display: grid;
+      grid-template-columns: 1fr 1fr;
+      gap: 28px;
+      align-items: end;
+    }
+    .number-card {
+      border-radius: 26px;
+      padding: 28px;
+      background: rgba(15, 23, 42, .9);
+      border: 1px solid rgba(148, 163, 184, .22);
+    }
+    .label {
+      color: #94a3b8;
+      font-size: 22px;
+      margin-bottom: 12px;
+      letter-spacing: -0.03em;
+    }
+    .big {
+      font-size: 78px;
+      line-height: .9;
+      font-weight: 900;
+      letter-spacing: -0.08em;
+    }
+    .bar {
+      margin-top: 32px;
+      height: 34px;
+      border-radius: 999px;
+      background: rgba(148, 163, 184, .16);
+      overflow: hidden;
+      border: 1px solid rgba(148, 163, 184, .24);
+    }
+    .fill {
+      height: 100%;
+      width: 37.9%;
+      border-radius: 999px;
+      background: linear-gradient(90deg, #22d3ee, #34d399);
+    }
+    .footer {
+      position: absolute;
+      left: 42px;
+      right: 42px;
+      bottom: 34px;
+      display: flex;
+      justify-content: space-between;
+      align-items: center;
+      color: #94a3b8;
+      font-size: 20px;
+    }
+    .scene {
+      position: absolute;
+      inset: 44px 0 0 0;
+      opacity: 0;
+      transform: translateY(24px) scale(.985);
+      transition: opacity .24s ease, transform .24s ease;
+    }
+    .scene.active {
+      opacity: 1;
+      transform: translateY(0) scale(1);
+    }
+  </style>
+</head>
+<body>
+  <div class="stage">
+    <div class="card">
+      <div class="top">
+        <div class="brand"><div class="logo">R</div> Runcap</div>
+        <div class="pill">local-first AI cost control</div>
+      </div>
+      <div class="content">
+        <section class="scene active" id="s0">
+          <h1 class="headline">Your AI agent looks busy. It is just circling.</h1>
+          <p class="sub">Same failure, reworded every turn. It produces output, makes no progress, and keeps spending your tokens.</p>
+          <div class="terminal mono">
+            <div class="line">attempt 1: guard the undefined with an if check</div>
+            <div class="line">attempt 2: add an optional chain before .id</div>
+            <div class="line">attempt 3: default the object to {} first</div>
+            <div class="line red">test still fails. budget still draining.</div>
+          </div>
+        </section>
+        <section class="scene" id="s1">
+          <h1 class="headline">Plain hashing never catches this.</h1>
+          <p class="sub">The prompt is similar but never byte-identical between loops, so the hash changes every turn and nothing trips.</p>
+          <div class="terminal mono">
+            <div class="line">hash(attempt 1) = a91f...  hash(attempt 2) = c4d2...</div>
+            <div class="line red">different hash every time -&gt; loop invisible</div>
+          </div>
+        </section>
+        <section class="scene" id="s2">
+          <h1 class="headline">Runcap measures similarity, not hashes.</h1>
+          <p class="sub">A local gateway sees every request in real time and compares each prompt's shape against the recent run.</p>
+          <div class="warning">loop: last 3 prompts 97.7% identical, no progress. The agent is circling the same failure.</div>
+          <div class="terminal mono">
+            <div class="line green">$ runcap status</div>
+            <div class="line violet">Loop warning: stepping in before it burns more budget.</div>
+          </div>
+        </section>
+        <section class="scene" id="s3">
+          <h1 class="headline">And it compresses every call it lets through.</h1>
+          <div class="numbers">
+            <div class="number-card">
+              <div class="label">baseline prompt</div>
+              <div class="big red mono">1,186</div>
+              <div class="label">tokens</div>
+            </div>
+            <div class="number-card">
+              <div class="label">with Runcap</div>
+              <div class="big green mono">737</div>
+              <div class="label">tokens</div>
+            </div>
+          </div>
+          <div class="bar"><div class="fill"></div></div>
+          <p class="sub"><span class="green">37.9% saved</span> on a real OpenAI call. The model still answered correctly about the changed line.</p>
+        </section>
+        <section class="scene" id="s4">
+          <h1 class="headline">Estimate. Cap. Compress. Catch the loop.</h1>
+          <p class="sub">Point your OpenAI or Anthropic-compatible tools at the local gateway. When the ceiling is crossed, the next call stops.</p>
+          <div class="terminal mono">
+            <div class="line green">$ AIM_DAILY_BUDGET_USD=10 runcap gateway</div>
+            <div class="line">gateway up · compress on · hard cap armed · loop guard on</div>
+            <div class="line red">HTTP 429 budget_guard</div>
+            <div class="line accent">stopped before money left your account</div>
+          </div>
+        </section>
+      </div>
+      <div class="footer">
+        <span class="mono">npm install -g runcap</span>
+        <span>Free · MIT · 100% local</span>
+      </div>
+    </div>
+  </div>
+  <script>
+    const scenes = [...document.querySelectorAll(".scene")];
+    window.renderFrame = (seconds) => {
+      const index =
+        seconds < 2.8 ? 0 :
+        seconds < 5.2 ? 1 :
+        seconds < 8.2 ? 2 :
+        seconds < 10.6 ? 3 : 4;
+      scenes.forEach((scene, i) => scene.classList.toggle("active", i === index));
+    };
+  </script>
+</body>
+</html>`;
+const browser = await chromium.launch({ headless: true });
+const page = await browser.newPage({ viewport: { width, height }, deviceScaleFactor: 1 });
+await page.setContent(html);
+await page.waitForTimeout(100);
+for (let i = 0; i < frameCount; i += 1) {
+  const seconds = i / fps;
+  await page.evaluate((t) => window.renderFrame(t), seconds);
+  await page.screenshot({ path: join(framesDir, `frame-${String(i).padStart(4, "0")}.png`) });
+}
+await browser.close();
+const ffmpeg = spawnSync("ffmpeg", [
+  "-y",
+  "-framerate", String(fps),
+  "-i", join(framesDir, "frame-%04d.png"),
+  "-c:v", "libx264",
+  "-pix_fmt", "yuv420p",
+  "-movflags", "+faststart",
+  "-crf", "18",
+  outFile
+], { stdio: "inherit" });
+if (ffmpeg.status !== 0) {
+  process.exit(ffmpeg.status ?? 1);
+}
+console.log(`wrote ${outFile}`);

package/scripts/mission-test.mjs ADDED Viewed

@@ -0,0 +1,148 @@
+// Proves a policy-bound mission grades a real run into a PASS/BLOCKED verdict and
+// that the verdict drives the process exit code (so CI fails on a blocked mission).
+// Everything runs offline through the mock cap gateway inside a throwaway git repo:
+//   - an honest fix within scope, under cap        → PASS, exit 0
+//   - an agent that rewrites the verifier           → BLOCKED (VERIFIER_COMPROMISED)
+//   - an edit outside the declared allow scope      → BLOCKED (out of scope)
+//   - a mission whose first call trips the hard cap → BLOCKED (budget guard)
+// It also drives the real `bin/runcap.mjs` so the exit codes and the GitHub
+// Action's `runcap ci` PR summary are tested as a reviewer would see them.
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { execFileSync } from "node:child_process";
+import { mkdtempSync, writeFileSync, mkdirSync, readFileSync } from "node:fs";
+const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
+const BIN = path.join(SRC_DIR, "..", "bin", "runcap.mjs");
+const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-mission-"));
+process.chdir(tmp);
+mkdirSync(path.join(tmp, "app"), { recursive: true });
+mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
+// broken.mjs starts wrong, so the task genuinely fails on baseline.
+writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
+writeFileSync(path.join(tmp, "app", "verify.mjs"),
+  "import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
+// A tracked file OUTSIDE the allow scope: editing it must trip the scope check.
+// (The guard reads `git diff --name-status`, which sees tracked edits - the
+// honest unit of "out of scope" for this scenario.)
+writeFileSync(path.join(tmp, "app", "other.mjs"), "export const other = 0;\n");
+const spend = (msg) => // returns JS source text (not a call): the snippet POSTs one gpt-4o chat completion to $OPENAI_BASE_URL with msg as the user message; msg is spliced in unescaped, so it must not contain a single quote
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'" + msg + "'}]})}).then(r=>r.text());";
+// Honest agent: spends, fixes the source file (the one allowed change).
+writeFileSync(path.join(tmp, "agent-fix.mjs"),
+  spend("fix it") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
+// Cheating agent: rewrites the VERIFIER so it always passes instead of fixing the bug.
+writeFileSync(path.join(tmp, "agent-cheat.mjs"),
+  spend("cheat") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
+// Out-of-scope agent: fixes the bug (verify passes) BUT also edits a tracked file outside `allow`.
+writeFileSync(path.join(tmp, "agent-scope.mjs"),
+  spend("scope") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');await writeFile('app/other.mjs','export const other = 1;\\n');\n");
+// The mission policy a reviewer commits to the repo.
+const POLICY = `version: v1
+identity:
+  project: checkout
+  team: payments
+mission:
+  name: Fix the failing checkout test
+  task_class: bugfix
+budget:
+  mission_hard_limit_usd: 5
+  max_llm_calls: 12
+verification:
+  command: "node app/verify.mjs"
+  guard: strict
+  protect: ["app/verify.mjs"]
+  allow: ["app/broken.mjs"]
+`;
+writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), POLICY);
+// A second policy with a hair-thin cap, so the gateway trips the budget guard pre-flight.
+const TINY_POLICY = POLICY.replace("mission_hard_limit_usd: 5", "mission_hard_limit_usd: 0.0000001");
+writeFileSync(path.join(tmp, ".runcap", "mission-tiny.yaml"), TINY_POLICY);
+// Commit a baseline so the guard has a real commit + clean tree to check against.
+const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" }); // runs git inside the temp repo; execFileSync throws if git exits non-zero
+g("init", "-q");
+g("config", "user.email", "test@runcap.local"); // repo-local identity so the commit below does not depend on global git config
+g("config", "user.name", "runcap-test");
+g("add", "-A"); // stages everything written above: app/, the agent scripts and both .runcap policies
+g("commit", "-qm", "baseline");
+let failures = 0; // count of failed checks; selects the exit code at the end of the script
+const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"}  ${name}${detail ? "  — " + detail : ""}`); }; // records one assertion: prints PASS/FAIL, the name and any truthy detail, and counts failures
+const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
+const { loadPolicy } = await import(path.join(SRC_DIR, "policy.mjs"));
+// Each scenario starts from the committed baseline so one run's edits (the cheat
+// run's rewritten verifier especially) never leak into the next.
+const resetToBaseline = () => { g("checkout", "-f", "HEAD"); g("clean", "-fdq", "-e", ".runcap"); }; // discards tracked edits, then deletes untracked files except those matching .runcap (presumably so receipts written there survive - confirm)
+const loaded = loadPolicy(tmp); // no policy file named here; scenario 4 passes ".runcap/mission-tiny.yaml" explicitly instead
+// 1. Honest fix, within scope, under cap → PASS with a strong verification.
+const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-fix", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
+check("honest fix verifies", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
+check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
+check("honest fix mission verdict PASS", fix.receipt.policy?.verdict === "PASS", JSON.stringify(fix.receipt.policy?.reasons));
+check("receipt carries the policy hash", /^[0-9a-f]{64}$/.test(fix.receipt.policy?.hash ?? ""), fix.receipt.policy?.hash);
+check("receipt carries org attribution", fix.receipt.policy?.identity?.project === "checkout" && fix.receipt.policy?.identity?.team === "payments");
+check("receipt bumps to v0.3 schema", fix.receipt.schema === "runcap.outcome-receipt/v0.3", fix.receipt.schema);
+// 2. Cheat by rewriting the verifier → BLOCKED, VERIFIER_COMPROMISED.
+resetToBaseline();
+const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "mission-cheat", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
+check("cheat run mission verdict BLOCKED", cheat.receipt.policy?.verdict === "BLOCKED", `verdict=${cheat.receipt.policy?.verdict}`);
+check("cheat run names VERIFIER_COMPROMISED", (cheat.receipt.policy?.reasons ?? []).some((r) => r.includes("VERIFIER_COMPROMISED")), JSON.stringify(cheat.receipt.policy?.reasons));
+// 3. Edit outside the declared scope → BLOCKED, out-of-scope.
+resetToBaseline();
+const scope = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-scope.mjs"], label: "mission-scope", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
+check("out-of-scope run mission verdict BLOCKED", scope.receipt.policy?.verdict === "BLOCKED", `verdict=${scope.receipt.policy?.verdict}`);
+check("out-of-scope run names the scope breach", (scope.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("scope")), JSON.stringify(scope.receipt.policy?.reasons));
+// 4. A hair-thin cap trips the gateway budget guard → BLOCKED, budget reason.
+resetToBaseline();
+const tinyLoaded = loadPolicy(tmp, ".runcap/mission-tiny.yaml");
+const broke = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-broke", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 0.0000001, policy: tinyLoaded });
+check("tiny cap trips the budget guard", broke.receipt.cost.budgetGuardTripped === true, `tripped=${broke.receipt.cost.budgetGuardTripped}`);
+check("budget trip mission verdict BLOCKED", broke.receipt.policy?.verdict === "BLOCKED", `verdict=${broke.receipt.policy?.verdict}`);
+check("budget trip names the budget guard", (broke.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("budget")), JSON.stringify(broke.receipt.policy?.reasons));
+// 5. The real bin must exit 0 on PASS and 1 on BLOCKED so CI fails on a bad mission.
+const runBin = (args, extraEnv = {}) => {
+  try {
+    const stdout = execFileSync("node", [BIN, ...args], { cwd: tmp, env: { ...process.env, ...extraEnv }, stdio: ["ignore", "pipe", "pipe"] });
+    return { code: 0, stdout: String(stdout) };
+  } catch (e) {
+    return { code: e.status ?? 1, stdout: String(e.stdout ?? ""), stderr: String(e.stderr ?? "") };
+  }
+};
+resetToBaseline();
+const binPass = runBin(["mission", "run", "--mock", "--", "node", "agent-fix.mjs"]);
+check("`runcap mission run` exits 0 on a PASS mission", binPass.code === 0, `code=${binPass.code}`);
+check("PASS run prints the verdict", /Mission verdict: PASS/.test(binPass.stdout), binPass.stdout.slice(-200));
+resetToBaseline();
+const binBlock = runBin(["mission", "run", "--mock", "--", "node", "agent-cheat.mjs"]);
+check("`runcap mission run` exits 1 on a BLOCKED mission", binBlock.code === 1, `code=${binBlock.code}`);
+// 6. `runcap ci` (the GitHub Action's grader) must write the PR summary and exit 1 on BLOCKED.
+//    It grades the latest receipt on disk - which the BLOCKED cheat run just wrote.
+const summaryFile = path.join(tmp, "step-summary.md");
+writeFileSync(summaryFile, "");
+const ci = runBin(["ci", "--policy", ".runcap/mission.yaml"], { GITHUB_STEP_SUMMARY: summaryFile });
+check("`runcap ci` exits 1 when the graded receipt is BLOCKED", ci.code === 1, `code=${ci.code}`);
+const summary = readFileSync(summaryFile, "utf8");
+check("`runcap ci` writes a PR summary to GITHUB_STEP_SUMMARY", /Runcap mission verdict: BLOCKED/.test(summary), summary.slice(0, 200));
+console.log("\n" + (failures === 0 ? "ALL MISSION TESTS PASSED" : `${failures} MISSION TEST(S) FAILED`));
+process.exit(failures === 0 ? 0 : 1);

package/scripts/outcome-test.mjs ADDED Viewed

@@ -0,0 +1,48 @@
+// Proves runOutcome produces an honest receipt end-to-end through the REAL cap
+// gateway (mock upstream, so no network/keys), for both the VERIFIED and
+// UNVERIFIED cases. The agent spends recorded tokens; the verify command's exit
+// code is the oracle; Verified Outcome Cost is the actual spend only when verify
+// passes. Runs in an isolated temp cwd so it never touches real .runcap data.
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
+// Resolve the engine relative to this script so the test runs from any cwd
+// (it chdir's into a temp dir below, so a relative import would break).
+const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
+const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-outcome-"));
+process.chdir(tmp);
+// A tiny agent that spends through the gateway and writes (or doesn't write) a fix.
+mkdirSync(path.join(tmp, "app"), { recursive: true });
+writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
+writeFileSync(path.join(tmp, "app", "verify.mjs"),
+  "import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
+writeFileSync(path.join(tmp, "agent-fix.mjs"),
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
+  "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
+writeFileSync(path.join(tmp, "agent-nop.mjs"),
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
+let failures = 0; // count of failed checks; selects the exit code at the end of the script
+const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"}  ${name}${detail ? "  — " + detail : ""}`); }; // records one assertion: prints PASS/FAIL, the name and any truthy detail, and counts failures
+const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
+const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "nop", mock: true });
+check("no-fix run is UNVERIFIED", nop.receipt.outcome === "UNVERIFIED", `outcome=${nop.receipt.outcome}`);
+check("no-fix run still spent real money", nop.receipt.cost.actualCostUsd > 0, `cost=${nop.receipt.cost.actualCostUsd}`);
+check("no-fix Verified Outcome Cost is null", nop.receipt.cost.verifiedOutcomeCostUsd === null);
+check("no-fix counts money without delivery", nop.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd > 0);
+const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "fix", mock: true });
+check("fix run is VERIFIED", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
+check("fix Verified Outcome Cost equals actual spend", fix.receipt.cost.verifiedOutcomeCostUsd === fix.receipt.cost.actualCostUsd);
+check("fix counts zero undelivered money", fix.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd === 0);
+check("cost truth is calculated from usage + price table", /price_table/.test(fix.receipt.cost.truth));
+console.log("\n" + (failures === 0 ? "ALL OUTCOME TESTS PASSED" : `${failures} OUTCOME TEST(S) FAILED`));
+process.exit(failures === 0 ? 0 : 1);

package/scripts/policy-test.mjs ADDED Viewed

@@ -0,0 +1,121 @@
+// Proves src/policy.mjs parses, validates, and grades correctly. Pure unit test:
+// no gateway, no git, no agent - just the policy module over hand-built inputs.
+// Covers: YAML parse + hash, .json fallback, required-field validation, the
+// guard/scope warnings, and every BLOCK condition in evaluatePolicyVerdict.
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
+const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
+const { loadPolicy, validatePolicy, evaluatePolicyVerdict, policyMeta } = await import(path.join(SRC_DIR, "policy.mjs"));
+let failures = 0; // count of failed checks; selects the exit code at the end of the script
+const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"}  ${name}${detail ? "  — " + detail : ""}`); }; // records one assertion: prints PASS/FAIL, the name and any truthy detail, and counts failures
+const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-"));
+mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
+const VALID_YAML = `version: v1
+identity:
+  project: checkout
+  team: payments
+mission:
+  name: Fix the failing checkout test
+  task_class: bugfix
+budget:
+  mission_hard_limit_usd: 10
+  max_llm_calls: 12
+  max_runtime_minutes: 30
+verification:
+  command: "node app/verify.mjs"
+  guard: strict
+  protect: ["tests/**"]
+  allow: ["src/checkout/**"]
+`;
+// 1. Valid YAML loads, parses, hashes, validates clean.
+writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), VALID_YAML);
+const loaded = loadPolicy(tmp);
+check("loadPolicy finds .runcap/mission.yaml", loaded && loaded.source.endsWith("mission.yaml"));
+check("loadPolicy computes a sha256 hash", /^[0-9a-f]{64}$/.test(loaded.hash), loaded.hash);
+check("valid policy parses mission.name", loaded.policy.mission.name === "Fix the failing checkout test");
+const v1 = validatePolicy(loaded.policy);
+check("valid policy validates ok", v1.ok === true, JSON.stringify(v1.errors));
+check("valid policy with allow has no scope warning", !v1.warnings.some((w) => w.includes("allow is empty")));
+const meta = policyMeta(loaded);
+check("policyMeta carries identity + hash", meta.identity.project === "checkout" && meta.hash === loaded.hash);
+check("policyMeta carries the limits", meta.limits.mission_hard_limit_usd === 10 && meta.limits.max_llm_calls === 12);
+// 2. .json fallback parses with native JSON.parse (no parser needed).
+const tmp2 = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-json-"));
+mkdirSync(path.join(tmp2, ".runcap"), { recursive: true });
+writeFileSync(path.join(tmp2, ".runcap", "mission.json"), JSON.stringify({
+  version: "v1",
+  mission: { name: "json mission" },
+  budget: { mission_hard_limit_usd: 5 },
+  verification: { command: "npm test" }
+}));
+const jsonLoaded = loadPolicy(tmp2);
+check("loadPolicy reads .json fallback", jsonLoaded && jsonLoaded.source.endsWith("mission.json"));
+check("json policy validates ok", validatePolicy(jsonLoaded.policy).ok === true);
+// 3. Missing verification.command → invalid.
+const noVerify = validatePolicy({ version: "v1", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 } });
+check("missing verification.command is invalid", noVerify.ok === false && noVerify.errors.some((e) => e.includes("verification.command")));
+// 4. Bad version → invalid.
+const badVersion = validatePolicy({ version: "v2", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 }, verification: { command: "npm test" } });
+check("wrong version is invalid", badVersion.ok === false && badVersion.errors.some((e) => e.includes("version")));
+// 5. Missing budget cap → invalid.
+const noBudget = validatePolicy({ version: "v1", mission: { name: "x" }, verification: { command: "npm test" } });
+check("missing mission_hard_limit_usd is invalid", noBudget.ok === false && noBudget.errors.some((e) => e.includes("mission_hard_limit_usd")));
+// 6. No allow scope → warning (not error).
+const noAllow = validatePolicy({ version: "v1", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 }, verification: { command: "npm test", allow: [] } });
+check("empty allow produces a warning", noAllow.ok === true && noAllow.warnings.some((w) => w.includes("allow is empty")));
+// 7. evaluatePolicyVerdict: a clean VERIFIED receipt → PASS.
+const policy = loaded.policy;
+const cleanReceipt = {
+  outcome: "VERIFIED",
+  verificationIntegrity: { status: "VERIFIED_STRONG", violations: [] },
+  cost: { actualCostUsd: 0.0007, llmCalls: 2, budgetGuardTripped: false },
+  work: { agentDurationMs: 5000 }
+};
+check("clean receipt grades PASS", evaluatePolicyVerdict(cleanReceipt, policy).verdict === "PASS");
+// 8. Compromised verifier → BLOCKED with the reason.
+const compromised = { ...cleanReceipt, verificationIntegrity: { status: "VERIFIER_COMPROMISED", violations: ["verifier_file_unchanged:app/verify.mjs"] } };
+const cv = evaluatePolicyVerdict(compromised, policy);
+check("compromised verifier grades BLOCKED", cv.verdict === "BLOCKED" && cv.reasons.some((r) => r.includes("VERIFIER_COMPROMISED")));
+// 9. UNVERIFIED → BLOCKED.
+const unver = { ...cleanReceipt, outcome: "UNVERIFIED", verificationIntegrity: { status: "UNVERIFIED", violations: [] } };
+check("unverified grades BLOCKED", evaluatePolicyVerdict(unver, policy).verdict === "BLOCKED");
+// 10. Out-of-allow scope → BLOCKED.
+const scope = { ...cleanReceipt, verificationIntegrity: { status: "VERIFIED_STRONG", violations: ["within_allowed_scope:src/other.mjs"] } };
+const sc = evaluatePolicyVerdict(scope, policy);
+check("out-of-scope edit grades BLOCKED", sc.verdict === "BLOCKED" && sc.reasons.some((r) => r.toLowerCase().includes("scope")));
+// 11. Over the dollar cap → BLOCKED.
+const overCost = { ...cleanReceipt, cost: { actualCostUsd: 11, llmCalls: 2, budgetGuardTripped: false } };
+check("over the cap grades BLOCKED", evaluatePolicyVerdict(overCost, policy).verdict === "BLOCKED");
+// 12. budget_guard tripped → BLOCKED.
+const guardTrip = { ...cleanReceipt, cost: { actualCostUsd: 1, llmCalls: 2, budgetGuardTripped: true } };
+check("budget guard trip grades BLOCKED", evaluatePolicyVerdict(guardTrip, policy).verdict === "BLOCKED");
+// 13. Too many LLM calls → BLOCKED.
+const tooMany = { ...cleanReceipt, cost: { actualCostUsd: 1, llmCalls: 99, budgetGuardTripped: false } };
+check("too many llm calls grades BLOCKED", evaluatePolicyVerdict(tooMany, policy).verdict === "BLOCKED");
+// 14. Over the runtime budget → BLOCKED.
+const slow = { ...cleanReceipt, work: { agentDurationMs: 31 * 60_000 } };
+check("over runtime budget grades BLOCKED", evaluatePolicyVerdict(slow, policy).verdict === "BLOCKED");
+console.log("\n" + (failures === 0 ? "ALL POLICY TESTS PASSED" : `${failures} POLICY TEST(S) FAILED`));
+process.exit(failures === 0 ? 0 : 1);