npm - runcap - Versions diffs - 0.3.0 → 0.5.0 - Mend

runcap 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/README.md +211 -9
package/bin/runcap.mjs +153 -0
package/examples/outcome-demo/agent-fixes.mjs +24 -0
package/examples/outcome-demo/agent-spins.mjs +20 -0
package/examples/outcome-demo/broken.mjs +5 -0
package/examples/outcome-demo/verify.mjs +7 -0
package/package.json +11 -2
package/scripts/guard-test.mjs +76 -0
package/scripts/loop-e2e.mjs +137 -0
package/scripts/loop-test.mjs +45 -1
package/scripts/make-demo-svg.mjs +20 -19
package/scripts/make-linkedin-loop-video.mjs +338 -0
package/scripts/mission-test.mjs +148 -0
package/scripts/outcome-test.mjs +48 -0
package/scripts/policy-test.mjs +121 -0
package/scripts/render-media-screenshots.mjs +37 -0
package/src/compressor.mjs +77 -9
package/src/mission-control.mjs +475 -8
package/src/policy.mjs +208 -0

package/scripts/guard-test.mjs ADDED Viewed

@@ -0,0 +1,76 @@
+// Proves `runcap outcome --guard` grades verification trust, not just pass/fail.
+// Three real runs through the cap gateway (mock upstream) inside a throwaway git
+// repo: an honest fix earns VERIFIED_STRONG; an agent that rewrites the verifier
+// to force a green earns VERIFIER_COMPROMISED; a no-fix run stays UNVERIFIED.
+// The whole point of the guard is that the middle case must NOT read as VERIFIED.
+import os from "node:os";
+import path from "node:path";
+import { fileURLToPath, pathToFileURL } from "node:url";
+import { execFileSync } from "node:child_process";
+import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
+const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src"); // RUNCAP_SRC overrides the default: the src directory one level above this script
+const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-guard-")); // fresh throwaway directory under the OS temp dir
+process.chdir(tmp); // everything below runs with tmp as the working directory
+mkdirSync(path.join(tmp, "app"), { recursive: true });
+// broken.mjs starts wrong (ok=false), so the task genuinely fails on baseline: verify.mjs asserts ok === true and only prints 'ok' when that holds.
+writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
+writeFileSync(path.join(tmp, "app", "verify.mjs"),
+  "import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
+// Honest agent: spends (one POST to ${OPENAI_BASE_URL}/chat/completions), then fixes the source file by rewriting app/broken.mjs to ok = true (an allowed change).
+writeFileSync(path.join(tmp, "agent-fix.mjs"),
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
+  "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
+// Cheating agent: makes the same POST, but instead of fixing the bug it rewrites the VERIFIER (app/verify.mjs) so it always passes.
+writeFileSync(path.join(tmp, "agent-cheat.mjs"),
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'cheat'}]})}).then(r=>r.text());" +
+  "const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
+// No-fix agent: spends (same POST), prints 'no fix', and changes no files.
+writeFileSync(path.join(tmp, "agent-nop.mjs"),
+  "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
+// Commit a baseline so the guard has a real commit + clean tree to check against.
+const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" }); // runs git inside tmp with its output captured rather than printed; execFileSync throws if git exits non-zero
+g("init", "-q");
+g("config", "user.email", "test@runcap.local");
+g("config", "user.name", "runcap-test");
+g("add", "-A");
+g("commit", "-qm", "baseline");
+let failures = 0;
+const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"}  ${name}${detail ? "  — " + detail : ""}`); };
+const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
+// Each scenario must start from the committed baseline - otherwise one run's
+// edits (especially the cheat run's rewritten verifier) leak into the next.
+const resetToBaseline = () => { g("checkout", "-f", "HEAD"); g("clean", "-fdq", "-e", ".runcap"); }; // restore tracked files to HEAD, then delete untracked files/dirs except anything matching .runcap
+// 1. Honest fix, guarded → VERIFIED_STRONG (runs first, directly on the freshly committed baseline).
+const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "guard-fix", mock: true, guard: true, allow: ["app/broken.mjs"] });
+check("honest fix verifies (exit-code oracle)", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
+check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
+check("strong run reproduced the baseline failure first", fix.receipt.verificationIntegrity.contract.baselineVerifyPassed === false);
+check("strong pass survives a clean checkout", fix.receipt.verificationIntegrity.cleanRoom.passed === true, fix.receipt.verificationIntegrity.cleanRoom.detail);
+// 2. Cheat by rewriting the verifier → still exit-0 "VERIFIED", but COMPROMISED, with a violation entry prefixed "verifier_file_unchanged:".
+resetToBaseline();
+const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "guard-cheat", mock: true, guard: true, allow: ["app/broken.mjs"] });
+check("cheat run's exit code is still 0 (the trap)", cheat.receipt.verify.passed === true);
+check("guard catches the rewritten verifier", cheat.receipt.verificationIntegrity.status === "VERIFIER_COMPROMISED", `status=${cheat.receipt.verificationIntegrity.status}`);
+check("compromised run names the tampered file", cheat.receipt.verificationIntegrity.violations.some((v) => v.startsWith("verifier_file_unchanged:")), JSON.stringify(cheat.receipt.verificationIntegrity.violations));
+// 3. No-fix, guarded → UNVERIFIED (verify never passed).
+resetToBaseline();
+const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "guard-nop", mock: true, guard: true, allow: ["app/broken.mjs"] });
+check("no-fix guarded run is UNVERIFIED", nop.receipt.verificationIntegrity.status === "UNVERIFIED", `status=${nop.receipt.verificationIntegrity.status}`);
+// 4. The cost-scope honesty note is present on a guarded receipt (checked on the honest-fix run only: its costScope.note must mention "subscriptions").
+check("receipt states cost scope is LLM-only", /subscriptions/.test(fix.receipt.costScope.note));
+console.log("\n" + (failures === 0 ? "ALL GUARD TESTS PASSED" : `${failures} GUARD TEST(S) FAILED`));
+process.exit(failures === 0 ? 0 : 1); // exit code 1 when any check failed, 0 otherwise

package/scripts/loop-e2e.mjs ADDED Viewed

@@ -0,0 +1,137 @@
+// End-to-end proof that the response-side loop gate works through the REAL
+// gateway over HTTP, not just in unit tests. We stand up a tiny local "upstream"
+// that returns a caller-chosen error string, point the real Runcap gateway at
+// it, and drive near-identical prompts through the wire:
+//   A) error CHANGES each turn (convergence)  -> gateway must NOT flag a loop
+//   B) error STAYS the same each turn (circling) -> gateway MUST flag a loop
+// The gateway records its loop verdict per call in the gateway event log, which
+// we read back to assert the real server behaved correctly.
+//
+// Pure Node, no framework. Exits non-zero on any failure so it can gate CI.
+import http from "node:http";
+import os from "node:os";
+import path from "node:path";
+import { mkdtempSync, readFileSync, existsSync } from "node:fs";
+// Isolate all gateway state (the .runcap event log lives under cwd) in a
+// throwaway dir so this never touches real data. The gateway writes its event
+// log to ./.runcap, so we chdir into the temp dir before starting it.
+const tmpHome = mkdtempSync(path.join(os.tmpdir(), "runcap-e2e-"));
+process.chdir(tmpHome);
+process.env.AIM_COMPRESS = "off";      // keep the wire bytes predictable
+process.env.AIM_LOOP_DETECT = "on";    // NOTE(review): presumably switches the gateway's loop detector on — confirm against the gateway's env handling
+// A controllable upstream: returns an OpenAI-shaped completion whose assistant
+// text is whatever error we tell it to via a field in the request body. We use
+// the body (not a header) on purpose: the gateway forwards the request body
+// upstream but rewrites headers, so the body is the channel that actually
+// reaches this stub through the real gateway.
+const upstream = http.createServer((req, res) => {
+  let body = "";
+  req.on("data", (c) => (body += c));
+  req.on("end", () => {
+    let err = "default error";
+    try { err = JSON.parse(body)?.mock_error ?? err; } catch {}
+    const payload = {
+      id: "chatcmpl-stub",
+      object: "chat.completion",
+      created: Math.floor(Date.now() / 1000),
+      model: "stub-model",
+      choices: [{ index: 0, message: { role: "assistant", content: String(err) }, finish_reason: "stop" }],
+      usage: { prompt_tokens: 50, completion_tokens: 10, total_tokens: 60 }
+    };
+    res.writeHead(200, { "content-type": "application/json" });
+    res.end(JSON.stringify(payload));
+  });
+});
+async function listen(server, port = 0) {
+  await new Promise((r) => server.listen(port, "127.0.0.1", r));
+  return server.address().port;
+}
+let failures = 0;
+function check(name, pass, detail) {
+  if (!pass) failures++;
+  console.log(`${pass ? "PASS" : "FAIL"}  ${name}${detail ? "  — " + detail : ""}`);
+}
+const stableTail = [
+  "You are a coding agent. Fix the failing build.",
+  ...Array.from({ length: 40 }, (_, i) => `context line ${i}: prior file content the agent keeps resending`)
+].join("\n");
+async function send(port, wording, mockError) {
+  // mock_error rides in the body so it survives the gateway's header rewrite and
+  // reaches the upstream stub, which echoes it back as the assistant response.
+  const body = JSON.stringify({
+    model: "stub-model",
+    mock_error: mockError,
+    messages: [{ role: "user", content: stableTail + "\nLet me try this: " + wording }]
+  });
+  const res = await fetch(`http://127.0.0.1:${port}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "content-type": "application/json" },
+    body
+  });
+  await res.text();
+}
+function readEvents() {
+  const log = path.join(tmpHome, ".runcap", "gateway-events.jsonl");
+  if (!existsSync(log)) return [];
+  return readFileSync(log, "utf8").trim().split("\n").filter(Boolean).map((l) => JSON.parse(l));
+}
+// Loop verdicts accumulate across both scenarios in one shared gateway process
+// (the shape history is per-process), so each scenario asserts against only the
+// events it produced. We snapshot the event count before scenario B.
+const run = async () => {
+  const upstreamPort = await listen(upstream);
+  process.env.AIM_UPSTREAM_BASE_URL = `http://127.0.0.1:${upstreamPort}/v1`;
+  process.env.AIM_UPSTREAM_API_KEY = "test-key";
+  // Import AFTER env is set so the gateway reads our isolated config.
+  const { startEphemeralGateway } = await import("../src/mission-control.mjs");
+  const gw = await startEphemeralGateway();
+  const gwPort = gw.port;
+  // Scenario A: same prompt framing, but the error MOVES every turn (convergence).
+  for (const [w, e] of [
+    ["guard the undefined", "TypeError: cannot read property 'id' of undefined"],
+    ["optional chain", "TypeError: cannot read property 'name' of undefined"],
+    ["default to {}", "ReferenceError: parser is not defined"],
+    ["try/catch", "AssertionError: expected 200 but got 404"]
+  ]) {
+    await send(gwPort, w, e);
+  }
+  const afterA = readEvents();
+  const aFlagged = afterA.filter((ev) => ev.loop && ev.loop.looping).length;
+  check("E2E convergence (moving error) is NOT flagged through real gateway", aFlagged === 0,
+    `loops flagged in scenario A=${aFlagged}`);
+  // Scenario B: same prompt framing AND the SAME error every turn (circling).
+  const stuck = "TypeError: cannot read property 'id' of undefined";
+  for (const w of ["attempt one", "attempt two reworded", "attempt three reworded", "attempt four reworded", "attempt five reworded"]) {
+    await send(gwPort, w, stuck);
+  }
+  const afterB = readEvents().slice(afterA.length); // only scenario-B events
+  const bFlagged = afterB.filter((ev) => ev.loop && ev.loop.looping).length;
+  check("E2E circling (stuck error) IS flagged through real gateway", bFlagged > 0,
+    `loops flagged in scenario B=${bFlagged}`);
+  await gw.close();
+  upstream.close();
+};
+run()
+  .then(() => {
+    console.log("\n" + (failures === 0 ? "ALL LOOP E2E TESTS PASSED" : `${failures} LOOP E2E TEST(S) FAILED`));
+    process.exit(failures === 0 ? 0 : 1);
+  })
+  .catch((e) => {
+    console.error("E2E harness error:", e);
+    process.exit(1);
+  });

package/scripts/loop-test.mjs CHANGED Viewed

@@ -7,7 +7,7 @@
 //
 // Pure Node, no test framework. Exits non-zero on any failure so it can gate CI.
-import { detectLoop, requestShapeText } from "../src/compressor.mjs";
+import { detectLoop, requestShapeText, responseSignature } from "../src/compressor.mjs";
 let failures = 0;
 function check(name, pass, detail) {
@@ -80,5 +80,49 @@ function attempt(wording) {
     `openai="${openai}" anthropic="${anthropic}"`);
 }
+// --- Test 6: response-side gate — similar prompts but a MOVING error is NOT a loop ---
+// The edge case raised on the thread: a converging run also sends near-identical
+// prompts (same files, same framing) while it closes in on the fix. The tell is
+// the observation: if the error/test output changes between turns, that's
+// progress, not circling. Prompts are near-identical here, but each response
+// carries a DIFFERENT error, so the gate must keep it from being flagged.
+{
+  const history = [attempt("try A"), attempt("try B"), attempt("try C")];
+  const current = attempt("try D");
+  const responseSignatures = [
+    "TypeError: cannot read property 'id' of undefined",
+    "TypeError: cannot read property 'name' of undefined",
+    "ReferenceError: parser is not defined"
+  ];
+  const currentResponseSignature = "AssertionError: expected 200 but got 404";
+  const r = detectLoop(current, history, { responseSignatures, currentResponseSignature });
+  check("similar prompts but MOVING error are NOT flagged (convergence)", !r.looping && r.responseMoved,
+    `looping=${r.looping}, repeats=${r.repeats}, responseMoved=${r.responseMoved}`);
+}
+// --- Test 7: response-side gate — similar prompts AND a STUCK error IS a loop ---
+// Same near-identical prompts, but the identical error keeps coming back. Now
+// both signals agree the run is circling, so it must still be flagged.
+{
+  const history = [attempt("try A"), attempt("try B"), attempt("try C")];
+  const current = attempt("try D");
+  const sameError = "TypeError: cannot read property 'id' of undefined";
+  const responseSignatures = [sameError, sameError, sameError];
+  const currentResponseSignature = sameError;
+  const r = detectLoop(current, history, { responseSignatures, currentResponseSignature });
+  check("similar prompts AND stuck error ARE flagged as loop", r.looping && !r.responseMoved && r.repeats >= 3,
+    `looping=${r.looping}, repeats=${r.repeats}, responseMoved=${r.responseMoved}`);
+}
+// --- Test 8: responseSignature extracts the error/text from both provider shapes ---
+{
+  const openai = responseSignature({ choices: [{ message: { content: "boom: it failed" } }] });
+  const anthropic = responseSignature({ content: [{ type: "text", text: "boom: it failed" }] });
+  const errEnvelope = responseSignature({ error: { message: "rate limited" } });
+  check("responseSignature reads OpenAI, Anthropic, and error shapes",
+    openai === "boom: it failed" && anthropic === "boom: it failed" && errEnvelope === "rate limited",
+    `openai="${openai}" anthropic="${anthropic}" err="${errEnvelope}"`);
+}
 console.log("\n" + (failures === 0 ? "ALL LOOP TESTS PASSED" : `${failures} LOOP TEST(S) FAILED`));
 process.exit(failures === 0 ? 0 : 1);

package/scripts/make-demo-svg.mjs CHANGED Viewed

@@ -15,28 +15,29 @@ const C = {
 };
 const lines = [
-  { t: "$ runcap plan --fuel 24 -- \"build a small auth feature and verify it\"", c: C.prompt, at: 0.3 },
-  { t: "Estimate:  $3 - $7   (range, not an oracle)", c: C.text, at: 1.1 },
-  { t: "Recommended cap:  $10", c: C.ok, at: 1.5 },
-  { t: "", c: C.text, at: 1.6 },
-  { t: "$ ANTHROPIC_BASE_URL=http://127.0.0.1:8792/v1 \\", c: C.prompt, at: 2.2 },
-  { t: "    AIM_DAILY_BUDGET_USD=10 runcap gateway", c: C.prompt, at: 2.6 },
-  { t: "gateway up  ·  compression on  ·  hard cap armed", c: C.dim, at: 3.2 },
-  { t: "", c: C.text, at: 3.3 },
-  { t: "→ request   10,144 tokens", c: C.text, at: 3.9 },
-  { t: "→ compressed 1,260 tokens   (JSON + logs trimmed, prose untouched)", c: C.ok, at: 4.6 },
-  { t: "", c: C.text, at: 4.7 },
-  { t: "You saved $7.40  ·  would have spent $18.40  ·  cap $10", c: C.accent, at: 5.4 },
-  { t: "", c: C.text, at: 5.5 },
-  { t: "→ next call would cross the ceiling", c: C.text, at: 6.1 },
-  { t: "HTTP 429  budget_guard  — run stopped before money left your account", c: C.bad, at: 6.8 }
+  { t: "$ runcap mission run --policy .runcap/mission.yaml -- claude \"fix the failing checkout test\"", c: C.prompt, at: 0.3 },
+  { t: "Policy: checkout · team payments · cap $10 · verify \"npm test\"", c: C.dim, at: 0.9 },
+  { t: "", c: C.text, at: 1.0 },
+  { t: "→ estimate $3 - $7    ·    hard cap armed at $10", c: C.text, at: 1.5 },
+  { t: "→ compressed 1,186 → 737 tokens on a real call  (37.9% saved)", c: C.ok, at: 2.1 },
+  { t: "", c: C.text, at: 2.2 },
+  { t: "✓ verify passed - but did the agent earn it?", c: C.text, at: 2.9 },
+  { t: "   · verifier unchanged   · baseline truly failed   · clean-room replay reproduced", c: C.dim, at: 3.4 },
+  { t: "   Verification integrity:  VERIFIED_STRONG", c: C.ok, at: 4.0 },
+  { t: "   Mission cost $0.0007 / $10.00   ·   3 files changed, all in scope", c: C.text, at: 4.6 },
+  { t: "   Mission verdict:  PASS", c: C.accent, at: 5.2 },
+  { t: "", c: C.text, at: 5.3 },
+  { t: "$ runcap ci --policy .runcap/mission.yaml      # the same gate, on the PR", c: C.prompt, at: 6.2 },
+  { t: "✗ agent rewrote app/verify.mjs - protected evidence changed", c: C.bad, at: 6.9 },
+  { t: "   Verification integrity:  VERIFIER_COMPROMISED", c: C.bad, at: 7.5 },
+  { t: "   Mission verdict:  BLOCKED      → PR check fails, run stopped", c: C.bad, at: 8.1 }
 ];
-const W = 920, H = 560;
+const W = 980, H = 588;
 const padX = 28, top = 78, lh = 27, fs = 16.5;
 const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
-const total = 8.0; // loop length seconds
+const total = 11.0; // loop length seconds
 const rows = lines.map((ln, i) => {
   const y = top + i * lh;
   // fade+slide in at ln.at, hold, then reset at end of loop
@@ -46,7 +47,7 @@ const rows = lines.map((ln, i) => {
   ${esc(ln.t)}</text>`;
 }).join("\n");
-const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo: plan, cap, compress, stop">
+const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo: estimate, cap, verify integrity, mission PASS, then a tampered run graded BLOCKED on the PR">
   <defs>
     <linearGradient id="brand" x1="0" y1="0" x2="1" y2="0">
       <stop offset="0" stop-color="#22d3ee"/><stop offset="1" stop-color="#34d399"/>
@@ -64,7 +65,7 @@ const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" wid
     <circle cx="26" cy="28" r="6" fill="#f87171"/>
     <circle cx="48" cy="28" r="6" fill="#fbbf24"/>
     <circle cx="70" cy="28" r="6" fill="#34d399"/>
-    <text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap — estimate · cap · compress · rescue</text>
+    <text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap · estimate · cap · verify integrity · mission verdict</text>
     <text x="${W-150}" y="33" fill="url(#brand)" font-family="'JetBrains Mono',monospace" font-weight="700" font-size="15">run·cap</text>
   </g>
   <line x1="0" y1="50" x2="${W}" y2="50" stroke="#1c1c1f"/>