runcap 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -9
- package/bin/runcap.mjs +153 -0
- package/examples/outcome-demo/agent-fixes.mjs +24 -0
- package/examples/outcome-demo/agent-spins.mjs +20 -0
- package/examples/outcome-demo/broken.mjs +5 -0
- package/examples/outcome-demo/verify.mjs +7 -0
- package/package.json +11 -2
- package/scripts/guard-test.mjs +76 -0
- package/scripts/make-demo-svg.mjs +20 -20
- package/scripts/mission-test.mjs +148 -0
- package/scripts/outcome-test.mjs +48 -0
- package/scripts/policy-test.mjs +121 -0
- package/scripts/render-media-screenshots.mjs +37 -0
- package/src/mission-control.mjs +441 -1
- package/src/policy.mjs +208 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Proves `runcap outcome --guard` grades verification trust, not just pass/fail.
|
|
2
|
+
// Three real runs through the cap gateway (mock upstream) inside a throwaway git
|
|
3
|
+
// repo: an honest fix earns VERIFIED_STRONG; an agent that rewrites the verifier
|
|
4
|
+
// to force a green earns VERIFIER_COMPROMISED; a no-fix run stays UNVERIFIED.
|
|
5
|
+
// The whole point of the guard is that the middle case must NOT read as VERIFIED.
|
|
6
|
+
|
|
7
|
+
import os from "node:os";
|
|
8
|
+
import path from "node:path";
|
|
9
|
+
import { fileURLToPath } from "node:url";
|
|
10
|
+
import { execFileSync } from "node:child_process";
|
|
11
|
+
import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
|
|
12
|
+
|
|
13
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
|
|
14
|
+
|
|
15
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-guard-"));
|
|
16
|
+
process.chdir(tmp);
|
|
17
|
+
|
|
18
|
+
mkdirSync(path.join(tmp, "app"), { recursive: true });
|
|
19
|
+
// broken.mjs starts wrong (ok=false), so the task genuinely fails on baseline.
|
|
20
|
+
writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
|
|
21
|
+
writeFileSync(path.join(tmp, "app", "verify.mjs"),
|
|
22
|
+
"import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
|
|
23
|
+
|
|
24
|
+
// Honest agent: spends, then fixes the source file (an allowed change).
|
|
25
|
+
writeFileSync(path.join(tmp, "agent-fix.mjs"),
|
|
26
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
|
|
27
|
+
"const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
|
|
28
|
+
// Cheating agent: instead of fixing the bug, it rewrites the VERIFIER so it always passes.
|
|
29
|
+
writeFileSync(path.join(tmp, "agent-cheat.mjs"),
|
|
30
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'cheat'}]})}).then(r=>r.text());" +
|
|
31
|
+
"const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
|
|
32
|
+
// No-fix agent: spends, changes nothing.
|
|
33
|
+
writeFileSync(path.join(tmp, "agent-nop.mjs"),
|
|
34
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
|
|
35
|
+
|
|
36
|
+
// Commit a baseline so the guard has a real commit + clean tree to check against.
|
|
37
|
+
const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" });
|
|
38
|
+
g("init", "-q");
|
|
39
|
+
g("config", "user.email", "test@runcap.local");
|
|
40
|
+
g("config", "user.name", "runcap-test");
|
|
41
|
+
g("add", "-A");
|
|
42
|
+
g("commit", "-qm", "baseline");
|
|
43
|
+
|
|
44
|
+
let failures = 0;
|
|
45
|
+
const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
|
|
46
|
+
|
|
47
|
+
const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
|
|
48
|
+
|
|
49
|
+
// Each scenario must start from the committed baseline - otherwise one run's
|
|
50
|
+
// edits (especially the cheat run's rewritten verifier) leak into the next.
|
|
51
|
+
const resetToBaseline = () => { g("checkout", "-f", "HEAD"); g("clean", "-fdq", "-e", ".runcap"); };
|
|
52
|
+
|
|
53
|
+
// 1. Honest fix, guarded → VERIFIED_STRONG.
|
|
54
|
+
const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "guard-fix", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
55
|
+
check("honest fix verifies (exit-code oracle)", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
|
|
56
|
+
check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
|
|
57
|
+
check("strong run reproduced the baseline failure first", fix.receipt.verificationIntegrity.contract.baselineVerifyPassed === false);
|
|
58
|
+
check("strong pass survives a clean checkout", fix.receipt.verificationIntegrity.cleanRoom.passed === true, fix.receipt.verificationIntegrity.cleanRoom.detail);
|
|
59
|
+
|
|
60
|
+
// 2. Cheat by rewriting the verifier → still exit-0 "VERIFIED", but COMPROMISED.
|
|
61
|
+
resetToBaseline();
|
|
62
|
+
const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "guard-cheat", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
63
|
+
check("cheat run's exit code is still 0 (the trap)", cheat.receipt.verify.passed === true);
|
|
64
|
+
check("guard catches the rewritten verifier", cheat.receipt.verificationIntegrity.status === "VERIFIER_COMPROMISED", `status=${cheat.receipt.verificationIntegrity.status}`);
|
|
65
|
+
check("compromised run names the tampered file", cheat.receipt.verificationIntegrity.violations.some((v) => v.startsWith("verifier_file_unchanged:")), JSON.stringify(cheat.receipt.verificationIntegrity.violations));
|
|
66
|
+
|
|
67
|
+
// 3. No-fix, guarded → UNVERIFIED (verify never passed).
|
|
68
|
+
resetToBaseline();
|
|
69
|
+
const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "guard-nop", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
70
|
+
check("no-fix guarded run is UNVERIFIED", nop.receipt.verificationIntegrity.status === "UNVERIFIED", `status=${nop.receipt.verificationIntegrity.status}`);
|
|
71
|
+
|
|
72
|
+
// 4. The honesty note about cost scope rides on every guarded receipt.
|
|
73
|
+
check("receipt states cost scope is LLM-only", /subscriptions/.test(fix.receipt.costScope.note));
|
|
74
|
+
|
|
75
|
+
console.log("\n" + (failures === 0 ? "ALL GUARD TESTS PASSED" : `${failures} GUARD TEST(S) FAILED`));
|
|
76
|
+
process.exit(failures === 0 ? 0 : 1);
|
|
@@ -15,29 +15,29 @@ const C = {
|
|
|
15
15
|
};
|
|
16
16
|
|
|
17
17
|
const lines = [
|
|
18
|
-
{ t: "$ runcap
|
|
19
|
-
{ t: "
|
|
20
|
-
{ t: "
|
|
21
|
-
{ t: "", c: C.text, at: 1.5 },
|
|
22
|
-
{ t: "
|
|
23
|
-
{ t: "
|
|
24
|
-
{ t: "
|
|
25
|
-
{ t: "", c: C.
|
|
26
|
-
{ t: "
|
|
27
|
-
{ t: "
|
|
28
|
-
{ t: "", c: C.
|
|
29
|
-
{ t: "
|
|
30
|
-
{ t: "
|
|
31
|
-
{ t: "", c: C.
|
|
32
|
-
{ t: "
|
|
33
|
-
{ t: "
|
|
18
|
+
{ t: "$ runcap mission run --policy .runcap/mission.yaml -- claude \"fix the failing checkout test\"", c: C.prompt, at: 0.3 },
|
|
19
|
+
{ t: "Policy: checkout · team payments · cap $10 · verify \"npm test\"", c: C.dim, at: 0.9 },
|
|
20
|
+
{ t: "", c: C.text, at: 1.0 },
|
|
21
|
+
{ t: "→ estimate $3 - $7 · hard cap armed at $10", c: C.text, at: 1.5 },
|
|
22
|
+
{ t: "→ compressed 1,186 → 737 tokens on a real call (37.9% saved)", c: C.ok, at: 2.1 },
|
|
23
|
+
{ t: "", c: C.text, at: 2.2 },
|
|
24
|
+
{ t: "✓ verify passed - but did the agent earn it?", c: C.text, at: 2.9 },
|
|
25
|
+
{ t: " · verifier unchanged · baseline truly failed · clean-room replay reproduced", c: C.dim, at: 3.4 },
|
|
26
|
+
{ t: " Verification integrity: VERIFIED_STRONG", c: C.ok, at: 4.0 },
|
|
27
|
+
{ t: " Mission cost $0.0007 / $10.00 · 3 files changed, all in scope", c: C.text, at: 4.6 },
|
|
28
|
+
{ t: " Mission verdict: PASS", c: C.accent, at: 5.2 },
|
|
29
|
+
{ t: "", c: C.text, at: 5.3 },
|
|
30
|
+
{ t: "$ runcap ci --policy .runcap/mission.yaml # the same gate, on the PR", c: C.prompt, at: 6.2 },
|
|
31
|
+
{ t: "✗ agent rewrote app/verify.mjs - protected evidence changed", c: C.bad, at: 6.9 },
|
|
32
|
+
{ t: " Verification integrity: VERIFIER_COMPROMISED", c: C.bad, at: 7.5 },
|
|
33
|
+
{ t: " Mission verdict: BLOCKED → PR check fails, run stopped", c: C.bad, at: 8.1 }
|
|
34
34
|
];
|
|
35
35
|
|
|
36
|
-
const W =
|
|
36
|
+
const W = 980, H = 588;
|
|
37
37
|
const padX = 28, top = 78, lh = 27, fs = 16.5;
|
|
38
38
|
// Escape the three XML-significant characters so a line's text can be embedded
// as the content of an SVG <text> element without breaking the markup.
// Each entity is assembled from its name (rather than written out literally) so
// this line keeps working even if the file passes through an entity-decoding tool.
const esc = (s) => s.replace(/[&<>]/g, (ch) => `&${{ "&": "amp", "<": "lt", ">": "gt" }[ch]};`);
|
|
39
39
|
|
|
40
|
-
const total =
|
|
40
|
+
const total = 11.0; // loop length seconds
|
|
41
41
|
const rows = lines.map((ln, i) => {
|
|
42
42
|
const y = top + i * lh;
|
|
43
43
|
// fade+slide in at ln.at, hold, then reset at end of loop
|
|
@@ -47,7 +47,7 @@ const rows = lines.map((ln, i) => {
|
|
|
47
47
|
${esc(ln.t)}</text>`;
|
|
48
48
|
}).join("\n");
|
|
49
49
|
|
|
50
|
-
const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo:
|
|
50
|
+
const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo: estimate, cap, verify integrity, mission PASS, then a tampered run graded BLOCKED on the PR">
|
|
51
51
|
<defs>
|
|
52
52
|
<linearGradient id="brand" x1="0" y1="0" x2="1" y2="0">
|
|
53
53
|
<stop offset="0" stop-color="#22d3ee"/><stop offset="1" stop-color="#34d399"/>
|
|
@@ -65,7 +65,7 @@ const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" wid
|
|
|
65
65
|
<circle cx="26" cy="28" r="6" fill="#f87171"/>
|
|
66
66
|
<circle cx="48" cy="28" r="6" fill="#fbbf24"/>
|
|
67
67
|
<circle cx="70" cy="28" r="6" fill="#34d399"/>
|
|
68
|
-
<text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap · estimate · cap ·
|
|
68
|
+
<text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap · estimate · cap · verify integrity · mission verdict</text>
|
|
69
69
|
<text x="${W-150}" y="33" fill="url(#brand)" font-family="'JetBrains Mono',monospace" font-weight="700" font-size="15">run·cap</text>
|
|
70
70
|
</g>
|
|
71
71
|
<line x1="0" y1="50" x2="${W}" y2="50" stroke="#1c1c1f"/>
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
// Proves a policy-bound mission grades a real run into a PASS/BLOCKED verdict and
|
|
2
|
+
// that the verdict drives the process exit code (so CI fails on a blocked mission).
|
|
3
|
+
// Everything runs offline through the mock cap gateway inside a throwaway git repo:
|
|
4
|
+
// - an honest fix within scope, under cap → PASS, exit 0
|
|
5
|
+
// - an agent that rewrites the verifier → BLOCKED (VERIFIER_COMPROMISED)
|
|
6
|
+
// - an edit outside the declared allow scope → BLOCKED (out of scope)
|
|
7
|
+
// - a mission whose first call trips the hard cap → BLOCKED (budget guard)
|
|
8
|
+
// It also drives the real `bin/runcap.mjs` so the exit codes and the GitHub
|
|
9
|
+
// Action's `runcap ci` PR summary are tested as a reviewer would see them.
|
|
10
|
+
|
|
11
|
+
import os from "node:os";
|
|
12
|
+
import path from "node:path";
|
|
13
|
+
import { fileURLToPath } from "node:url";
|
|
14
|
+
import { execFileSync } from "node:child_process";
|
|
15
|
+
import { mkdtempSync, writeFileSync, mkdirSync, readFileSync } from "node:fs";
|
|
16
|
+
|
|
17
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
|
|
18
|
+
const BIN = path.join(SRC_DIR, "..", "bin", "runcap.mjs");
|
|
19
|
+
|
|
20
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-mission-"));
|
|
21
|
+
process.chdir(tmp);
|
|
22
|
+
|
|
23
|
+
mkdirSync(path.join(tmp, "app"), { recursive: true });
|
|
24
|
+
mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
|
|
25
|
+
// broken.mjs starts wrong, so the task genuinely fails on baseline.
|
|
26
|
+
writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
|
|
27
|
+
writeFileSync(path.join(tmp, "app", "verify.mjs"),
|
|
28
|
+
"import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
|
|
29
|
+
// A tracked file OUTSIDE the allow scope: editing it must trip the scope check.
|
|
30
|
+
// (The guard reads `git diff --name-status`, which sees tracked edits - the
|
|
31
|
+
// honest unit of "out of scope" for this scenario.)
|
|
32
|
+
writeFileSync(path.join(tmp, "app", "other.mjs"), "export const other = 0;\n");
|
|
33
|
+
|
|
34
|
+
const spend = (msg) =>
|
|
35
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'" + msg + "'}]})}).then(r=>r.text());";
|
|
36
|
+
|
|
37
|
+
// Honest agent: spends, fixes the source file (the one allowed change).
|
|
38
|
+
writeFileSync(path.join(tmp, "agent-fix.mjs"),
|
|
39
|
+
spend("fix it") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
|
|
40
|
+
// Cheating agent: rewrites the VERIFIER so it always passes instead of fixing the bug.
|
|
41
|
+
writeFileSync(path.join(tmp, "agent-cheat.mjs"),
|
|
42
|
+
spend("cheat") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
|
|
43
|
+
// Out-of-scope agent: fixes the bug (verify passes) BUT also edits a tracked file outside `allow`.
|
|
44
|
+
writeFileSync(path.join(tmp, "agent-scope.mjs"),
|
|
45
|
+
spend("scope") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');await writeFile('app/other.mjs','export const other = 1;\\n');\n");
|
|
46
|
+
|
|
47
|
+
// The mission policy a reviewer commits to the repo.
|
|
48
|
+
const POLICY = `version: v1
|
|
49
|
+
identity:
|
|
50
|
+
project: checkout
|
|
51
|
+
team: payments
|
|
52
|
+
mission:
|
|
53
|
+
name: Fix the failing checkout test
|
|
54
|
+
task_class: bugfix
|
|
55
|
+
budget:
|
|
56
|
+
mission_hard_limit_usd: 5
|
|
57
|
+
max_llm_calls: 12
|
|
58
|
+
verification:
|
|
59
|
+
command: "node app/verify.mjs"
|
|
60
|
+
guard: strict
|
|
61
|
+
protect: ["app/verify.mjs"]
|
|
62
|
+
allow: ["app/broken.mjs"]
|
|
63
|
+
`;
|
|
64
|
+
writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), POLICY);
|
|
65
|
+
|
|
66
|
+
// A second policy with a hair-thin cap, so the gateway trips the budget guard pre-flight.
|
|
67
|
+
const TINY_POLICY = POLICY.replace("mission_hard_limit_usd: 5", "mission_hard_limit_usd: 0.0000001");
|
|
68
|
+
writeFileSync(path.join(tmp, ".runcap", "mission-tiny.yaml"), TINY_POLICY);
|
|
69
|
+
|
|
70
|
+
// Commit a baseline so the guard has a real commit + clean tree to check against.
|
|
71
|
+
const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" });
|
|
72
|
+
g("init", "-q");
|
|
73
|
+
g("config", "user.email", "test@runcap.local");
|
|
74
|
+
g("config", "user.name", "runcap-test");
|
|
75
|
+
g("add", "-A");
|
|
76
|
+
g("commit", "-qm", "baseline");
|
|
77
|
+
|
|
78
|
+
let failures = 0;
|
|
79
|
+
const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
|
|
80
|
+
|
|
81
|
+
const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
|
|
82
|
+
const { loadPolicy } = await import(path.join(SRC_DIR, "policy.mjs"));
|
|
83
|
+
|
|
84
|
+
// Each scenario starts from the committed baseline so one run's edits (the cheat
|
|
85
|
+
// run's rewritten verifier especially) never leak into the next.
|
|
86
|
+
const resetToBaseline = () => { g("checkout", "-f", "HEAD"); g("clean", "-fdq", "-e", ".runcap"); };
|
|
87
|
+
|
|
88
|
+
const loaded = loadPolicy(tmp);
|
|
89
|
+
|
|
90
|
+
// 1. Honest fix, within scope, under cap → PASS with a strong verification.
|
|
91
|
+
const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-fix", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
|
|
92
|
+
check("honest fix verifies", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
|
|
93
|
+
check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
|
|
94
|
+
check("honest fix mission verdict PASS", fix.receipt.policy?.verdict === "PASS", JSON.stringify(fix.receipt.policy?.reasons));
|
|
95
|
+
check("receipt carries the policy hash", /^[0-9a-f]{64}$/.test(fix.receipt.policy?.hash ?? ""), fix.receipt.policy?.hash);
|
|
96
|
+
check("receipt carries org attribution", fix.receipt.policy?.identity?.project === "checkout" && fix.receipt.policy?.identity?.team === "payments");
|
|
97
|
+
check("receipt bumps to v0.3 schema", fix.receipt.schema === "runcap.outcome-receipt/v0.3", fix.receipt.schema);
|
|
98
|
+
|
|
99
|
+
// 2. Cheat by rewriting the verifier → BLOCKED, VERIFIER_COMPROMISED.
|
|
100
|
+
resetToBaseline();
|
|
101
|
+
const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "mission-cheat", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
|
|
102
|
+
check("cheat run mission verdict BLOCKED", cheat.receipt.policy?.verdict === "BLOCKED", `verdict=${cheat.receipt.policy?.verdict}`);
|
|
103
|
+
check("cheat run names VERIFIER_COMPROMISED", (cheat.receipt.policy?.reasons ?? []).some((r) => r.includes("VERIFIER_COMPROMISED")), JSON.stringify(cheat.receipt.policy?.reasons));
|
|
104
|
+
|
|
105
|
+
// 3. Edit outside the declared scope → BLOCKED, out-of-scope.
|
|
106
|
+
resetToBaseline();
|
|
107
|
+
const scope = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-scope.mjs"], label: "mission-scope", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
|
|
108
|
+
check("out-of-scope run mission verdict BLOCKED", scope.receipt.policy?.verdict === "BLOCKED", `verdict=${scope.receipt.policy?.verdict}`);
|
|
109
|
+
check("out-of-scope run names the scope breach", (scope.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("scope")), JSON.stringify(scope.receipt.policy?.reasons));
|
|
110
|
+
|
|
111
|
+
// 4. A hair-thin cap trips the gateway budget guard → BLOCKED, budget reason.
|
|
112
|
+
resetToBaseline();
|
|
113
|
+
const tinyLoaded = loadPolicy(tmp, ".runcap/mission-tiny.yaml");
|
|
114
|
+
const broke = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-broke", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 0.0000001, policy: tinyLoaded });
|
|
115
|
+
check("tiny cap trips the budget guard", broke.receipt.cost.budgetGuardTripped === true, `tripped=${broke.receipt.cost.budgetGuardTripped}`);
|
|
116
|
+
check("budget trip mission verdict BLOCKED", broke.receipt.policy?.verdict === "BLOCKED", `verdict=${broke.receipt.policy?.verdict}`);
|
|
117
|
+
check("budget trip names the budget guard", (broke.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("budget")), JSON.stringify(broke.receipt.policy?.reasons));
|
|
118
|
+
|
|
119
|
+
// 5. The real bin must exit 0 on PASS and 1 on BLOCKED so CI fails on a bad mission.
|
|
120
|
+
const runBin = (args, extraEnv = {}) => {
|
|
121
|
+
try {
|
|
122
|
+
const stdout = execFileSync("node", [BIN, ...args], { cwd: tmp, env: { ...process.env, ...extraEnv }, stdio: ["ignore", "pipe", "pipe"] });
|
|
123
|
+
return { code: 0, stdout: String(stdout) };
|
|
124
|
+
} catch (e) {
|
|
125
|
+
return { code: e.status ?? 1, stdout: String(e.stdout ?? ""), stderr: String(e.stderr ?? "") };
|
|
126
|
+
}
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
resetToBaseline();
|
|
130
|
+
const binPass = runBin(["mission", "run", "--mock", "--", "node", "agent-fix.mjs"]);
|
|
131
|
+
check("`runcap mission run` exits 0 on a PASS mission", binPass.code === 0, `code=${binPass.code}`);
|
|
132
|
+
check("PASS run prints the verdict", /Mission verdict: PASS/.test(binPass.stdout), binPass.stdout.slice(-200));
|
|
133
|
+
|
|
134
|
+
resetToBaseline();
|
|
135
|
+
const binBlock = runBin(["mission", "run", "--mock", "--", "node", "agent-cheat.mjs"]);
|
|
136
|
+
check("`runcap mission run` exits 1 on a BLOCKED mission", binBlock.code === 1, `code=${binBlock.code}`);
|
|
137
|
+
|
|
138
|
+
// 6. `runcap ci` (the GitHub Action's grader) must write the PR summary and exit 1 on BLOCKED.
|
|
139
|
+
// It grades the latest receipt on disk - which the BLOCKED cheat run just wrote.
|
|
140
|
+
const summaryFile = path.join(tmp, "step-summary.md");
|
|
141
|
+
writeFileSync(summaryFile, "");
|
|
142
|
+
const ci = runBin(["ci", "--policy", ".runcap/mission.yaml"], { GITHUB_STEP_SUMMARY: summaryFile });
|
|
143
|
+
check("`runcap ci` exits 1 when the graded receipt is BLOCKED", ci.code === 1, `code=${ci.code}`);
|
|
144
|
+
const summary = readFileSync(summaryFile, "utf8");
|
|
145
|
+
check("`runcap ci` writes a PR summary to GITHUB_STEP_SUMMARY", /Runcap mission verdict: BLOCKED/.test(summary), summary.slice(0, 200));
|
|
146
|
+
|
|
147
|
+
console.log("\n" + (failures === 0 ? "ALL MISSION TESTS PASSED" : `${failures} MISSION TEST(S) FAILED`));
|
|
148
|
+
process.exit(failures === 0 ? 0 : 1);
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
// Proves runOutcome produces an honest receipt end-to-end through the REAL cap
|
|
2
|
+
// gateway (mock upstream, so no network/keys), for both the VERIFIED and
|
|
3
|
+
// UNVERIFIED cases. The agent spends recorded tokens; the verify command's exit
|
|
4
|
+
// code is the oracle; Verified Outcome Cost is the actual spend only when verify
|
|
5
|
+
// passes. Runs in an isolated temp cwd so it never touches real .runcap data.
|
|
6
|
+
|
|
7
|
+
import os from "node:os";
|
|
8
|
+
import path from "node:path";
|
|
9
|
+
import { fileURLToPath } from "node:url";
|
|
10
|
+
import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
|
|
11
|
+
|
|
12
|
+
// Resolve the engine relative to this script so the test runs from any cwd
|
|
13
|
+
// (it chdir's into a temp dir below, so a relative import would break).
|
|
14
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
|
|
15
|
+
|
|
16
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-outcome-"));
|
|
17
|
+
process.chdir(tmp);
|
|
18
|
+
|
|
19
|
+
// A tiny agent that spends through the gateway and writes (or doesn't write) a fix.
|
|
20
|
+
mkdirSync(path.join(tmp, "app"), { recursive: true });
|
|
21
|
+
writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
|
|
22
|
+
writeFileSync(path.join(tmp, "app", "verify.mjs"),
|
|
23
|
+
"import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
|
|
24
|
+
writeFileSync(path.join(tmp, "agent-fix.mjs"),
|
|
25
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
|
|
26
|
+
"const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
|
|
27
|
+
writeFileSync(path.join(tmp, "agent-nop.mjs"),
|
|
28
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
|
|
29
|
+
|
|
30
|
+
let failures = 0;
|
|
31
|
+
const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
|
|
32
|
+
|
|
33
|
+
const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
|
|
34
|
+
|
|
35
|
+
const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "nop", mock: true });
|
|
36
|
+
check("no-fix run is UNVERIFIED", nop.receipt.outcome === "UNVERIFIED", `outcome=${nop.receipt.outcome}`);
|
|
37
|
+
check("no-fix run still spent real money", nop.receipt.cost.actualCostUsd > 0, `cost=${nop.receipt.cost.actualCostUsd}`);
|
|
38
|
+
check("no-fix Verified Outcome Cost is null", nop.receipt.cost.verifiedOutcomeCostUsd === null);
|
|
39
|
+
check("no-fix counts money without delivery", nop.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd > 0);
|
|
40
|
+
|
|
41
|
+
const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "fix", mock: true });
|
|
42
|
+
check("fix run is VERIFIED", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
|
|
43
|
+
check("fix Verified Outcome Cost equals actual spend", fix.receipt.cost.verifiedOutcomeCostUsd === fix.receipt.cost.actualCostUsd);
|
|
44
|
+
check("fix counts zero undelivered money", fix.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd === 0);
|
|
45
|
+
check("cost truth is calculated from usage + price table", /price_table/.test(fix.receipt.cost.truth));
|
|
46
|
+
|
|
47
|
+
console.log("\n" + (failures === 0 ? "ALL OUTCOME TESTS PASSED" : `${failures} OUTCOME TEST(S) FAILED`));
|
|
48
|
+
process.exit(failures === 0 ? 0 : 1);
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
// Proves src/policy.mjs parses, validates, and grades correctly. Pure unit test:
|
|
2
|
+
// no gateway, no git, no agent - just the policy module over hand-built inputs.
|
|
3
|
+
// Covers: YAML parse + hash, .json fallback, required-field validation, the
|
|
4
|
+
// guard/scope warnings, and every BLOCK condition in evaluatePolicyVerdict.
|
|
5
|
+
|
|
6
|
+
import os from "node:os";
|
|
7
|
+
import path from "node:path";
|
|
8
|
+
import { fileURLToPath } from "node:url";
|
|
9
|
+
import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
|
|
10
|
+
|
|
11
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
|
|
12
|
+
const { loadPolicy, validatePolicy, evaluatePolicyVerdict, policyMeta } = await import(path.join(SRC_DIR, "policy.mjs"));
|
|
13
|
+
|
|
14
|
+
let failures = 0;
|
|
15
|
+
const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
|
|
16
|
+
|
|
17
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-"));
|
|
18
|
+
mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
|
|
19
|
+
|
|
20
|
+
const VALID_YAML = `version: v1
|
|
21
|
+
identity:
|
|
22
|
+
project: checkout
|
|
23
|
+
team: payments
|
|
24
|
+
mission:
|
|
25
|
+
name: Fix the failing checkout test
|
|
26
|
+
task_class: bugfix
|
|
27
|
+
budget:
|
|
28
|
+
mission_hard_limit_usd: 10
|
|
29
|
+
max_llm_calls: 12
|
|
30
|
+
max_runtime_minutes: 30
|
|
31
|
+
verification:
|
|
32
|
+
command: "node app/verify.mjs"
|
|
33
|
+
guard: strict
|
|
34
|
+
protect: ["tests/**"]
|
|
35
|
+
allow: ["src/checkout/**"]
|
|
36
|
+
`;
|
|
37
|
+
|
|
38
|
+
// 1. Valid YAML loads, parses, hashes, validates clean.
|
|
39
|
+
writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), VALID_YAML);
|
|
40
|
+
const loaded = loadPolicy(tmp);
|
|
41
|
+
check("loadPolicy finds .runcap/mission.yaml", loaded && loaded.source.endsWith("mission.yaml"));
|
|
42
|
+
check("loadPolicy computes a sha256 hash", /^[0-9a-f]{64}$/.test(loaded.hash), loaded.hash);
|
|
43
|
+
check("valid policy parses mission.name", loaded.policy.mission.name === "Fix the failing checkout test");
|
|
44
|
+
const v1 = validatePolicy(loaded.policy);
|
|
45
|
+
check("valid policy validates ok", v1.ok === true, JSON.stringify(v1.errors));
|
|
46
|
+
check("valid policy with allow has no scope warning", !v1.warnings.some((w) => w.includes("allow is empty")));
|
|
47
|
+
const meta = policyMeta(loaded);
|
|
48
|
+
check("policyMeta carries identity + hash", meta.identity.project === "checkout" && meta.hash === loaded.hash);
|
|
49
|
+
check("policyMeta carries the limits", meta.limits.mission_hard_limit_usd === 10 && meta.limits.max_llm_calls === 12);
|
|
50
|
+
|
|
51
|
+
// 2. .json fallback parses with native JSON.parse (no parser needed).
|
|
52
|
+
const tmp2 = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-json-"));
|
|
53
|
+
mkdirSync(path.join(tmp2, ".runcap"), { recursive: true });
|
|
54
|
+
writeFileSync(path.join(tmp2, ".runcap", "mission.json"), JSON.stringify({
|
|
55
|
+
version: "v1",
|
|
56
|
+
mission: { name: "json mission" },
|
|
57
|
+
budget: { mission_hard_limit_usd: 5 },
|
|
58
|
+
verification: { command: "npm test" }
|
|
59
|
+
}));
|
|
60
|
+
const jsonLoaded = loadPolicy(tmp2);
|
|
61
|
+
check("loadPolicy reads .json fallback", jsonLoaded && jsonLoaded.source.endsWith("mission.json"));
|
|
62
|
+
check("json policy validates ok", validatePolicy(jsonLoaded.policy).ok === true);
|
|
63
|
+
|
|
64
|
+
// Cases 3-6 exercise validatePolicy directly. Each case lives in its own block
// scope so the destructured result names can be reused without clashing.

// 3. Missing verification.command → invalid.
{
  const { ok, errors } = validatePolicy({
    version: "v1",
    mission: { name: "x" },
    budget: { mission_hard_limit_usd: 1 }
  });
  check("missing verification.command is invalid", ok === false && errors.some((msg) => msg.includes("verification.command")));
}

// 4. Bad version → invalid.
{
  const { ok, errors } = validatePolicy({
    version: "v2",
    mission: { name: "x" },
    budget: { mission_hard_limit_usd: 1 },
    verification: { command: "npm test" }
  });
  check("wrong version is invalid", ok === false && errors.some((msg) => msg.includes("version")));
}

// 5. Missing budget cap → invalid.
{
  const { ok, errors } = validatePolicy({
    version: "v1",
    mission: { name: "x" },
    verification: { command: "npm test" }
  });
  check("missing mission_hard_limit_usd is invalid", ok === false && errors.some((msg) => msg.includes("mission_hard_limit_usd")));
}

// 6. No allow scope → warning (not error): the policy must still be ok.
{
  const { ok, warnings } = validatePolicy({
    version: "v1",
    mission: { name: "x" },
    budget: { mission_hard_limit_usd: 1 },
    verification: { command: "npm test", allow: [] }
  });
  check("empty allow produces a warning", ok === true && warnings.some((msg) => msg.includes("allow is empty")));
}
|
|
79
|
+
|
|
80
|
+
// Cases 7-14 grade receipts with evaluatePolicyVerdict against the policy that
// was loaded earlier in this script. Every failing case is the clean receipt
// with exactly one section overridden, so each check isolates a single reason.
{
  const policy = loaded.policy;
  const grade = (receipt) => evaluatePolicyVerdict(receipt, policy);

  // 7. evaluatePolicyVerdict: a clean VERIFIED receipt → PASS.
  const cleanReceipt = {
    outcome: "VERIFIED",
    verificationIntegrity: { status: "VERIFIED_STRONG", violations: [] },
    cost: { actualCostUsd: 0.0007, llmCalls: 2, budgetGuardTripped: false },
    work: { agentDurationMs: 5000 }
  };
  check("clean receipt grades PASS", grade(cleanReceipt).verdict === "PASS");

  // 8. Compromised verifier → BLOCKED with the reason.
  const compromisedResult = grade({
    ...cleanReceipt,
    verificationIntegrity: { status: "VERIFIER_COMPROMISED", violations: ["verifier_file_unchanged:app/verify.mjs"] }
  });
  check(
    "compromised verifier grades BLOCKED",
    compromisedResult.verdict === "BLOCKED" && compromisedResult.reasons.some((reason) => reason.includes("VERIFIER_COMPROMISED"))
  );

  // 9. UNVERIFIED → BLOCKED.
  const unverifiedResult = grade({
    ...cleanReceipt,
    outcome: "UNVERIFIED",
    verificationIntegrity: { status: "UNVERIFIED", violations: [] }
  });
  check("unverified grades BLOCKED", unverifiedResult.verdict === "BLOCKED");

  // 10. Out-of-allow scope → BLOCKED.
  const scopeResult = grade({
    ...cleanReceipt,
    verificationIntegrity: { status: "VERIFIED_STRONG", violations: ["within_allowed_scope:src/other.mjs"] }
  });
  check(
    "out-of-scope edit grades BLOCKED",
    scopeResult.verdict === "BLOCKED" && scopeResult.reasons.some((reason) => reason.toLowerCase().includes("scope"))
  );

  // 11. Over the dollar cap → BLOCKED.
  const overCostResult = grade({
    ...cleanReceipt,
    cost: { actualCostUsd: 11, llmCalls: 2, budgetGuardTripped: false }
  });
  check("over the cap grades BLOCKED", overCostResult.verdict === "BLOCKED");

  // 12. budget_guard tripped → BLOCKED.
  const guardTripResult = grade({
    ...cleanReceipt,
    cost: { actualCostUsd: 1, llmCalls: 2, budgetGuardTripped: true }
  });
  check("budget guard trip grades BLOCKED", guardTripResult.verdict === "BLOCKED");

  // 13. Too many LLM calls → BLOCKED.
  const tooManyCallsResult = grade({
    ...cleanReceipt,
    cost: { actualCostUsd: 1, llmCalls: 99, budgetGuardTripped: false }
  });
  check("too many llm calls grades BLOCKED", tooManyCallsResult.verdict === "BLOCKED");

  // 14. Over the runtime budget → BLOCKED (agent duration of 31 minutes, in ms).
  const slowResult = grade({
    ...cleanReceipt,
    work: { agentDurationMs: 31 * 60_000 }
  });
  check("over runtime budget grades BLOCKED", slowResult.verdict === "BLOCKED");
}
|
|
119
|
+
|
|
120
|
+
// Final report: print a one-line summary (preceded by a blank line) and make
// the process exit status reflect whether any check failed.
{
  const allPassed = failures === 0;
  const summary = allPassed ? "ALL POLICY TESTS PASSED" : `${failures} POLICY TEST(S) FAILED`;
  console.log(`\n${summary}`);
  process.exit(allPassed ? 0 : 1);
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
// Renders the static HTML media pages under docs/assets/media to PNG
// screenshots using headless Chromium (Playwright).

import { chromium } from "playwright";
import { fileURLToPath, pathToFileURL } from "node:url";
import { dirname, resolve } from "node:path";

// Repository root = parent of this script's directory. Derived from
// import.meta.url rather than import.meta.dirname, which is undefined on Node
// releases that predate it and would make resolve() throw.
const root = resolve(dirname(fileURLToPath(import.meta.url)), "..");
const mediaDir = resolve(root, "docs/assets/media");

// One entry per image: source HTML page, output PNG path, and the viewport
// size (in CSS pixels) the page is rendered at.
const shots = [
  {
    html: resolve(mediaDir, "cover.html"),
    png: resolve(mediaDir, "cover.png"),
    width: 1200,
    height: 630
  },
  {
    html: resolve(mediaDir, "demo.html"),
    png: resolve(mediaDir, "demo.png"),
    width: 1200,
    height: 750
  }
];

const browser = await chromium.launch();
try {
  // Pages are rendered one at a time; each gets a fresh page sized to its shot.
  for (const shot of shots) {
    const page = await browser.newPage({
      viewport: { width: shot.width, height: shot.height },
      deviceScaleFactor: 2
    });
    // Load the local file via a file:// URL and wait for network activity to settle.
    await page.goto(pathToFileURL(shot.html).href, { waitUntil: "networkidle" });
    // fullPage: false → capture only the viewport, not the full scroll height.
    await page.screenshot({ path: shot.png, fullPage: false });
    await page.close();
    console.log(`rendered ${shot.png}`);
  }
} finally {
  // Always shut the browser down, even if a page fails to load or render.
  await browser.close();
}
|