runcap 0.3.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,148 @@
1
+ // Proves a policy-bound mission grades a real run into a PASS/BLOCKED verdict and
2
+ // that the verdict drives the process exit code (so CI fails on a blocked mission).
3
+ // Everything runs offline through the mock cap gateway inside a throwaway git repo:
4
+ // - an honest fix within scope, under cap → PASS, exit 0
5
+ // - an agent that rewrites the verifier → BLOCKED (VERIFIER_COMPROMISED)
6
+ // - an edit outside the declared allow scope → BLOCKED (out of scope)
7
+ // - a mission whose first call trips the hard cap → BLOCKED (budget guard)
8
+ // It also drives the real `bin/runcap.mjs` so the exit codes and the GitHub
9
+ // Action's `runcap ci` PR summary are tested as a reviewer would see them.
10
+
11
+ import os from "node:os";
12
+ import path from "node:path";
13
+ import { fileURLToPath } from "node:url";
14
+ import { execFileSync } from "node:child_process";
15
+ import { mkdtempSync, writeFileSync, mkdirSync, readFileSync } from "node:fs";
16
+
17
+ const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
18
+ const BIN = path.join(SRC_DIR, "..", "bin", "runcap.mjs");
19
+
20
+ const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-mission-"));
21
+ process.chdir(tmp);
22
+
23
+ mkdirSync(path.join(tmp, "app"), { recursive: true });
24
+ mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
25
+ // broken.mjs starts wrong, so the task genuinely fails on baseline.
26
+ writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
27
+ writeFileSync(path.join(tmp, "app", "verify.mjs"),
28
+ "import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
29
+ // A tracked file OUTSIDE the allow scope: editing it must trip the scope check.
30
+ // (The guard reads `git diff --name-status`, which sees tracked edits - the
31
+ // honest unit of "out of scope" for this scenario.)
32
+ writeFileSync(path.join(tmp, "app", "other.mjs"), "export const other = 0;\n");
33
+
34
+ const spend = (msg) => // Returns agent-script source that POSTs one gpt-4o chat completion (user content = msg) to OPENAI_BASE_URL + /chat/completions and reads the reply as text. msg is spliced unescaped into a single-quoted literal, so it must not contain a single quote or backslash.
35
+ "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'" + msg + "'}]})}).then(r=>r.text());";
36
+
37
+ // Honest agent: spends, fixes the source file (the one allowed change).
38
+ writeFileSync(path.join(tmp, "agent-fix.mjs"),
39
+ spend("fix it") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
40
+ // Cheating agent: rewrites the VERIFIER so it always passes instead of fixing the bug.
41
+ writeFileSync(path.join(tmp, "agent-cheat.mjs"),
42
+ spend("cheat") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
43
+ // Out-of-scope agent: fixes the bug (verify passes) BUT also edits a tracked file outside `allow`.
44
+ writeFileSync(path.join(tmp, "agent-scope.mjs"),
45
+ spend("scope") + "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');await writeFile('app/other.mjs','export const other = 1;\\n');\n");
46
+
47
+ // The mission policy a reviewer commits to the repo.
48
+ const POLICY = `version: v1
49
+ identity:
50
+ project: checkout
51
+ team: payments
52
+ mission:
53
+ name: Fix the failing checkout test
54
+ task_class: bugfix
55
+ budget:
56
+ mission_hard_limit_usd: 5
57
+ max_llm_calls: 12
58
+ verification:
59
+ command: "node app/verify.mjs"
60
+ guard: strict
61
+ protect: ["app/verify.mjs"]
62
+ allow: ["app/broken.mjs"]
63
+ `;
64
+ writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), POLICY);
65
+
66
+ // A second policy with a hair-thin cap, so the gateway trips the budget guard pre-flight.
67
+ const TINY_POLICY = POLICY.replace("mission_hard_limit_usd: 5", "mission_hard_limit_usd: 0.0000001");
68
+ writeFileSync(path.join(tmp, ".runcap", "mission-tiny.yaml"), TINY_POLICY);
69
+
70
+ // Commit a baseline so the guard has a real commit + clean tree to check against.
71
+ const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" });
72
+ g("init", "-q");
73
+ g("config", "user.email", "test@runcap.local");
74
+ g("config", "user.name", "runcap-test");
75
+ g("add", "-A");
76
+ g("commit", "-qm", "baseline");
77
+
78
+ let failures = 0;
79
+ const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
80
+
81
+ const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
82
+ const { loadPolicy } = await import(path.join(SRC_DIR, "policy.mjs"));
83
+
84
+ // Each scenario starts from the committed baseline so one run's edits (the cheat
85
+ // run's rewritten verifier especially) never leak into the next.
86
+ const resetToBaseline = () => { g("checkout", "-f", "HEAD"); g("clean", "-fdq", "-e", ".runcap"); };
87
+
88
+ const loaded = loadPolicy(tmp);
89
+
90
+ // 1. Honest fix, within scope, under cap → PASS with a strong verification.
91
+ const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-fix", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
92
+ check("honest fix verifies", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
93
+ check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
94
+ check("honest fix mission verdict PASS", fix.receipt.policy?.verdict === "PASS", JSON.stringify(fix.receipt.policy?.reasons));
95
+ check("receipt carries the policy hash", /^[0-9a-f]{64}$/.test(fix.receipt.policy?.hash ?? ""), fix.receipt.policy?.hash);
96
+ check("receipt carries org attribution", fix.receipt.policy?.identity?.project === "checkout" && fix.receipt.policy?.identity?.team === "payments");
97
+ check("receipt bumps to v0.3 schema", fix.receipt.schema === "runcap.outcome-receipt/v0.3", fix.receipt.schema);
98
+
99
+ // 2. Cheat by rewriting the verifier → BLOCKED, VERIFIER_COMPROMISED.
100
+ resetToBaseline();
101
+ const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "mission-cheat", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
102
+ check("cheat run mission verdict BLOCKED", cheat.receipt.policy?.verdict === "BLOCKED", `verdict=${cheat.receipt.policy?.verdict}`);
103
+ check("cheat run names VERIFIER_COMPROMISED", (cheat.receipt.policy?.reasons ?? []).some((r) => r.includes("VERIFIER_COMPROMISED")), JSON.stringify(cheat.receipt.policy?.reasons));
104
+
105
+ // 3. Edit outside the declared scope → BLOCKED, out-of-scope.
106
+ resetToBaseline();
107
+ const scope = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-scope.mjs"], label: "mission-scope", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 5, policy: loaded });
108
+ check("out-of-scope run mission verdict BLOCKED", scope.receipt.policy?.verdict === "BLOCKED", `verdict=${scope.receipt.policy?.verdict}`);
109
+ check("out-of-scope run names the scope breach", (scope.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("scope")), JSON.stringify(scope.receipt.policy?.reasons));
110
+
111
+ // 4. A hair-thin cap trips the gateway budget guard → BLOCKED, budget reason.
112
+ resetToBaseline();
113
+ const tinyLoaded = loadPolicy(tmp, ".runcap/mission-tiny.yaml");
114
+ const broke = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "mission-broke", mock: true, guard: true, protect: ["app/verify.mjs"], allow: ["app/broken.mjs"], capUsd: 0.0000001, policy: tinyLoaded });
115
+ check("tiny cap trips the budget guard", broke.receipt.cost.budgetGuardTripped === true, `tripped=${broke.receipt.cost.budgetGuardTripped}`);
116
+ check("budget trip mission verdict BLOCKED", broke.receipt.policy?.verdict === "BLOCKED", `verdict=${broke.receipt.policy?.verdict}`);
117
+ check("budget trip names the budget guard", (broke.receipt.policy?.reasons ?? []).some((r) => r.toLowerCase().includes("budget")), JSON.stringify(broke.receipt.policy?.reasons));
118
+
119
+ // 5. The real bin must exit 0 on PASS and 1 on BLOCKED so CI fails on a bad mission.
120
+ const runBin = (args, extraEnv = {}) => {
121
+ try {
122
+ const stdout = execFileSync("node", [BIN, ...args], { cwd: tmp, env: { ...process.env, ...extraEnv }, stdio: ["ignore", "pipe", "pipe"] });
123
+ return { code: 0, stdout: String(stdout) };
124
+ } catch (e) {
125
+ return { code: e.status ?? 1, stdout: String(e.stdout ?? ""), stderr: String(e.stderr ?? "") };
126
+ }
127
+ };
128
+
129
+ resetToBaseline();
130
+ const binPass = runBin(["mission", "run", "--mock", "--", "node", "agent-fix.mjs"]);
131
+ check("`runcap mission run` exits 0 on a PASS mission", binPass.code === 0, `code=${binPass.code}`);
132
+ check("PASS run prints the verdict", /Mission verdict: PASS/.test(binPass.stdout), binPass.stdout.slice(-200));
133
+
134
+ resetToBaseline();
135
+ const binBlock = runBin(["mission", "run", "--mock", "--", "node", "agent-cheat.mjs"]);
136
+ check("`runcap mission run` exits 1 on a BLOCKED mission", binBlock.code === 1, `code=${binBlock.code}`);
137
+
138
+ // 6. `runcap ci` (the GitHub Action's grader) must write the PR summary and exit 1 on BLOCKED.
139
+ // It grades the latest receipt on disk - which the BLOCKED cheat run just wrote.
140
+ const summaryFile = path.join(tmp, "step-summary.md");
141
+ writeFileSync(summaryFile, "");
142
+ const ci = runBin(["ci", "--policy", ".runcap/mission.yaml"], { GITHUB_STEP_SUMMARY: summaryFile });
143
+ check("`runcap ci` exits 1 when the graded receipt is BLOCKED", ci.code === 1, `code=${ci.code}`);
144
+ const summary = readFileSync(summaryFile, "utf8");
145
+ check("`runcap ci` writes a PR summary to GITHUB_STEP_SUMMARY", /Runcap mission verdict: BLOCKED/.test(summary), summary.slice(0, 200));
146
+
147
+ console.log("\n" + (failures === 0 ? "ALL MISSION TESTS PASSED" : `${failures} MISSION TEST(S) FAILED`));
148
+ process.exit(failures === 0 ? 0 : 1);
@@ -0,0 +1,48 @@
1
+ // Proves runOutcome produces an honest receipt end-to-end through the REAL cap
2
+ // gateway (mock upstream, so no network/keys), for both the VERIFIED and
3
+ // UNVERIFIED cases. The agent spends recorded tokens; the verify command's exit
4
+ // code is the oracle; Verified Outcome Cost is the actual spend only when verify
5
+ // passes. Runs in an isolated temp cwd so it never touches real .runcap data.
6
+
7
+ import os from "node:os";
8
+ import path from "node:path";
9
+ import { fileURLToPath } from "node:url";
10
+ import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
11
+
12
+ // Resolve the engine relative to this script so the test runs from any cwd
13
+ // (it chdir's into a temp dir below, so a relative import would break).
14
+ const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
15
+
16
+ const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-outcome-"));
17
+ process.chdir(tmp);
18
+
19
+ // A tiny agent that spends through the gateway and writes (or doesn't write) a fix.
20
+ mkdirSync(path.join(tmp, "app"), { recursive: true });
21
+ writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
22
+ writeFileSync(path.join(tmp, "app", "verify.mjs"),
23
+ "import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
24
+ writeFileSync(path.join(tmp, "agent-fix.mjs"),
25
+ "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
26
+ "const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
27
+ writeFileSync(path.join(tmp, "agent-nop.mjs"),
28
+ "const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
29
+
30
+ let failures = 0;
31
+ const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
32
+
33
+ const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
34
+
35
+ const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "nop", mock: true });
36
+ check("no-fix run is UNVERIFIED", nop.receipt.outcome === "UNVERIFIED", `outcome=${nop.receipt.outcome}`);
37
+ check("no-fix run still spent real money", nop.receipt.cost.actualCostUsd > 0, `cost=${nop.receipt.cost.actualCostUsd}`);
38
+ check("no-fix Verified Outcome Cost is null", nop.receipt.cost.verifiedOutcomeCostUsd === null);
39
+ check("no-fix counts money without delivery", nop.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd > 0);
40
+
41
+ const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "fix", mock: true });
42
+ check("fix run is VERIFIED", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
43
+ check("fix Verified Outcome Cost equals actual spend", fix.receipt.cost.verifiedOutcomeCostUsd === fix.receipt.cost.actualCostUsd);
44
+ check("fix counts zero undelivered money", fix.receipt.cost.moneySpentWithoutVerifiedDeliveryUsd === 0);
45
+ check("cost truth is calculated from usage + price table", /price_table/.test(fix.receipt.cost.truth));
46
+
47
+ console.log("\n" + (failures === 0 ? "ALL OUTCOME TESTS PASSED" : `${failures} OUTCOME TEST(S) FAILED`));
48
+ process.exit(failures === 0 ? 0 : 1);
@@ -0,0 +1,121 @@
1
+ // Proves src/policy.mjs parses, validates, and grades correctly. Pure unit test:
2
+ // no gateway, no git, no agent - just the policy module over hand-built inputs.
3
+ // Covers: YAML parse + hash, .json fallback, required-field validation, the
4
+ // guard/scope warnings, and every BLOCK condition in evaluatePolicyVerdict.
5
+
6
+ import os from "node:os";
7
+ import path from "node:path";
8
+ import { fileURLToPath } from "node:url";
9
+ import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
10
+
11
+ const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
12
+ const { loadPolicy, validatePolicy, evaluatePolicyVerdict, policyMeta } = await import(path.join(SRC_DIR, "policy.mjs"));
13
+
14
+ let failures = 0;
15
+ const check = (name, pass, detail) => { if (!pass) failures++; console.log(`${pass ? "PASS" : "FAIL"} ${name}${detail ? " — " + detail : ""}`); };
16
+
17
+ const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-"));
18
+ mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
19
+
20
+ const VALID_YAML = `version: v1
21
+ identity:
22
+ project: checkout
23
+ team: payments
24
+ mission:
25
+ name: Fix the failing checkout test
26
+ task_class: bugfix
27
+ budget:
28
+ mission_hard_limit_usd: 10
29
+ max_llm_calls: 12
30
+ max_runtime_minutes: 30
31
+ verification:
32
+ command: "node app/verify.mjs"
33
+ guard: strict
34
+ protect: ["tests/**"]
35
+ allow: ["src/checkout/**"]
36
+ `;
37
+
38
+ // 1. Valid YAML loads, parses, hashes, validates clean.
39
+ writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), VALID_YAML);
40
+ const loaded = loadPolicy(tmp);
41
+ check("loadPolicy finds .runcap/mission.yaml", loaded && loaded.source.endsWith("mission.yaml"));
42
+ check("loadPolicy computes a sha256 hash", /^[0-9a-f]{64}$/.test(loaded.hash), loaded.hash);
43
+ check("valid policy parses mission.name", loaded.policy.mission.name === "Fix the failing checkout test");
44
+ const v1 = validatePolicy(loaded.policy);
45
+ check("valid policy validates ok", v1.ok === true, JSON.stringify(v1.errors));
46
+ check("valid policy with allow has no scope warning", !v1.warnings.some((w) => w.includes("allow is empty")));
47
+ const meta = policyMeta(loaded);
48
+ check("policyMeta carries identity + hash", meta.identity.project === "checkout" && meta.hash === loaded.hash);
49
+ check("policyMeta carries the limits", meta.limits.mission_hard_limit_usd === 10 && meta.limits.max_llm_calls === 12);
50
+
51
+ // 2. .json fallback parses with native JSON.parse (no parser needed).
52
+ const tmp2 = mkdtempSync(path.join(os.tmpdir(), "runcap-policy-json-"));
53
+ mkdirSync(path.join(tmp2, ".runcap"), { recursive: true });
54
+ writeFileSync(path.join(tmp2, ".runcap", "mission.json"), JSON.stringify({
55
+ version: "v1",
56
+ mission: { name: "json mission" },
57
+ budget: { mission_hard_limit_usd: 5 },
58
+ verification: { command: "npm test" }
59
+ }));
60
+ const jsonLoaded = loadPolicy(tmp2);
61
+ check("loadPolicy reads .json fallback", jsonLoaded && jsonLoaded.source.endsWith("mission.json"));
62
+ check("json policy validates ok", validatePolicy(jsonLoaded.policy).ok === true);
63
+
64
+ // 3. Missing verification.command → invalid.
65
+ const noVerify = validatePolicy({ version: "v1", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 } });
66
+ check("missing verification.command is invalid", noVerify.ok === false && noVerify.errors.some((e) => e.includes("verification.command")));
67
+
68
+ // 4. Bad version → invalid.
69
+ const badVersion = validatePolicy({ version: "v2", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 }, verification: { command: "npm test" } });
70
+ check("wrong version is invalid", badVersion.ok === false && badVersion.errors.some((e) => e.includes("version")));
71
+
72
+ // 5. Missing budget cap → invalid.
73
+ const noBudget = validatePolicy({ version: "v1", mission: { name: "x" }, verification: { command: "npm test" } });
74
+ check("missing mission_hard_limit_usd is invalid", noBudget.ok === false && noBudget.errors.some((e) => e.includes("mission_hard_limit_usd")));
75
+
76
+ // 6. No allow scope → warning (not error).
77
+ const noAllow = validatePolicy({ version: "v1", mission: { name: "x" }, budget: { mission_hard_limit_usd: 1 }, verification: { command: "npm test", allow: [] } });
78
+ check("empty allow produces a warning", noAllow.ok === true && noAllow.warnings.some((w) => w.includes("allow is empty")));
79
+
80
+ // 7. evaluatePolicyVerdict: a clean VERIFIED receipt → PASS.
81
+ const policy = loaded.policy;
82
+ const cleanReceipt = {
83
+ outcome: "VERIFIED",
84
+ verificationIntegrity: { status: "VERIFIED_STRONG", violations: [] },
85
+ cost: { actualCostUsd: 0.0007, llmCalls: 2, budgetGuardTripped: false },
86
+ work: { agentDurationMs: 5000 }
87
+ };
88
+ check("clean receipt grades PASS", evaluatePolicyVerdict(cleanReceipt, policy).verdict === "PASS");
89
+
90
+ // 8. Compromised verifier → BLOCKED with the reason.
91
+ const compromised = { ...cleanReceipt, verificationIntegrity: { status: "VERIFIER_COMPROMISED", violations: ["verifier_file_unchanged:app/verify.mjs"] } };
92
+ const cv = evaluatePolicyVerdict(compromised, policy);
93
+ check("compromised verifier grades BLOCKED", cv.verdict === "BLOCKED" && cv.reasons.some((r) => r.includes("VERIFIER_COMPROMISED")));
94
+
95
+ // 9. UNVERIFIED → BLOCKED.
96
+ const unver = { ...cleanReceipt, outcome: "UNVERIFIED", verificationIntegrity: { status: "UNVERIFIED", violations: [] } };
97
+ check("unverified grades BLOCKED", evaluatePolicyVerdict(unver, policy).verdict === "BLOCKED");
98
+
99
+ // 10. Out-of-allow scope → BLOCKED.
100
+ const scope = { ...cleanReceipt, verificationIntegrity: { status: "VERIFIED_STRONG", violations: ["within_allowed_scope:src/other.mjs"] } };
101
+ const sc = evaluatePolicyVerdict(scope, policy);
102
+ check("out-of-scope edit grades BLOCKED", sc.verdict === "BLOCKED" && sc.reasons.some((r) => r.toLowerCase().includes("scope")));
103
+
104
+ // 11. Over the dollar cap → BLOCKED.
105
+ const overCost = { ...cleanReceipt, cost: { actualCostUsd: 11, llmCalls: 2, budgetGuardTripped: false } };
106
+ check("over the cap grades BLOCKED", evaluatePolicyVerdict(overCost, policy).verdict === "BLOCKED");
107
+
108
+ // 12. budget_guard tripped → BLOCKED.
109
+ const guardTrip = { ...cleanReceipt, cost: { actualCostUsd: 1, llmCalls: 2, budgetGuardTripped: true } };
110
+ check("budget guard trip grades BLOCKED", evaluatePolicyVerdict(guardTrip, policy).verdict === "BLOCKED");
111
+
112
+ // 13. Too many LLM calls → BLOCKED.
113
+ const tooMany = { ...cleanReceipt, cost: { actualCostUsd: 1, llmCalls: 99, budgetGuardTripped: false } };
114
+ check("too many llm calls grades BLOCKED", evaluatePolicyVerdict(tooMany, policy).verdict === "BLOCKED");
115
+
116
+ // 14. Over the runtime budget → BLOCKED.
117
+ const slow = { ...cleanReceipt, work: { agentDurationMs: 31 * 60_000 } };
118
+ check("over runtime budget grades BLOCKED", evaluatePolicyVerdict(slow, policy).verdict === "BLOCKED");
119
+
120
+ console.log("\n" + (failures === 0 ? "ALL POLICY TESTS PASSED" : `${failures} POLICY TEST(S) FAILED`));
121
+ process.exit(failures === 0 ? 0 : 1);
@@ -0,0 +1,37 @@
1
+ import { chromium } from "playwright";
2
+ import { pathToFileURL } from "node:url";
3
+ import { resolve } from "node:path";
4
+
5
+ const root = resolve(import.meta.dirname, ".."); // one level above this script's directory; import.meta.dirname only exists on Node 20.11+ / 21.2+ (undefined on older releases)
6
+ const mediaDir = resolve(root, "docs/assets/media"); // directory holding both the HTML sources and the PNG outputs listed in `shots`
7
+
8
+ const shots = [ // one entry per image to render: source HTML page, destination PNG, and the viewport width/height used for the capture
9
+ {
10
+ html: resolve(mediaDir, "cover.html"),
11
+ png: resolve(mediaDir, "cover.png"),
12
+ width: 1200,
13
+ height: 630
14
+ },
15
+ {
16
+ html: resolve(mediaDir, "demo.html"),
17
+ png: resolve(mediaDir, "demo.png"),
18
+ width: 1200,
19
+ height: 750
20
+ }
21
+ ];
22
+
23
+ const browser = await chromium.launch();
24
+ try {
25
+ for (const shot of shots) {
26
+ const page = await browser.newPage({
27
+ viewport: { width: shot.width, height: shot.height },
28
+ deviceScaleFactor: 2
29
+ });
30
+ await page.goto(pathToFileURL(shot.html).href, { waitUntil: "networkidle" });
31
+ await page.screenshot({ path: shot.png, fullPage: false });
32
+ await page.close();
33
+ console.log(`rendered ${shot.png}`);
34
+ }
35
+ } finally {
36
+ await browser.close();
37
+ }