runcap 0.3.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -9
- package/bin/runcap.mjs +153 -0
- package/examples/outcome-demo/agent-fixes.mjs +24 -0
- package/examples/outcome-demo/agent-spins.mjs +20 -0
- package/examples/outcome-demo/broken.mjs +5 -0
- package/examples/outcome-demo/verify.mjs +7 -0
- package/package.json +11 -2
- package/scripts/guard-test.mjs +76 -0
- package/scripts/make-demo-svg.mjs +20 -20
- package/scripts/mission-test.mjs +148 -0
- package/scripts/outcome-test.mjs +48 -0
- package/scripts/policy-test.mjs +121 -0
- package/scripts/render-media-screenshots.mjs +37 -0
- package/src/mission-control.mjs +441 -1
- package/src/policy.mjs +208 -0
package/src/mission-control.mjs
CHANGED
|
@@ -8,10 +8,12 @@ import process from "node:process";
|
|
|
8
8
|
import { syncRun } from "./cloud.mjs";
|
|
9
9
|
import { sendAlert } from "./alerts.mjs";
|
|
10
10
|
import { compressRequestBody, estimateTokens, requestShapeText, detectLoop, responseSignature } from "./compressor.mjs";
|
|
11
|
+
import { evaluatePolicyVerdict, policyMeta, formatPolicyBlock } from "./policy.mjs";
|
|
11
12
|
|
|
12
13
|
const STORE_DIR = ".runcap";
|
|
13
14
|
const MISSIONS_DIR = path.join(STORE_DIR, "missions");
|
|
14
15
|
const PLANS_DIR = path.join(STORE_DIR, "plans");
|
|
16
|
+
const OUTCOMES_DIR = path.join(STORE_DIR, "outcomes");
|
|
15
17
|
const FUEL_FILE = path.join(STORE_DIR, "fuel.json");
|
|
16
18
|
const GATEWAY_EVENTS_FILE = path.join(STORE_DIR, "gateway-events.jsonl");
|
|
17
19
|
const BUDGET_FILE = path.join(STORE_DIR, "budget.json");
|
|
@@ -147,6 +149,439 @@ export async function runMission({ command, label, fuelBefore, autoGateway = fal
|
|
|
147
149
|
};
|
|
148
150
|
}
|
|
149
151
|
|
|
152
|
+
// Verified Outcome Cost: run an agent on one task, then run a verification
|
|
153
|
+
// command, and only count the money as "delivered" if verification passes.
|
|
154
|
+
// The unit the rest of the industry ignores: dollars per VERIFIED task, not
|
|
155
|
+
// dollars per token. Reuses the same gateway cost, git-diff, and truth-label
|
|
156
|
+
// machinery as runMission, so every number on the receipt is observed or
|
|
157
|
+
// calculated, never guessed.
|
|
158
|
+
/**
 * Run an agent on one task, run a verification command afterwards, and write an
 * outcome receipt (receipt.json + receipt.md under OUTCOMES_DIR/<mission id>).
 * The spend is only reported as a Verified Outcome Cost when the verification
 * command exits 0.
 *
 * @param {object} options
 * @param {string} options.task - Task description; must be non-blank.
 * @param {string} options.verify - Shell command whose exit code is the oracle; must be non-blank.
 * @param {string[]} options.command - Agent argv handed to runMission; must be a non-empty array.
 * @param {string} [options.label] - Mission label; "outcome" when omitted.
 * @param {boolean} [options.mock=false] - Forwarded to runMission unchanged.
 * @param {boolean} [options.guard=false] - Freeze a task contract before the run and grade verification integrity after it.
 * @param {string[]} [options.protect=[]] - Extra protected paths recorded in the task contract.
 * @param {string[]} [options.allow=[]] - Allowed paths recorded in the task contract.
 * @param {object|null} [options.policy=null] - Policy bundle; `policy.policy` is graded by evaluatePolicyVerdict. Forces guard on.
 * @param {number|string|null} [options.capUsd=null] - Per-mission hard cap in USD; written to AIM_DAILY_BUDGET_USD for the duration of the run.
 * @returns {Promise<{id: *, receipt: object, summary: string}>} Mission id, the receipt object, and its rendered text.
 * @throws {Error} When task, verify, command, or capUsd is invalid.
 */
export async function runOutcome({ task, verify, command, label, mock = false, guard = false, protect = [], allow = [], policy = null, capUsd = null }) {
  if (!task || !task.trim()) throw new Error("runOutcome: a --task description is required.");
  if (!verify || !verify.trim()) throw new Error("runOutcome: a --verify command is required (e.g. \"npm test && npm run build\").");
  if (!Array.isArray(command) || command.length === 0) throw new Error("runOutcome: an agent command after `--` is required.");
  // Reject an unusable cap up front. Without this, String(capUsd) would put a
  // value such as "NaN" or "" into the budget env var.
  // NOTE(review): presumably the gateway cannot enforce such a value as a
  // ceiling - confirm against readBudget().
  if (capUsd != null) {
    const capValue = typeof capUsd === "string" && capUsd.trim() === "" ? Number.NaN : Number(capUsd);
    if (!Number.isFinite(capValue) || capValue < 0) {
      throw new Error(`runOutcome: capUsd must be a non-negative number (got ${String(capUsd)}).`);
    }
  }
  // A policy-bound mission is always guarded: the verdict leans on the integrity
  // grade, so trusting an unguarded verifier would let a tampered pass score PASS.
  if (policy) guard = true;
  await ensureStore();
  await mkdir(OUTCOMES_DIR, { recursive: true });

  // Per-mission hard cap: override the gateway's budget env for the duration of
  // THIS run so only this mission's spend counts against capUsd, then restore.
  const prevBudgetEnv = process.env.AIM_DAILY_BUDGET_USD;
  const prevWindowEnv = process.env.AIM_BUDGET_WINDOW;
  if (capUsd != null) {
    process.env.AIM_DAILY_BUDGET_USD = String(capUsd);
    process.env.AIM_BUDGET_WINDOW = "session";
  }

  // The actual run. Declared as a closure so it sees the validated arguments
  // and runs while the env override above is in force.
  async function runOutcomeInner() {
    const windowMs = budgetWindowMs();
    const spentBefore = (await readGatewaySummary({ windowMs })).estimatedCostUsd;
    const cap = readBudget();
    // Snapshot the ledger length BEFORE the run so events are attributed to this
    // run by log position, not wall clock.
    const eventCountBefore = (await readGatewayEvents()).length;

    // Guard: freeze a Task Contract BEFORE the agent touches anything.
    const guardCwd = process.cwd();
    const contract = guard ? await freezeTaskContract({ cwd: guardCwd, verify, protect, allow }) : null;

    // 1. Run the agent through the cap gateway so the spend is real and recorded.
    const mission = await runMission({ command, label: label ?? "outcome", autoGateway: true, mock });
    const missionRecord = await readMission(mission.id);

    // 2. Cost actually spent on this run, measured from the gateway ledger delta.
    const summaryAfter = await readGatewaySummary({ windowMs });
    const spentAfter = summaryAfter.estimatedCostUsd;
    const actualCostUsd = Number(Math.max(0, spentAfter - spentBefore).toFixed(6));
    // Models/calls the run actually hit: exactly the events appended during it.
    const runEvents = (await readGatewayEvents()).slice(eventCountBefore);
    const models = [...new Set(runEvents.map((e) => e.model).filter((m) => m && m !== "unknown"))];
    const llmCalls = runEvents.filter((e) => e.status >= 200 && e.status < 300 && e.usage).length;
    // A budget_guard 429 means the gateway blocked a call to stay under the cap.
    const budgetGuardTripped = runEvents.some((e) => e.status === 429 && e.truth === "budget_guard");
    const costTruth = llmCalls > 0
      ? "calculated_from_provider_usage_and_sourced_price_table"
      : "no_llm_calls_through_gateway";

    // 3. Verify: run the user's verification command. Its exit code is the oracle.
    const verifyResult = await runShell(verify, missionRecord.cwd);
    const verifyPassed = verifyResult.exitCode === 0;

    // 4. Record whether the agent changed anything. This is reported on the
    //    receipt (producedDiff); it does not by itself change `outcome`.
    const changedFiles = missionRecord.diffEvidence.changedFiles;
    const producedDiff = changedFiles.length > 0;
    const outcome = verifyPassed ? "VERIFIED" : "UNVERIFIED";
    const verifiedOutcomeCostUsd = verifyPassed ? actualCostUsd : null;

    // 5. Guard: re-check the frozen contract and grade the verification's
    //    trustworthiness instead of reporting a binary pass.
    const integrity = guard
      ? await checkVerificationIntegrity({ contract, cwd: missionRecord.cwd, changedFiles, verifyPassed, verify })
      : null;

    const receipt = {
      schema: policy ? "runcap.outcome-receipt/v0.3" : (guard ? "runcap.outcome-receipt/v0.2" : "runcap.outcome-receipt/v0.1"),
      id: mission.id,
      generatedAt: new Date().toISOString(),
      task,
      agent: { command, program: command[0] },
      models,
      verify: {
        command: verify,
        exitCode: verifyResult.exitCode,
        passed: verifyPassed,
        truth: "observed_exit_code"
      },
      cost: {
        plannedCapUsd: cap,
        actualCostUsd,
        verifiedOutcomeCostUsd,
        moneySpentWithoutVerifiedDeliveryUsd: verifyPassed ? 0 : actualCostUsd,
        llmCalls,
        budgetGuardTripped,
        truth: costTruth
      },
      work: {
        agentExitCode: missionRecord.exitCode,
        agentDurationMs: missionRecord.durationMs,
        verifyDurationMs: verifyResult.durationMs,
        changedFiles,
        changedFileCount: changedFiles.length,
        producedDiff,
        retries: { value: null, truth: "not_tracked_v0.1" },
        truth: "observed_from_git_and_exit_code"
      },
      outcome,
      verificationIntegrity: integrity,
      costScope: {
        measured: "observed_llm_calls_through_gateway_only",
        note: "Verified Outcome Cost is the LLM spend that bought the result. It does NOT include subscriptions, CI minutes, sandbox compute, or human review time. For full agent economics, divide total spend across N attempts by strongly-verified outcomes (Expected Verified Outcome Cost, needs N>=5)."
      },
      missionReport: path.join(MISSIONS_DIR, mission.id, "report.md")
    };

    // Policy-bound mission: stamp the receipt with the policy metadata and the
    // verdict graded from the receipt built so far.
    if (policy) {
      receipt.policy = { ...policyMeta(policy), ...evaluatePolicyVerdict(receipt, policy.policy) };
    }

    // Render once; the same text is written to disk and returned to the caller.
    const summary = formatOutcomeReceipt(receipt);

    const outcomeDir = path.join(OUTCOMES_DIR, mission.id);
    await mkdir(outcomeDir, { recursive: true });
    await writeFile(path.join(outcomeDir, "receipt.json"), JSON.stringify(receipt, null, 2));
    await writeFile(path.join(outcomeDir, "receipt.md"), summary);
    await writeFile(path.join(OUTCOMES_DIR, "latest"), mission.id);

    return { id: mission.id, receipt, summary };
  }

  try {
    return await runOutcomeInner();
  } finally {
    // Restore the caller's budget env exactly, including "was unset".
    if (capUsd != null) {
      if (prevBudgetEnv === undefined) delete process.env.AIM_DAILY_BUDGET_USD;
      else process.env.AIM_DAILY_BUDGET_USD = prevBudgetEnv;
      if (prevWindowEnv === undefined) delete process.env.AIM_BUDGET_WINDOW;
      else process.env.AIM_BUDGET_WINDOW = prevWindowEnv;
    }
  }
}
|
|
301
|
+
|
|
302
|
+
// Run a verification command string through the shell so operators can write
|
|
303
|
+
// natural pipelines like "npm test && npm run build". Output streams live.
|
|
304
|
+
/**
 * Run a command string through the platform shell (`cmd /c` on win32, `sh -c`
 * elsewhere), echoing its output live while also capturing it.
 *
 * @param {string} commandString - Shell command line to execute.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number, durationMs: number}>}
 *   Never rejects: a spawn error resolves with exitCode 127 and the error
 *   message appended to stderr; a close without an exit code resolves with 1.
 */
async function runShell(commandString, cwd) {
  const started = Date.now();
  const onWindows = process.platform === "win32";
  const shell = onWindows ? "cmd" : "sh";
  const shellArgs = onWindows ? ["/c", commandString] : ["-c", commandString];
  return await new Promise((resolve) => {
    const child = spawn(shell, shellArgs, { cwd, env: { ...process.env, AIM_WRAPPED: "1" }, shell: false });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level so a multi-byte UTF-8 character split across
    // two chunks is reassembled instead of turning into replacement characters.
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (text) => { stdout += text; process.stdout.write(text); });
    child.stderr?.on("data", (text) => { stderr += text; process.stderr.write(text); });
    child.on("error", (error) => resolve({ stdout, stderr: stderr + `\n${error.message}`, exitCode: 127, durationMs: Date.now() - started }));
    child.on("close", (exitCode) => resolve({ stdout, stderr, exitCode: exitCode ?? 1, durationMs: Date.now() - started }));
  });
}
|
|
318
|
+
|
|
319
|
+
// --- Verification Integrity (runcap outcome --guard) ---------------------
|
|
320
|
+
// The honest hole in outcome v0.1: it trusts the verifier. An agent can turn a
|
|
321
|
+
// test green without doing the work - delete the test, rewrite the assertion,
|
|
322
|
+
// repoint the `npm test` script, disable strict mode, mock the real API. The
|
|
323
|
+
// guard freezes a contract before the run and re-checks it after, so a pass is
|
|
324
|
+
// graded on a 4-level trust scale instead of a binary VERIFIED.
|
|
325
|
+
|
|
326
|
+
// Path patterns that are protected by default. Despite the name these are
// regular expressions, tested against repo-relative paths with "/" separators.
const DEFAULT_PROTECTED_GLOBS = [
  // Test sources: *.test.* / *.spec.* with js, ts, jsx, tsx and m/c variants.
  /(^|\/)[^/]*\.test\.[mc]?[jt]sx?$/,
  /(^|\/)[^/]*\.spec\.[mc]?[jt]sx?$/,

  // Anything inside a __tests__/, test/ or tests/ directory.
  /(^|\/)__tests__\//,
  /(^|\/)tests?\//,

  // Manifests and compiler configuration.
  /(^|\/)package\.json$/,
  /(^|\/)tsconfig[^/]*\.json$/,

  // Test-runner configuration files.
  /(^|\/)jest\.config\./,
  /(^|\/)vitest\.config\./
];
|
|
336
|
+
|
|
337
|
+
// Pull the concrete file paths a verify command names so we can hash them and
|
|
338
|
+
// detect edits. We can't statically parse an arbitrary shell pipeline, so we
|
|
339
|
+
// take a deliberately simple, honest approach: any whitespace token that looks
|
|
340
|
+
// like a path to an existing file is treated as a verifier file.
|
|
341
|
+
/**
 * Collect the existing paths a verify command names, so they can be hashed.
 *
 * This is a heuristic, not a shell parser. Each whitespace-separated token is
 * tried as-is, and is also split on the shell operators `; & | ( )` so that
 * files glued to an operator (`node a.js&&node b.js;`) are still found. One
 * surrounding quote on each side is stripped. A candidate is kept when it
 * contains "." or "/" and exists on disk.
 *
 * NOTE(review): existsSync is also true for directories, so a token such as
 * "." or "tests/" is returned too.
 *
 * @param {string} verify - The verification command line.
 * @param {string} cwd - Directory that relative tokens are resolved against.
 * @returns {string[]} De-duplicated paths (absolute tokens as given, relative
 *   ones joined onto cwd), in first-seen order.
 */
function verifierFilesFrom(verify, cwd) {
  const files = [];
  const consider = (candidate) => {
    const tok = candidate.replace(/^["']|["']$/g, "");
    if (!/[./]/.test(tok)) return;
    const abs = path.isAbsolute(tok) ? tok : path.join(cwd, tok);
    if (existsSync(abs) && !files.includes(abs)) files.push(abs);
  };
  for (const raw of verify.split(/\s+/).filter(Boolean)) {
    // The whole token first, so every path the whitespace-only tokenizer
    // recognised is still recognised, in the same order.
    consider(raw);
    for (const piece of raw.split(/[;&|()]+/)) {
      if (piece && piece !== raw) consider(piece);
    }
  }
  return files;
}
|
|
352
|
+
|
|
353
|
+
/**
 * SHA-256 of a file's bytes as a hex string.
 *
 * @param {string} absPath - File to read.
 * @returns {string|null} Hex digest, or null when hashing throws (for example
 *   the path is missing or cannot be read as a file).
 */
function hashFile(absPath) {
  try {
    const digest = createHash("sha256");
    digest.update(readFileSync(absPath));
    return digest.digest("hex");
  } catch {
    return null;
  }
}
|
|
360
|
+
|
|
361
|
+
/**
 * Read the `scripts` table of `<cwd>/package.json`.
 *
 * @param {string} cwd - Directory expected to contain package.json.
 * @returns {object|null} null when package.json is absent, unreadable, or
 *   empty; the `scripts` value when it is truthy; otherwise an empty object.
 */
function packageScriptsOf(cwd) {
  const manifestText = readOptionalSync(path.join(cwd, "package.json"));
  if (!manifestText) return null;

  const manifest = safeJson(manifestText);
  if (manifest && manifest.scripts) return manifest.scripts;
  return {};
}
|
|
367
|
+
|
|
368
|
+
/**
 * Read a file as UTF-8 text, treating any read failure as "not there".
 *
 * @param {string} file - Path to read.
 * @returns {string|null} File contents, or null when the read throws.
 */
function readOptionalSync(file) {
  let text = null;
  try {
    text = readFileSync(file, "utf8");
  } catch {
    // Absent or unreadable: the caller gets null.
  }
  return text;
}
|
|
375
|
+
|
|
376
|
+
/**
 * Capture the state the verification depends on before the agent runs: the
 * HEAD commit, hashes of the files the verify command names, the package.json
 * scripts, and whether the verify command already passes on the current tree.
 *
 * @param {object} options
 * @param {string} options.cwd - Directory to inspect and run the baseline verify in.
 * @param {string} options.verify - Verification command line.
 * @param {string[]} options.protect - Extra protected paths, recorded as given.
 * @param {string[]} options.allow - Allowed paths, recorded as given.
 * @returns {Promise<object>} A "runcap.task-contract/v0.1" record.
 */
async function freezeTaskContract({ cwd, verify, protect, allow }) {
  const headResult = await git(["rev-parse", "HEAD"], cwd);
  // No usable HEAD (git reported an error) is recorded as null.
  const baselineCommit = headResult.error ? null : headResult.text;

  const verifierFiles = [];
  for (const abs of verifierFilesFrom(verify, cwd)) {
    verifierFiles.push({ path: path.relative(cwd, abs), sha256: hashFile(abs) });
  }

  const packageScripts = packageScriptsOf(cwd);

  // Run verify before the agent moves: a command that already passes on the
  // baseline tree cannot show that the agent accomplished anything.
  const { exitCode: baselineExit } = await runShell(verify, cwd);

  return {
    schema: "runcap.task-contract/v0.1",
    frozenAt: new Date().toISOString(),
    cwd,
    baselineCommit,
    verifyCommand: verify,
    verifierFiles,
    packageScripts,
    protectedPaths: protect,
    allowedPaths: allow,
    baselineVerify: { exitCode: baselineExit, passed: baselineExit === 0 }
  };
}
|
|
402
|
+
|
|
403
|
+
/**
 * Does a repo-relative path equal a configured entry or sit underneath it?
 *
 * Leading "./" segments on the entry are ignored, so "./src" behaves like
 * "src"; changed paths are compared as given and are expected not to carry
 * that prefix. An entry of "." or "./" names the repo root and matches every
 * path. An empty-string entry keeps its previous behaviour (it only matches an
 * empty path or one starting with "/").
 *
 * @param {string} relPath - Repo-relative path using "/" separators.
 * @param {string} entry - A configured file or directory path.
 * @returns {boolean}
 */
function matchesPathEntry(relPath, entry) {
  const prefix = entry.replace(/^(?:\.\/)+/, "");
  if (prefix === "." || (prefix === "" && entry !== "")) return true;
  // Append exactly one trailing slash so "src" does not match "srcx/a.js".
  return relPath === prefix || relPath.startsWith(prefix.replace(/\/?$/, "/"));
}

/**
 * Is this path protected, either by a caller-supplied entry or by one of the
 * DEFAULT_PROTECTED_GLOBS patterns?
 *
 * @param {string} relPath - Repo-relative changed path.
 * @param {string[]} extraProtected - Additional protected files/directories.
 * @returns {boolean}
 */
function isProtected(relPath, extraProtected) {
  if (extraProtected.some((p) => matchesPathEntry(relPath, p))) return true;
  return DEFAULT_PROTECTED_GLOBS.some((re) => re.test(relPath));
}

/**
 * Is this path inside the allowed scope? An empty allow-list means no scope
 * restriction, so every path is allowed.
 *
 * @param {string} relPath - Repo-relative changed path.
 * @param {string[]} allowed - Allowed files/directories.
 * @returns {boolean}
 */
function withinAllowed(relPath, allowed) {
  if (allowed.length === 0) return true;
  return allowed.some((a) => matchesPathEntry(relPath, a));
}
|
|
412
|
+
|
|
413
|
+
/**
 * Re-check a frozen task contract after the agent has run and grade how far
 * the verification result can be trusted.
 *
 * Status is one of:
 *  - "UNVERIFIED": verification did not pass;
 *  - "VERIFIER_COMPROMISED": it passed, but a verifier file, a protected path,
 *    or the package.json scripts changed;
 *  - "VERIFIED_STRONG": it passed and no check failed;
 *  - "VERIFIED_WEAK": it passed but at least one other check failed.
 *
 * @param {object} options
 * @param {object} options.contract - Record produced by freezeTaskContract.
 * @param {string} options.cwd - Directory the agent worked in.
 * @param {string[]} options.changedFiles - Repo-relative paths the agent changed.
 * @param {boolean} options.verifyPassed - Whether the verify command exited 0.
 * @param {string} options.verify - The verify command (re-run in the clean checkout).
 * @returns {Promise<object>} A "runcap.verification-integrity/v0.1" record with
 *   status, reason, contract summary, cleanRoom result, checks, and violations.
 */
async function checkVerificationIntegrity({ contract, cwd, changedFiles, verifyPassed, verify }) {
  const violations = [];
  const checks = [];
  // Every check is logged; a failed one is additionally listed as a violation.
  const record = (id, ok, detail) => { checks.push({ id, ok, detail, truth: "calculated_from_observed_state" }); if (!ok) violations.push(id); };

  // 1. Verifier files unchanged (hash match).
  for (const vf of contract.verifierFiles) {
    const now = hashFile(path.join(cwd, vf.path));
    // A file that could not be hashed at freeze time has nothing to compare
    // against; it is recorded as a passing, informational check.
    if (vf.sha256 === null) { record(`verifier_file_unreadable:${vf.path}`, true, "could not hash at freeze time"); continue; }
    record(`verifier_file_unchanged:${vf.path}`, now === vf.sha256, now === null ? "verifier file deleted after run" : (now === vf.sha256 ? "hash matches" : "verifier file edited after run"));
  }

  // 2. package.json scripts unchanged (can't repoint `npm test` at `true`).
  if (contract.packageScripts) {
    const after = packageScriptsOf(cwd);
    const same = JSON.stringify(after) === JSON.stringify(contract.packageScripts);
    record("package_scripts_unchanged", same, same ? "scripts identical" : "package.json scripts changed during run");
  }

  // 3. No protected/test path touched, and changes stay within allowed scope.
  for (const f of changedFiles) {
    if (isProtected(f, contract.protectedPaths)) {
      record(`protected_path_untouched:${f}`, false, "agent modified a protected/test/config path");
    }
    if (!withinAllowed(f, contract.allowedPaths)) {
      record(`within_allowed_scope:${f}`, false, "change is outside the allowed paths");
    }
  }

  // 4. The task actually failed before the run (otherwise a pass is meaningless).
  const baseFailed = contract.baselineVerify && contract.baselineVerify.passed === false;
  record("baseline_failed_before_run", !!baseFailed, baseFailed ? "verify failed on the baseline tree (the task was real)" : "verify already passed before the agent ran - the pass proves nothing");

  // 5. Re-run verify from the baseline commit in a clean checkout plus the
  //    agent's changed files. Only attempted when verification passed and a
  //    baseline commit exists; the default detail names the real skip reason.
  // NOTE(review): when there is no baseline commit no check is recorded, so
  // the grade can still be VERIFIED_STRONG without a clean-checkout re-run.
  let cleanRoom = {
    ran: false,
    passed: null,
    detail: verifyPassed ? "skipped (no baseline commit)" : "skipped (verification did not pass)"
  };
  if (verifyPassed && contract.baselineCommit) {
    cleanRoom = await verifyInCleanWorktree({ cwd, baselineCommit: contract.baselineCommit, verify, changedFiles });
    record("verify_survives_clean_checkout", cleanRoom.passed === true, cleanRoom.detail);
  }

  // Grade. Tampering with the verifier is categorically worse than a weak pass:
  // it means the green light itself is untrustworthy.
  const verifierTampered = checks.some((c) => !c.ok && (c.id.startsWith("verifier_file_unchanged:") || c.id.startsWith("protected_path_untouched:") || c.id === "package_scripts_unchanged"));

  let status;
  if (!verifyPassed) {
    status = "UNVERIFIED";
  } else if (verifierTampered) {
    status = "VERIFIER_COMPROMISED";
  } else if (violations.length === 0) {
    status = "VERIFIED_STRONG";
  } else {
    status = "VERIFIED_WEAK";
  }

  const reason = {
    UNVERIFIED: "Verification did not pass.",
    VERIFIER_COMPROMISED: "Verification passed, but the verifier itself was modified during the run. The green light cannot be trusted.",
    VERIFIED_STRONG: "Verification passed and the verifier was untouched: tests/scripts unchanged, changes in scope, the task really failed before, and the pass survives a clean checkout.",
    VERIFIED_WEAK: "Verification passed and the verifier was untouched, but at least one strong condition was not met (e.g. baseline failure not reproduced, or pass not reproduced in a clean checkout)."
  }[status];

  return {
    schema: "runcap.verification-integrity/v0.1",
    status,
    reason,
    contract: {
      baselineCommit: contract.baselineCommit,
      verifierFiles: contract.verifierFiles.map((f) => f.path),
      protectedPaths: contract.protectedPaths,
      allowedPaths: contract.allowedPaths,
      baselineVerifyPassed: contract.baselineVerify ? contract.baselineVerify.passed : null
    },
    cleanRoom,
    checks,
    violations
  };
}
|
|
493
|
+
|
|
494
|
+
// Re-run the verify command from the baseline commit in a throwaway worktree,
|
|
495
|
+
// then copy in only the agent's changed files. If the pass came from real edits
|
|
496
|
+
// to allowed files it survives; if it came from uncommitted local junk it dies.
|
|
497
|
+
/**
 * Re-run the verify command in a throwaway git worktree checked out at the
 * baseline commit, with only the agent's changed files applied on top.
 *
 * @param {object} options
 * @param {string} options.cwd - The agent's working directory (a git checkout).
 * @param {string} options.baselineCommit - Commit to check out in the worktree.
 * @param {string} options.verify - Verification command line to run there.
 * @param {string[]} options.changedFiles - Paths relative to cwd to carry over.
 * @returns {Promise<{ran: boolean, passed: (boolean|null), detail: string}>}
 *   `ran: false` when the worktree could not be created.
 */
async function verifyInCleanWorktree({ cwd, baselineCommit, verify, changedFiles }) {
  // Absolute path: git runs with `cwd` as its working directory while mkdir,
  // writeFile and the verify shell resolve relative paths against the process
  // cwd. A relative worktree path would point at two different places whenever
  // those differ.
  const tmpBase = path.resolve(STORE_DIR, "cleanroom");
  await mkdir(tmpBase, { recursive: true });
  const wt = path.join(tmpBase, `wt-${createHash("sha1").update(`${baselineCommit}${Date.now()}${Math.random()}`).digest("hex").slice(0, 8)}`);
  const add = await git(["worktree", "add", "--detach", wt, baselineCommit], cwd);
  if (add.error) {
    return { ran: false, passed: null, detail: `clean-worktree check skipped: ${add.error}` };
  }
  try {
    // Bring the agent's changed files into the clean baseline so we test the
    // work, not the agent's whole dirty tree.
    for (const rel of changedFiles) {
      const src = path.join(cwd, rel);
      const dst = path.join(wt, rel);
      try {
        if (!existsSync(src)) {
          // The agent deleted this path: mirror the deletion so the clean room
          // reflects the change. --ignore-unmatch keeps this a no-op for paths
          // the baseline never tracked.
          await git(["rm", "-f", "--quiet", "--ignore-unmatch", "--", rel], wt);
          continue;
        }
        await mkdir(path.dirname(dst), { recursive: true });
        await writeFile(dst, await readFile(src));
      } catch { /* best effort: a path that cannot be copied keeps its baseline version */ }
    }
    const result = await runShell(verify, wt);
    return {
      ran: true,
      passed: result.exitCode === 0,
      detail: result.exitCode === 0
        ? "pass reproduced from baseline + changed files in a clean checkout"
        : "pass did NOT reproduce in a clean checkout (green depended on uncommitted local state)"
    };
  } finally {
    // Always discard the worktree, even if verify or a copy step threw.
    await git(["worktree", "remove", "--force", wt], cwd);
  }
}
|
|
528
|
+
|
|
529
|
+
/**
 * Render an outcome receipt object as the plain-text receipt.
 *
 * @param {object} r - Receipt as built by runOutcome (task, agent, models,
 *   cost, work, verify, outcome, optional verificationIntegrity and policy).
 * @returns {string} Newline-joined receipt text ending with a newline.
 */
function formatOutcomeReceipt(r) {
  // Money formatter that tolerates absent amounts.
  const usd = (n) => {
    if (n === null || n === undefined) return "n/a";
    return fmtUsd(n);
  };
  const seconds = (ms) => (ms / 1000).toFixed(1);

  const modelList = r.models.length ? r.models.join(", ") : "none (no LLM calls through gateway)";
  const plannedCap = r.cost.plannedCapUsd === null ? "no cap set" : usd(r.cost.plannedCapUsd);
  const changedSuffix = r.work.changedFileCount ? " (" + r.work.changedFiles.join(", ") + ")" : "";
  const verifyWord = r.verify.passed ? "PASSED" : "FAILED";

  const lines = [];
  lines.push("Runcap outcome receipt");
  lines.push("======================");
  lines.push(`Task: ${r.task}`);
  lines.push(`Agent: ${r.agent.command.join(" ")}`);
  lines.push(`Model(s): ${modelList}`);
  lines.push(`Planned cap: ${plannedCap}`);
  lines.push(`Actual cost: ${usd(r.cost.actualCostUsd)} (${r.cost.llmCalls} priced LLM call(s), truth: ${r.cost.truth})`);
  lines.push(`Agent runtime: ${seconds(r.work.agentDurationMs)}s exit ${r.work.agentExitCode}`);
  lines.push(`Verify runtime: ${seconds(r.work.verifyDurationMs)}s`);
  lines.push(`Retries: not tracked (v0.1)`);
  lines.push(`Changed files: ${r.work.changedFileCount}${changedSuffix}`);
  lines.push(`Verification: \`${r.verify.command}\``);
  lines.push(`Verify result: ${verifyWord} (exit ${r.verify.exitCode}, truth: observed)`);
  lines.push("");
  lines.push(`Outcome: ${r.outcome}`);

  // Cost section: only a VERIFIED outcome gets a Verified Outcome Cost.
  if (r.outcome !== "VERIFIED") {
    lines.push(`Verified Outcome Cost: N/A (verification did not pass)`);
    lines.push(`Money spent without verified delivery: ${usd(r.cost.moneySpentWithoutVerifiedDeliveryUsd)}`);
  } else {
    lines.push(`Verified Outcome Cost: ${usd(r.cost.verifiedOutcomeCostUsd)} (money that bought a verified result)`);
  }

  // Integrity section: present only for guarded runs.
  const vi = r.verificationIntegrity;
  if (vi) {
    lines.push("", `Verification integrity: ${vi.status}`, ` ${vi.reason}`);
    if (vi.violations.length) {
      lines.push(` Failed checks (${vi.violations.length}):`);
      vi.checks
        .filter((check) => !check.ok)
        .forEach((check) => { lines.push(` - ${check.id}: ${check.detail}`); });
    }
    if (vi.cleanRoom && vi.cleanRoom.ran) {
      const cleanWord = vi.cleanRoom.passed ? "PASSED" : "FAILED";
      lines.push(` Clean-checkout re-verify: ${cleanWord} (${vi.cleanRoom.detail})`);
    }
  }

  // Policy section: present only for policy-bound missions.
  if (r.policy) {
    lines.push("");
    lines.push(...formatPolicyBlock(r.policy));
  }

  return lines.join("\n") + "\n";
}
|
|
571
|
+
|
|
572
|
+
/**
 * Read the id stored in the "latest" pointer file under OUTCOMES_DIR.
 *
 * @returns {Promise<string|null>} The trimmed file contents, or null when the
 *   pointer cannot be read.
 */
export async function latestOutcomeId() {
  try {
    const pointer = path.join(OUTCOMES_DIR, "latest");
    const contents = await readFile(pointer, "utf8");
    return contents.trim();
  } catch {
    return null;
  }
}
|
|
579
|
+
|
|
580
|
+
/**
 * Load the rendered receipt (receipt.md) for an outcome.
 *
 * @param {string} id - Outcome id, used as the directory name under OUTCOMES_DIR.
 * @returns {Promise<string>} The receipt text; rejects if the file cannot be read.
 */
export async function renderOutcome(id) {
  const receiptPath = path.join(OUTCOMES_DIR, id, "receipt.md");
  const markdown = await readFile(receiptPath, "utf8");
  return markdown;
}
|
|
584
|
+
|
|
150
585
|
export async function latestMissionId() {
|
|
151
586
|
try {
|
|
152
587
|
return (await readFile(path.join(STORE_DIR, "latest"), "utf8")).trim();
|
|
@@ -1599,7 +2034,12 @@ const MODEL_PRICES = [
|
|
|
1599
2034
|
{ match: ["gpt-4.1-mini"], inputPerMillion: 0.4, outputPerMillion: 1.6, cacheReadPerMillion: 0.1, provider: "openai" },
|
|
1600
2035
|
{ match: ["gpt-4.1"], inputPerMillion: 2, outputPerMillion: 8, cacheReadPerMillion: 0.5, provider: "openai" },
|
|
1601
2036
|
{ match: ["gpt-4o-mini"], inputPerMillion: 0.15, outputPerMillion: 0.6, cacheReadPerMillion: 0.075, provider: "openai" },
|
|
1602
|
-
{ match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" }
|
|
2037
|
+
{ match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" },
|
|
2038
|
+
// DeepSeek (api-docs.deepseek.com/quick_start/pricing). OpenAI-compatible API,
|
|
2039
|
+
// so the same gateway prices and caps it with no extra setup. deepseek-chat /
|
|
2040
|
+
// deepseek-reasoner are the non-thinking / thinking modes of deepseek-v4-flash.
|
|
2041
|
+
{ match: ["deepseek-v4-pro"], inputPerMillion: 0.435, outputPerMillion: 0.87, cacheReadPerMillion: 0.003625, provider: "deepseek" },
|
|
2042
|
+
{ match: ["deepseek-v4-flash", "deepseek-chat", "deepseek-reasoner", "deepseek"], inputPerMillion: 0.14, outputPerMillion: 0.28, cacheReadPerMillion: 0.0028, provider: "deepseek" }
|
|
1603
2043
|
];
|
|
1604
2044
|
|
|
1605
2045
|
function estimateApiCost(usage, model) {
|
package/src/policy.mjs
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
// Policy-bound missions (runcap mission / policy / ci).
|
|
2
|
+
//
|
|
3
|
+
// A mission's rules are declared once in `.runcap/mission.yaml` (or .yml/.json),
|
|
4
|
+
// enforced during the run by the existing gateway cap + verification guard, and
|
|
5
|
+
// graded into a PASS/BLOCKED verdict a GitHub Action turns into a red/green PR
|
|
6
|
+
// check. This module is deliberately pure: it parses the policy, validates it,
|
|
7
|
+
// and grades an ALREADY-BUILT outcome receipt against it. It imports only
|
|
8
|
+
// js-yaml + node builtins and never imports mission-control, so there is no
|
|
9
|
+
// import cycle (mission-control imports FROM here, one direction).
|
|
10
|
+
|
|
11
|
+
import { createHash } from "node:crypto";
|
|
12
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
13
|
+
import path from "node:path";
|
|
14
|
+
import yaml from "js-yaml";
|
|
15
|
+
|
|
16
|
+
const POLICY_FILENAMES = ["mission.yaml", "mission.yml", "mission.json"];
|
|
17
|
+
|
|
18
|
+
// Find and parse the policy. Precedence: an explicit path, else the first of
|
|
19
|
+
// .runcap/mission.{yaml,yml,json} that exists. Returns null when none is found
|
|
20
|
+
// so callers can decide whether a missing policy is an error.
|
|
21
|
+
/**
 * Find and parse the mission policy.
 *
 * Precedence: an explicit path (absolute, or relative to cwd), else the first
 * of `.runcap/mission.yaml`, `.runcap/mission.yml`, `.runcap/mission.json`
 * that exists under cwd.
 *
 * @param {string} [cwd=process.cwd()] - Directory to resolve paths against.
 * @param {string} [explicitPath] - Policy file to load instead of searching.
 * @returns {{policy: object, raw: string, hash: string, source: string}|null}
 *   The parsed policy, its exact text, the SHA-256 hex digest of that text, and
 *   the path it was read from; null when no policy file is found.
 * @throws {Error} When explicitPath does not exist, or the file does not parse
 *   to a plain (non-array) object. Parser errors propagate unchanged.
 */
export function loadPolicy(cwd = process.cwd(), explicitPath) {
  let source = null;
  if (explicitPath) {
    source = path.isAbsolute(explicitPath) ? explicitPath : path.join(cwd, explicitPath);
    if (!existsSync(source)) throw new Error(`Policy file not found: ${explicitPath}`);
  } else {
    for (const name of POLICY_FILENAMES) {
      const candidate = path.join(cwd, ".runcap", name);
      if (existsSync(candidate)) { source = candidate; break; }
    }
  }
  if (!source) return null;

  const raw = readFileSync(source, "utf8");
  let policy;
  if (source.endsWith(".json")) {
    // .json fallback uses native JSON.parse so the zero-config path needs no parser.
    policy = JSON.parse(raw);
  } else {
    // NOTE(review): this relies on the installed js-yaml's `load` using a safe
    // schema by default - confirm the pinned major version.
    policy = yaml.load(raw);
  }
  // Arrays are typeof "object" too, but a top-level list is not a policy.
  if (!policy || typeof policy !== "object" || Array.isArray(policy)) {
    throw new Error(`Policy file ${path.basename(source)} did not parse to an object.`);
  }
  return {
    policy,
    raw,
    // Hash of the exact file text that was parsed.
    hash: createHash("sha256").update(raw).digest("hex"),
    source
  };
}
|
|
52
|
+
|
|
53
|
+
// Validate the policy shape. Errors block the mission; warnings are advisory.
|
|
54
|
+
/**
 * Check the shape of a parsed policy.
 *
 * @param {object} policy - Parsed policy document.
 * @returns {{ok: boolean, errors: string[], warnings: string[]}} `ok` is true
 *   exactly when there are no errors; warnings never affect `ok`.
 */
export function validatePolicy(policy) {
  const errors = [];
  const warnings = [];
  if (!policy || typeof policy !== "object") {
    return { ok: false, errors: ["Policy is empty or not an object."], warnings };
  }

  // Number.isFinite is false for every non-number, so this also rejects
  // numeric strings.
  const isPositiveNumber = (value) => Number.isFinite(value) && value > 0;
  const isBlank = (value) => !value || !String(value).trim();

  const mission = policy.mission ?? {};
  const verification = policy.verification ?? {};
  const budget = policy.budget ?? {};
  const identity = policy.identity ?? {};

  // --- Errors (block the mission) -----------------------------------------
  if (policy.version !== "v1") {
    errors.push(`version must be "v1" (got ${JSON.stringify(policy.version)}).`);
  }
  if (isBlank(mission.name)) {
    errors.push("mission.name is required.");
  }
  if (isBlank(verification.command)) {
    errors.push("verification.command is required.");
  }
  const guardIsKnown = ["strict", "off"].includes(verification.guard);
  if (verification.guard && !guardIsKnown) {
    errors.push(`verification.guard must be "strict" or "off" (got ${JSON.stringify(verification.guard)}).`);
  }
  if (!isPositiveNumber(budget.mission_hard_limit_usd)) {
    errors.push("budget.mission_hard_limit_usd is required and must be a positive number.");
  }
  // Optional budget knobs: only checked when present.
  for (const key of ["max_llm_calls", "max_runtime_minutes"]) {
    if (budget[key] !== undefined && !isPositiveNumber(budget[key])) {
      errors.push(`budget.${key}, when set, must be a positive number.`);
    }
  }

  // --- Warnings (advisory) ------------------------------------------------
  if (!identity.project && !identity.team) {
    warnings.push("identity has no project or team - the receipt will not carry org attribution.");
  }
  const hasAllowList = Array.isArray(verification.allow) && verification.allow.length > 0;
  if (!hasAllowList) {
    warnings.push("verification.allow is empty - any changed path passes the scope check. Declare the paths a legitimate fix should touch.");
  }
  if (verification.guard === "off") {
    warnings.push("verification.guard is off - a tampered verifier will NOT be caught.");
  }

  return { ok: errors.length === 0, errors, warnings };
}
|
|
103
|
+
|
|
104
|
+
/**
 * Grade a finished outcome receipt against the mission policy. Pure: no I/O.
 *
 * BLOCKED is the conservative verdict: one failing condition is enough, so a
 * reviewer never has to read past the verdict to know the run is unsafe.
 * Reasons are collected in a fixed order: verifier tampering, failed
 * verification, scope, spend, budget guard, call count, runtime.
 *
 * @param {object} receipt - Reads `verificationIntegrity.{status,violations}`,
 *   `outcome`, `cost.{actualCostUsd,budgetGuardTripped,llmCalls}` and
 *   `work.agentDurationMs`.
 * @param {object} [policy] - Reads `budget.{mission_hard_limit_usd,
 *   max_llm_calls,max_runtime_minutes}`; a missing policy imposes no limits.
 * @returns {{verdict: string, reasons: string[], truth: string}} `verdict` is
 *   "PASS" when no reason was collected, otherwise "BLOCKED".
 */
export function evaluatePolicyVerdict(receipt, policy) {
  const SCOPE_PREFIX = "within_allowed_scope:";
  const rules = policy?.budget ?? {};
  const hardLimitUsd = rules.mission_hard_limit_usd;
  const integrity = receipt.verificationIntegrity;
  const blockers = [];

  // Violations that mean the verification evidence itself was touched.
  const touchesEvidence = (entry) =>
    entry.startsWith("verifier_file_unchanged:") ||
    entry === "package_scripts_unchanged" ||
    entry.startsWith("protected_path_untouched:");

  if (integrity?.status === "VERIFIER_COMPROMISED") {
    const evidence = (integrity.violations ?? []).filter(touchesEvidence);
    const detail = evidence.length > 0 ? ` (${evidence.join(", ")})` : "";
    blockers.push(`VERIFIER_COMPROMISED: the agent changed protected verification evidence${detail}.`);
  }

  if (receipt.outcome === "UNVERIFIED") {
    blockers.push("UNVERIFIED: verification did not pass.");
  }

  // Scope violations are reported regardless of the integrity status.
  const outOfScope = (integrity?.violations ?? [])
    .filter((entry) => entry.startsWith(SCOPE_PREFIX))
    .map((entry) => entry.slice(SCOPE_PREFIX.length));
  if (outOfScope.length > 0) {
    blockers.push(`Out of allowed scope: ${outOfScope.join(", ")}.`);
  }

  const spend = receipt.cost ?? {};
  const overSpent =
    typeof hardLimitUsd === "number" &&
    typeof spend.actualCostUsd === "number" &&
    spend.actualCostUsd > hardLimitUsd;
  if (overSpent) {
    blockers.push(`Over budget: $${spend.actualCostUsd} spent > $${hardLimitUsd} mission hard limit.`);
  }
  if (spend.budgetGuardTripped) {
    blockers.push("Budget guard tripped: the gateway blocked a call to stay under the mission hard limit.");
  }

  const maxCalls = rules.max_llm_calls;
  if (Number.isFinite(maxCalls) && typeof spend.llmCalls === "number" && spend.llmCalls > maxCalls) {
    blockers.push(`Too many LLM calls: ${spend.llmCalls} > max_llm_calls ${maxCalls}.`);
  }

  const effort = receipt.work ?? {};
  const maxMinutes = rules.max_runtime_minutes;
  const ranTooLong =
    Number.isFinite(maxMinutes) &&
    typeof effort.agentDurationMs === "number" &&
    effort.agentDurationMs > maxMinutes * 60_000;
  if (ranTooLong) {
    const seconds = (effort.agentDurationMs / 1000).toFixed(1);
    blockers.push(`Over time budget: ${seconds}s > max_runtime_minutes ${maxMinutes}.`);
  }

  const verdict = blockers.length === 0 ? "PASS" : "BLOCKED";
  return {
    verdict,
    reasons: blockers,
    truth: "calculated_from_policy_and_observed_receipt"
  };
}
|
|
156
|
+
|
|
157
|
+
/**
 * Build the compact policy block embedded in the receipt: who/what, the rules,
 * and the hash carried by the load result, so a reviewer can confirm which
 * rules were in force.
 *
 * @param {{policy?: object, hash?: string, source?: string}} policyResult -
 *   Reads `policy`, `hash` and `source`.
 * @returns {object} `{ schema, hash, source, identity, mission, limits }`.
 *   Absent fields are reported as `null`; `source` is reduced to its base name;
 *   `limits.guard` falls back to "strict".
 */
export function policyMeta(policyResult) {
  const { identity, mission, budget, verification } = policyResult.policy ?? {};
  const orNull = (value) => value ?? null;
  const sourceName = policyResult.source ? path.basename(policyResult.source) : null;

  const who = {
    project: orNull(identity?.project),
    team: orNull(identity?.team),
    cost_center: orNull(identity?.cost_center),
    owner: orNull(identity?.owner)
  };
  const what = {
    name: orNull(mission?.name),
    task_class: orNull(mission?.task_class)
  };
  const limits = {
    mission_hard_limit_usd: orNull(budget?.mission_hard_limit_usd),
    max_llm_calls: orNull(budget?.max_llm_calls),
    max_runtime_minutes: orNull(budget?.max_runtime_minutes),
    guard: verification?.guard ?? "strict"
  };

  return {
    schema: "runcap.policy/v1",
    hash: policyResult.hash,
    source: sourceName,
    identity: who,
    mission: what,
    limits
  };
}
|
|
185
|
+
|
|
186
|
+
/**
 * Render the policy section as text lines for the printed receipt and the PR
 * summary.
 *
 * @param {object} [receiptPolicy] - The `receipt.policy` block (policyMeta
 *   with `verdict` and `reasons` merged in). Reads `identity`, `mission`,
 *   `limits`, `hash`, `verdict` and `reasons`.
 * @returns {string[]} One string per output line; empty when no policy block
 *   is given.
 */
export function formatPolicyBlock(receiptPolicy) {
  if (!receiptPolicy) {
    return [];
  }

  const owner = receiptPolicy.identity ?? {};
  const caps = receiptPolicy.limits ?? {};
  const { mission, reasons } = receiptPolicy;

  // Org attribution: only the fields that are actually present.
  const attribution = [];
  if (owner.project) attribution.push(`project ${owner.project}`);
  if (owner.team) attribution.push(`team ${owner.team}`);
  if (owner.cost_center) attribution.push(`cost center ${owner.cost_center}`);
  const who = attribution.length > 0 ? attribution.join(" / ") : "no org attribution";

  const classTag = mission?.task_class ? ` [${mission.task_class}]` : "";

  // `== null` matches both null and undefined: the "no hard limit" case.
  const hardLimit = caps.mission_hard_limit_usd == null
    ? "none"
    : `$${Number(caps.mission_hard_limit_usd).toFixed(2)}`;
  const callCap = caps.max_llm_calls ? `, max calls ${caps.max_llm_calls}` : "";
  const timeCap = caps.max_runtime_minutes ? `, max ${caps.max_runtime_minutes} min` : "";

  const output = [];
  output.push(`Mission policy: ${mission?.name ?? "(unnamed)"}${classTag}`);
  output.push(` ${who}`);
  output.push(` Hard limit: ${hardLimit}${callCap}${timeCap}`);
  output.push(` Policy hash: ${receiptPolicy.hash}`);
  output.push(`Mission verdict: ${receiptPolicy.verdict}`);

  if (Array.isArray(reasons) && reasons.length > 0) {
    output.push(" Blocked because:");
    output.push(...reasons.map((reason) => ` - ${reason}`));
  }
  return output;
}
|