runcap 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +211 -9
- package/bin/runcap.mjs +153 -0
- package/examples/outcome-demo/agent-fixes.mjs +24 -0
- package/examples/outcome-demo/agent-spins.mjs +20 -0
- package/examples/outcome-demo/broken.mjs +5 -0
- package/examples/outcome-demo/verify.mjs +7 -0
- package/package.json +11 -2
- package/scripts/guard-test.mjs +76 -0
- package/scripts/loop-e2e.mjs +137 -0
- package/scripts/loop-test.mjs +45 -1
- package/scripts/make-demo-svg.mjs +20 -19
- package/scripts/make-linkedin-loop-video.mjs +338 -0
- package/scripts/mission-test.mjs +148 -0
- package/scripts/outcome-test.mjs +48 -0
- package/scripts/policy-test.mjs +121 -0
- package/scripts/render-media-screenshots.mjs +37 -0
- package/src/compressor.mjs +77 -9
- package/src/mission-control.mjs +475 -8
- package/src/policy.mjs +208 -0
package/src/mission-control.mjs
CHANGED
|
@@ -7,11 +7,13 @@ import path from "node:path";
|
|
|
7
7
|
import process from "node:process";
|
|
8
8
|
import { syncRun } from "./cloud.mjs";
|
|
9
9
|
import { sendAlert } from "./alerts.mjs";
|
|
10
|
-
import { compressRequestBody, estimateTokens, requestShapeText, detectLoop } from "./compressor.mjs";
|
|
10
|
+
import { compressRequestBody, estimateTokens, requestShapeText, detectLoop, responseSignature } from "./compressor.mjs";
|
|
11
|
+
import { evaluatePolicyVerdict, policyMeta, formatPolicyBlock } from "./policy.mjs";
|
|
11
12
|
|
|
12
13
|
const STORE_DIR = ".runcap";
|
|
13
14
|
const MISSIONS_DIR = path.join(STORE_DIR, "missions");
|
|
14
15
|
const PLANS_DIR = path.join(STORE_DIR, "plans");
|
|
16
|
+
const OUTCOMES_DIR = path.join(STORE_DIR, "outcomes");
|
|
15
17
|
const FUEL_FILE = path.join(STORE_DIR, "fuel.json");
|
|
16
18
|
const GATEWAY_EVENTS_FILE = path.join(STORE_DIR, "gateway-events.jsonl");
|
|
17
19
|
const BUDGET_FILE = path.join(STORE_DIR, "budget.json");
|
|
@@ -147,6 +149,439 @@ export async function runMission({ command, label, fuelBefore, autoGateway = fal
|
|
|
147
149
|
};
|
|
148
150
|
}
|
|
149
151
|
|
|
152
|
+
/**
 * Verified Outcome Cost: run an agent on one task, then run a verification
 * command, and only count the money as "delivered" if verification passes.
 * The unit is dollars per VERIFIED task, not dollars per token. Reuses the same
 * gateway cost, git-diff, and truth-label machinery as runMission, so every
 * number on the receipt is observed or calculated, never guessed.
 *
 * Writes `receipt.json` and `receipt.md` under OUTCOMES_DIR/<mission id> and
 * updates the OUTCOMES_DIR/latest pointer.
 *
 * @param {object} options
 * @param {string} options.task - Task description; required and non-blank.
 * @param {string} options.verify - Shell command whose exit code is the oracle; required and non-blank.
 * @param {string[]} options.command - Agent command and its arguments; required and non-empty.
 * @param {string} [options.label] - Mission label; "outcome" when omitted.
 * @param {boolean} [options.mock=false] - Forwarded unchanged to runMission.
 * @param {boolean} [options.guard=false] - Freeze a task contract before the run and grade
 *   verification integrity after it. Always on when `policy` is supplied.
 * @param {string[]} [options.protect=[]] - Extra protected paths stored in the task contract.
 * @param {string[]} [options.allow=[]] - Allowed paths stored in the task contract.
 * @param {object|null} [options.policy=null] - Policy wrapper; `policy.policy` is graded against the receipt.
 * @param {number|null} [options.capUsd=null] - When non-null, AIM_DAILY_BUDGET_USD is set to this
 *   value and AIM_BUDGET_WINDOW to "session" for the duration of the run, then both are restored.
 * @returns {Promise<{ id: string, receipt: object, summary: string }>} Mission id, the receipt
 *   object, and the rendered text receipt (identical to what was written to receipt.md).
 * @throws {Error} When `task`, `verify`, or `command` is missing or malformed.
 */
export async function runOutcome({ task, verify, command, label, mock = false, guard = false, protect = [], allow = [], policy = null, capUsd = null }) {
  // Type-check before calling .trim() so a non-string value gets the same
  // actionable message as a blank one instead of a raw TypeError.
  if (typeof task !== "string" || !task.trim()) throw new Error("runOutcome: a --task description is required.");
  if (typeof verify !== "string" || !verify.trim()) throw new Error("runOutcome: a --verify command is required (e.g. \"npm test && npm run build\").");
  if (!Array.isArray(command) || command.length === 0) throw new Error("runOutcome: an agent command after `--` is required.");
  // A policy-bound mission is always guarded: the verdict leans on the integrity
  // grade, so trusting an unguarded verifier would let a tampered pass score PASS.
  const guarded = Boolean(policy) || guard;
  await ensureStore();
  await mkdir(OUTCOMES_DIR, { recursive: true });

  // Per-mission hard cap: override the gateway's budget env for the duration of
  // THIS run so only this mission's spend counts against capUsd, then restore.
  // The gateway reads readBudget()/budgetWindowMs() per request, so this reuses
  // the existing budget_guard 429 enforcement with no new budget code.
  const prevBudgetEnv = process.env.AIM_DAILY_BUDGET_USD;
  const prevWindowEnv = process.env.AIM_BUDGET_WINDOW;
  if (capUsd != null) {
    process.env.AIM_DAILY_BUDGET_USD = String(capUsd);
    process.env.AIM_BUDGET_WINDOW = "session";
  }
  try {
    return await runOutcomeInner();
  } finally {
    if (capUsd != null) {
      if (prevBudgetEnv === undefined) delete process.env.AIM_DAILY_BUDGET_USD;
      else process.env.AIM_DAILY_BUDGET_USD = prevBudgetEnv;
      if (prevWindowEnv === undefined) delete process.env.AIM_BUDGET_WINDOW;
      else process.env.AIM_BUDGET_WINDOW = prevWindowEnv;
    }
  }

  // Hoisted function declaration: runs with the (possibly overridden) budget env.
  async function runOutcomeInner() {
    const windowMs = budgetWindowMs();
    const spentBefore = (await readGatewaySummary({ windowMs })).estimatedCostUsd;
    const cap = readBudget();
    // Snapshot the ledger length BEFORE the run so we attribute events to this run
    // by log position, not wall clock. Two runs in the same second would otherwise
    // overlap a time window and double-count each other's calls and models.
    const eventCountBefore = (await readGatewayEvents()).length;

    // Guard: freeze a Task Contract BEFORE the agent touches anything. Verifying
    // the result is meaningless if the agent can edit the verifier; so we hash the
    // verifier files, snapshot package scripts, record the baseline commit, and
    // confirm the task actually fails today (a pass on an already-green tree
    // proves nothing the agent did). The mission's cwd is only known after the
    // mission runs, so the contract is frozen against the current process cwd.
    const guardCwd = process.cwd();
    const contract = guarded ? await freezeTaskContract({ cwd: guardCwd, verify, protect, allow }) : null;

    // 1. Run the agent through the cap gateway so the spend is real and recorded.
    const mission = await runMission({ command, label: label ?? "outcome", autoGateway: true, mock });
    const missionRecord = await readMission(mission.id);

    // 2. Cost actually spent on this run, measured from the gateway ledger delta.
    const summaryAfter = await readGatewaySummary({ windowMs });
    const spentAfter = summaryAfter.estimatedCostUsd;
    const actualCostUsd = Number(Math.max(0, spentAfter - spentBefore).toFixed(6));
    // Models/calls the run actually hit: exactly the events appended during it.
    const runEvents = (await readGatewayEvents()).slice(eventCountBefore);
    const models = [...new Set(runEvents.map((e) => e.model).filter((m) => m && m !== "unknown"))];
    const llmCalls = runEvents.filter((e) => e.status >= 200 && e.status < 300 && e.usage).length;
    // Did the gateway block a call to stay under the cap? A budget_guard 429 means
    // the mission hit its ceiling - a policy-graded mission BLOCKS on it.
    const budgetGuardTripped = runEvents.some((e) => e.status === 429 && e.truth === "budget_guard");
    const costTruth = llmCalls > 0
      ? "calculated_from_provider_usage_and_sourced_price_table"
      : "no_llm_calls_through_gateway";

    // 3. Verify: run the user's verification command. Its exit code is the oracle.
    const verifyResult = await runShell(verify, missionRecord.cwd);
    const verifyPassed = verifyResult.exitCode === 0;

    // 4. Did the agent actually change anything? A pass on a no-op is not delivery.
    const changedFiles = missionRecord.diffEvidence.changedFiles;
    const producedDiff = changedFiles.length > 0;
    const outcome = verifyPassed ? "VERIFIED" : "UNVERIFIED";
    const verifiedOutcomeCostUsd = verifyPassed ? actualCostUsd : null;

    // 5. Guard: did the agent pass the check FAIRLY? Re-hash the verifier, look
    // for tampering, and grade the verification's trustworthiness on a 4-level
    // scale instead of a binary pass.
    const integrity = guarded
      ? await checkVerificationIntegrity({ contract, cwd: missionRecord.cwd, changedFiles, verifyPassed, verify })
      : null;

    const receipt = {
      schema: policy ? "runcap.outcome-receipt/v0.3" : (guarded ? "runcap.outcome-receipt/v0.2" : "runcap.outcome-receipt/v0.1"),
      id: mission.id,
      generatedAt: new Date().toISOString(),
      task,
      agent: { command, program: command[0] },
      models,
      verify: {
        command: verify,
        exitCode: verifyResult.exitCode,
        passed: verifyPassed,
        truth: "observed_exit_code"
      },
      cost: {
        plannedCapUsd: cap,
        actualCostUsd,
        verifiedOutcomeCostUsd,
        moneySpentWithoutVerifiedDeliveryUsd: verifyPassed ? 0 : actualCostUsd,
        llmCalls,
        budgetGuardTripped,
        truth: costTruth
      },
      work: {
        agentExitCode: missionRecord.exitCode,
        agentDurationMs: missionRecord.durationMs,
        verifyDurationMs: verifyResult.durationMs,
        changedFiles,
        changedFileCount: changedFiles.length,
        producedDiff,
        retries: { value: null, truth: "not_tracked_v0.1" },
        truth: "observed_from_git_and_exit_code"
      },
      outcome,
      verificationIntegrity: integrity,
      costScope: {
        measured: "observed_llm_calls_through_gateway_only",
        note: "Verified Outcome Cost is the LLM spend that bought the result. It does NOT include subscriptions, CI minutes, sandbox compute, or human review time. For full agent economics, divide total spend across N attempts by strongly-verified outcomes (Expected Verified Outcome Cost, needs N>=5)."
      },
      missionReport: path.join(MISSIONS_DIR, mission.id, "report.md")
    };

    // Policy-bound mission: stamp the receipt with who/what + the rules (and the
    // hash of the exact policy text), then grade an overall PASS/BLOCKED verdict.
    // The hash lets a reviewer confirm which rules were in force for this run.
    if (policy) {
      receipt.policy = { ...policyMeta(policy), ...evaluatePolicyVerdict(receipt, policy.policy) };
    }

    // Render once so the file on disk and the returned summary cannot diverge.
    const summary = formatOutcomeReceipt(receipt);

    const outcomeDir = path.join(OUTCOMES_DIR, mission.id);
    await mkdir(outcomeDir, { recursive: true });
    await writeFile(path.join(outcomeDir, "receipt.json"), JSON.stringify(receipt, null, 2));
    await writeFile(path.join(outcomeDir, "receipt.md"), summary);
    // Move the "latest" pointer only after both receipt files exist.
    await writeFile(path.join(OUTCOMES_DIR, "latest"), mission.id);

    return { id: mission.id, receipt, summary };
  }
}
|
|
301
|
+
|
|
302
|
+
/**
 * Run a command string through the platform shell (`cmd /c` on Windows,
 * `sh -c` elsewhere) so operators can write natural pipelines like
 * "npm test && npm run build". Output is streamed live to this process's
 * stdout/stderr and also captured.
 *
 * Never rejects: a spawn failure resolves with exit code 127 and the error
 * message appended to the captured stderr.
 *
 * @param {string} commandString - Shell command line to execute.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, durationMs: number }>}
 */
async function runShell(commandString, cwd) {
  const started = Date.now();
  const onWindows = process.platform === "win32";
  const shell = onWindows ? "cmd" : "sh";
  const shellArgs = onWindows ? ["/c", commandString] : ["-c", commandString];
  return await new Promise((resolve) => {
    const child = spawn(shell, shellArgs, { cwd, env: { ...process.env, AIM_WRAPPED: "1" }, shell: false });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level: converting each Buffer chunk on its own would
    // corrupt any multi-byte UTF-8 character that straddles a chunk boundary.
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (text) => { stdout += text; process.stdout.write(text); });
    child.stderr?.on("data", (text) => { stderr += text; process.stderr.write(text); });
    // Only the first resolve() wins, so "error" followed by "close" is harmless.
    child.on("error", (error) => resolve({ stdout, stderr: stderr + `\n${error.message}`, exitCode: 127, durationMs: Date.now() - started }));
    // exitCode is null when the child was terminated by a signal; report that as failure.
    child.on("close", (exitCode) => resolve({ stdout, stderr, exitCode: exitCode ?? 1, durationMs: Date.now() - started }));
  });
}
|
|
318
|
+
|
|
319
|
+
// --- Verification Integrity (runcap outcome --guard) ---------------------
|
|
320
|
+
// The honest hole in outcome v0.1: it trusts the verifier. An agent can turn a
|
|
321
|
+
// test green without doing the work - delete the test, rewrite the assertion,
|
|
322
|
+
// repoint the `npm test` script, disable strict mode, mock the real API. The
|
|
323
|
+
// guard freezes a contract before the run and re-checks it after, so a pass is
|
|
324
|
+
// graded on a 4-level trust scale instead of a binary VERIFIED.
|
|
325
|
+
|
|
326
|
+
// Path patterns treated as protected by default. Despite the name these are
// regular expressions, not globs; each is tested against a "/"-separated
// relative path and anchors on the start of the path or a directory boundary.
const DEFAULT_PROTECTED_GLOBS = [
  // Test and spec sources: *.test.* / *.spec.* with js, ts, jsx, tsx, mjs, cjs, mts, cts endings.
  /(^|\/)[^/]*\.test\.[mc]?[jt]sx?$/,
  /(^|\/)[^/]*\.spec\.[mc]?[jt]sx?$/,
  // Anything under a __tests__/, test/, or tests/ directory.
  /(^|\/)__tests__\//,
  /(^|\/)tests?\//,
  // Package manifest and TypeScript configuration (tsconfig*.json).
  /(^|\/)package\.json$/,
  /(^|\/)tsconfig[^/]*\.json$/,
  // Test-runner configuration files.
  /(^|\/)jest\.config\./,
  /(^|\/)vitest\.config\./
];
|
|
336
|
+
|
|
337
|
+
/**
 * Collect the paths a verify command names so they can be hashed and checked
 * for edits. An arbitrary shell pipeline cannot be parsed statically, so this
 * takes a deliberately simple approach: split on whitespace, strip one leading
 * and one trailing quote from each token, and keep every token that contains a
 * "." or "/" and resolves to a path that exists on disk.
 *
 * @param {string} verify - The verification command line.
 * @param {string} cwd - Base directory for resolving relative tokens.
 * @returns {string[]} Unique existing paths (absolute, or joined onto `cwd`), in first-seen order.
 */
function verifierFilesFrom(verify, cwd) {
  const found = new Set();
  const words = verify.split(/\s+/).filter(Boolean);
  words.forEach((word) => {
    const candidate = word.replace(/^["']|["']$/g, "");
    // Tokens with neither a dot nor a slash (e.g. "npm", "&&") are not paths.
    if (!/[./]/.test(candidate)) return;
    const resolved = path.isAbsolute(candidate) ? candidate : path.join(cwd, candidate);
    if (existsSync(resolved)) found.add(resolved);
  });
  return [...found];
}
|
|
352
|
+
|
|
353
|
+
/**
 * Compute the SHA-256 digest of a file's bytes.
 *
 * @param {string} absPath - Path of the file to hash.
 * @returns {string|null} Lowercase hex digest, or null when the file cannot be
 *   read (missing, a directory, permission denied, ...).
 */
function hashFile(absPath) {
  try {
    const contents = readFileSync(absPath);
    const hasher = createHash("sha256");
    hasher.update(contents);
    return hasher.digest("hex");
  } catch {
    // Best-effort by design: callers treat null as "could not hash".
    return null;
  }
}
|
|
360
|
+
|
|
361
|
+
/**
 * Read the `scripts` section of the package.json located in `cwd`.
 *
 * @param {string} cwd - Directory expected to contain package.json.
 * @returns {object|null} null when package.json is missing, unreadable, or
 *   empty; the `scripts` value when present and truthy; otherwise `{}` (also
 *   when the manifest does not parse).
 */
function packageScriptsOf(cwd) {
  const manifestText = readOptionalSync(path.join(cwd, "package.json"));
  if (!manifestText) return null;
  const manifest = safeJson(manifestText);
  if (manifest && manifest.scripts) return manifest.scripts;
  return {};
}
|
|
367
|
+
|
|
368
|
+
/**
 * Synchronously read a UTF-8 text file that is allowed to be absent.
 *
 * @param {string} file - Path of the file to read.
 * @returns {string|null} The file's text, or null when reading fails for any reason.
 */
function readOptionalSync(file) {
  let text = null;
  try {
    text = readFileSync(file, "utf8");
  } catch {
    // Absence (or any other read failure) is an expected case, reported as null.
    text = null;
  }
  return text;
}
|
|
375
|
+
|
|
376
|
+
/**
 * Freeze a task contract before the agent runs: the current HEAD commit, a
 * SHA-256 of every file the verify command names, the package.json scripts,
 * the caller's protected/allowed paths, and the result of running the verify
 * command on the untouched tree.
 *
 * @param {object} options
 * @param {string} options.cwd - Directory the contract is frozen against.
 * @param {string} options.verify - Verification command; executed once here as the baseline.
 * @param {string[]} options.protect - Extra protected paths, stored as given.
 * @param {string[]} options.allow - Allowed paths, stored as given.
 * @returns {Promise<object>} A "runcap.task-contract/v0.1" record.
 */
async function freezeTaskContract({ cwd, verify, protect, allow }) {
  const headLookup = await git(["rev-parse", "HEAD"], cwd);

  const verifierFiles = [];
  for (const absolute of verifierFilesFrom(verify, cwd)) {
    verifierFiles.push({ path: path.relative(cwd, absolute), sha256: hashFile(absolute) });
  }

  const packageScripts = packageScriptsOf(cwd);

  // Does the task actually fail today? A verify that already passes on the
  // baseline tree proves the agent did nothing. Run it before the agent moves.
  const { exitCode } = await runShell(verify, cwd);

  return {
    schema: "runcap.task-contract/v0.1",
    frozenAt: new Date().toISOString(),
    cwd,
    // No baseline commit when git could not resolve HEAD.
    baselineCommit: headLookup.error ? null : headLookup.text,
    verifyCommand: verify,
    verifierFiles,
    packageScripts,
    protectedPaths: protect,
    allowedPaths: allow,
    baselineVerify: { exitCode, passed: exitCode === 0 }
  };
}
|
|
402
|
+
|
|
403
|
+
/**
 * Decide whether a changed path is protected, either because the caller listed
 * it (exact match, or anything beneath a listed directory) or because it
 * matches one of DEFAULT_PROTECTED_GLOBS.
 *
 * Leading "./" segments are stripped from both sides before comparing, so an
 * entry written as "./fixtures" protects "fixtures/a.txt". Without that, such
 * an entry would never match and the guard would silently fail open.
 *
 * @param {string} relPath - "/"-separated path relative to the repository root.
 * @param {string[]} [extraProtected=[]] - Caller-supplied protected files or directories.
 * @returns {boolean} true when the path is protected.
 */
function isProtected(relPath, extraProtected = []) {
  const stripDotSlash = (p) => p.replace(/^(\.\/)+/, "");
  const target = stripDotSlash(relPath);
  const listed = extraProtected.some((entry) => {
    const base = stripDotSlash(entry);
    // Force exactly one trailing slash so "src" matches "src/x" but not "srcx/x".
    return target === base || target.startsWith(base.replace(/\/?$/, "/"));
  });
  if (listed) return true;
  return DEFAULT_PROTECTED_GLOBS.some((re) => re.test(target));
}
|
|
407
|
+
|
|
408
|
+
/**
 * Decide whether a changed path falls inside the allowed scope. An empty
 * allow-list means "no restriction". Otherwise the path must equal an entry or
 * live beneath an entry treated as a directory.
 *
 * Leading "./" segments are stripped from both sides before comparing, so an
 * entry written as "./src" allows "src/a.js". Without that, every change would
 * be reported as out of scope.
 *
 * @param {string} relPath - "/"-separated path relative to the repository root.
 * @param {string[]} [allowed=[]] - Allowed files or directories.
 * @returns {boolean} true when the change is in scope.
 */
function withinAllowed(relPath, allowed = []) {
  if (allowed.length === 0) return true;
  const stripDotSlash = (p) => p.replace(/^(\.\/)+/, "");
  const target = stripDotSlash(relPath);
  return allowed.some((entry) => {
    const base = stripDotSlash(entry);
    // Force exactly one trailing slash so "src" matches "src/x" but not "srcx/x".
    return target === base || target.startsWith(base.replace(/\/?$/, "/"));
  });
}
|
|
412
|
+
|
|
413
|
+
/**
 * Grade how trustworthy a verification result is by re-checking the frozen
 * task contract after the agent ran.
 *
 * Checks, in order: verifier file hashes, package.json scripts, protected and
 * allowed paths among the changed files, that verify failed on the baseline,
 * and (only when verify passed) that the pass reproduces in a clean checkout.
 *
 * Status:
 *  - UNVERIFIED            verify did not pass
 *  - VERIFIER_COMPROMISED  verify passed but a verifier file, protected path,
 *                          or the package scripts changed
 *  - VERIFIED_STRONG       verify passed and every check succeeded
 *  - VERIFIED_WEAK         verify passed, no tampering, but some check failed
 *                          or could not be performed
 *
 * @param {object} options
 * @param {object} options.contract - Record produced by freezeTaskContract.
 * @param {string} options.cwd - Directory the agent worked in.
 * @param {string[]} options.changedFiles - Paths the agent changed, relative to `cwd`.
 * @param {boolean} options.verifyPassed - Whether the post-run verify exited 0.
 * @param {string} options.verify - Verification command, re-run in the clean checkout.
 * @returns {Promise<object>} A "runcap.verification-integrity/v0.1" record.
 */
async function checkVerificationIntegrity({ contract, cwd, changedFiles, verifyPassed, verify }) {
  const violations = [];
  const checks = [];
  const record = (id, ok, detail) => {
    checks.push({ id, ok, detail, truth: "calculated_from_observed_state" });
    if (!ok) violations.push(id);
  };

  // 1. Verifier files unchanged (hash match).
  for (const vf of contract.verifierFiles) {
    if (vf.sha256 === null) {
      // Nothing to compare against; noted, but not held against the agent.
      record(`verifier_file_unreadable:${vf.path}`, true, "could not hash at freeze time");
      continue;
    }
    const now = hashFile(path.join(cwd, vf.path));
    const unchanged = now === vf.sha256;
    let detail = "verifier file edited after run";
    if (now === null) detail = "verifier file deleted after run";
    else if (unchanged) detail = "hash matches";
    record(`verifier_file_unchanged:${vf.path}`, unchanged, detail);
  }

  // 2. package.json scripts unchanged (can't repoint `npm test` at `true`).
  if (contract.packageScripts) {
    const after = packageScriptsOf(cwd);
    const same = JSON.stringify(after) === JSON.stringify(contract.packageScripts);
    record("package_scripts_unchanged", same, same ? "scripts identical" : "package.json scripts changed during run");
  }

  // 3. No protected/test file touched, and changes stay within allowed scope.
  for (const f of changedFiles) {
    if (isProtected(f, contract.protectedPaths)) {
      record(`protected_path_untouched:${f}`, false, "agent modified a protected/test/config path");
    }
    if (!withinAllowed(f, contract.allowedPaths)) {
      record(`within_allowed_scope:${f}`, false, "change is outside the allowed paths");
    }
  }

  // 4. The task actually failed before the run (otherwise a pass is meaningless).
  const baseFailed = contract.baselineVerify && contract.baselineVerify.passed === false;
  record("baseline_failed_before_run", !!baseFailed, baseFailed ? "verify failed on the baseline tree (the task was real)" : "verify already passed before the agent ran - the pass proves nothing");

  // 5. Re-run verify against the baseline commit in a clean checkout: does the
  // pass survive without the agent's uncommitted working-tree state? This
  // catches a green that only exists because of untracked/uncommitted hacks.
  let cleanRoom;
  if (!verifyPassed) {
    // Nothing to reproduce; say so rather than blaming a missing commit.
    cleanRoom = { ran: false, passed: null, detail: "skipped (verification did not pass)" };
  } else if (!contract.baselineCommit) {
    // The check could not be performed, so the pass is unproven. Record that as
    // a failed check: VERIFIED_STRONG asserts the pass survived a clean
    // checkout and must not be granted when that was never tested.
    cleanRoom = { ran: false, passed: null, detail: "skipped (no baseline commit)" };
    record("verify_survives_clean_checkout", false, cleanRoom.detail);
  } else {
    cleanRoom = await verifyInCleanWorktree({ cwd, baselineCommit: contract.baselineCommit, verify, changedFiles });
    record("verify_survives_clean_checkout", cleanRoom.passed === true, cleanRoom.detail);
  }

  // Grade. Tampering with the verifier is categorically worse than a weak pass:
  // it means the green light itself is untrustworthy.
  const tamperPrefixes = ["verifier_file_unchanged:", "protected_path_untouched:"];
  const verifierTampered = checks.some((c) => !c.ok && (tamperPrefixes.some((prefix) => c.id.startsWith(prefix)) || c.id === "package_scripts_unchanged"));

  let status;
  if (!verifyPassed) {
    status = "UNVERIFIED";
  } else if (verifierTampered) {
    status = "VERIFIER_COMPROMISED";
  } else if (violations.length === 0) {
    status = "VERIFIED_STRONG";
  } else {
    status = "VERIFIED_WEAK";
  }

  const reason = {
    UNVERIFIED: "Verification did not pass.",
    VERIFIER_COMPROMISED: "Verification passed, but the verifier itself was modified during the run. The green light cannot be trusted.",
    VERIFIED_STRONG: "Verification passed and the verifier was untouched: tests/scripts unchanged, changes in scope, the task really failed before, and the pass survives a clean checkout.",
    VERIFIED_WEAK: "Verification passed and the verifier was untouched, but at least one strong condition was not met (e.g. baseline failure not reproduced, or pass not reproduced in a clean checkout)."
  }[status];

  return {
    schema: "runcap.verification-integrity/v0.1",
    status,
    reason,
    contract: {
      baselineCommit: contract.baselineCommit,
      verifierFiles: contract.verifierFiles.map((f) => f.path),
      protectedPaths: contract.protectedPaths,
      allowedPaths: contract.allowedPaths,
      baselineVerifyPassed: contract.baselineVerify ? contract.baselineVerify.passed : null
    },
    cleanRoom,
    checks,
    violations
  };
}
|
|
493
|
+
|
|
494
|
+
/**
 * Re-run the verify command from the baseline commit in a throwaway git
 * worktree, after mirroring only the agent's changed files into it. If the
 * pass came from real edits it survives; if it depended on other uncommitted
 * local state it does not.
 *
 * The worktree is always removed afterwards, whatever the outcome.
 *
 * @param {object} options
 * @param {string} options.cwd - Agent working tree; git commands run from here.
 * @param {string} options.baselineCommit - Commit to check out in the worktree.
 * @param {string} options.verify - Verification command to run inside the worktree.
 * @param {string[]} options.changedFiles - Paths relative to `cwd` that the agent changed.
 * @returns {Promise<{ ran: boolean, passed: (boolean|null), detail: string }>}
 *   `ran` is false (and `passed` null) when the worktree could not be created.
 */
async function verifyInCleanWorktree({ cwd, baselineCommit, verify, changedFiles }) {
  // Absolute path: git resolves a relative worktree path against `cwd`, while
  // mkdir/writeFile/spawn resolve it against process.cwd(). An absolute path
  // keeps every consumer pointing at the same directory even when they differ.
  const tmpBase = path.resolve(STORE_DIR, "cleanroom");
  await mkdir(tmpBase, { recursive: true });
  const suffix = createHash("sha1").update(`${baselineCommit}${Date.now()}${Math.random()}`).digest("hex").slice(0, 8);
  const wt = path.join(tmpBase, `wt-${suffix}`);
  const add = await git(["worktree", "add", "--detach", wt, baselineCommit], cwd);
  if (add.error) {
    return { ran: false, passed: null, detail: `clean-worktree check skipped: ${add.error}` };
  }
  try {
    // Bring the agent's changed files into the clean baseline so we test the
    // work, not the agent's whole dirty tree.
    for (const rel of changedFiles) {
      const src = path.join(cwd, rel);
      const dst = path.join(wt, rel);
      try {
        const bytes = await readFile(src);
        await mkdir(path.dirname(dst), { recursive: true });
        await writeFile(dst, bytes);
      } catch (error) {
        // A missing source means the agent deleted the file. Mirror the deletion
        // so the clean room matches the tree that actually passed. Any other
        // failure (e.g. the path is a directory) keeps the baseline version.
        if (error?.code === "ENOENT") {
          await git(["rm", "-f", "--quiet", "--ignore-unmatch", "--", rel], wt);
        }
      }
    }
    const result = await runShell(verify, wt);
    const reproduced = result.exitCode === 0;
    return {
      ran: true,
      passed: reproduced,
      detail: reproduced
        ? "pass reproduced from baseline + changed files in a clean checkout"
        : "pass did NOT reproduce in a clean checkout (green depended on uncommitted local state)"
    };
  } finally {
    await git(["worktree", "remove", "--force", wt], cwd);
  }
}
|
|
528
|
+
|
|
529
|
+
/**
 * Render an outcome receipt as plain text: a fixed header, the cost verdict,
 * an optional verification-integrity section, and an optional policy section.
 *
 * @param {object} r - Outcome receipt as assembled by runOutcome.
 * @returns {string} Newline-joined receipt text ending with a trailing newline.
 */
function formatOutcomeReceipt(r) {
  // Missing amounts print as "n/a" instead of being formatted as money.
  const usd = (n) => (n === null || n === undefined ? "n/a" : fmtUsd(n));

  const modelList = r.models.length ? r.models.join(", ") : "none (no LLM calls through gateway)";
  const plannedCap = r.cost.plannedCapUsd === null ? "no cap set" : usd(r.cost.plannedCapUsd);
  const changedList = r.work.changedFileCount ? " (" + r.work.changedFiles.join(", ") + ")" : "";
  const verifyVerdict = r.verify.passed ? "PASSED" : "FAILED";

  const lines = [];
  lines.push("Runcap outcome receipt");
  lines.push("======================");
  lines.push(`Task: ${r.task}`);
  lines.push(`Agent: ${r.agent.command.join(" ")}`);
  lines.push(`Model(s): ${modelList}`);
  lines.push(`Planned cap: ${plannedCap}`);
  lines.push(`Actual cost: ${usd(r.cost.actualCostUsd)} (${r.cost.llmCalls} priced LLM call(s), truth: ${r.cost.truth})`);
  lines.push(`Agent runtime: ${(r.work.agentDurationMs / 1000).toFixed(1)}s exit ${r.work.agentExitCode}`);
  lines.push(`Verify runtime: ${(r.work.verifyDurationMs / 1000).toFixed(1)}s`);
  lines.push(`Retries: not tracked (v0.1)`);
  lines.push(`Changed files: ${r.work.changedFileCount}${changedList}`);
  lines.push(`Verification: \`${r.verify.command}\``);
  lines.push(`Verify result: ${verifyVerdict} (exit ${r.verify.exitCode}, truth: observed)`);
  lines.push("");
  lines.push(`Outcome: ${r.outcome}`);

  // Cost verdict: only a VERIFIED outcome gets a Verified Outcome Cost.
  if (r.outcome !== "VERIFIED") {
    lines.push(`Verified Outcome Cost: N/A (verification did not pass)`);
    lines.push(`Money spent without verified delivery: ${usd(r.cost.moneySpentWithoutVerifiedDeliveryUsd)}`);
  } else {
    lines.push(`Verified Outcome Cost: ${usd(r.cost.verifiedOutcomeCostUsd)} (money that bought a verified result)`);
  }

  // Integrity section, present only for guarded runs.
  const vi = r.verificationIntegrity;
  if (vi) {
    lines.push("");
    lines.push(`Verification integrity: ${vi.status}`);
    lines.push(` ${vi.reason}`);
    if (vi.violations.length) {
      lines.push(` Failed checks (${vi.violations.length}):`);
      vi.checks.forEach((c) => {
        if (!c.ok) lines.push(` - ${c.id}: ${c.detail}`);
      });
    }
    if (vi.cleanRoom && vi.cleanRoom.ran) lines.push(` Clean-checkout re-verify: ${vi.cleanRoom.passed ? "PASSED" : "FAILED"} (${vi.cleanRoom.detail})`);
  }

  // Policy section, present only for policy-bound runs.
  if (r.policy) {
    lines.push("");
    lines.push(...formatPolicyBlock(r.policy));
  }

  return lines.join("\n") + "\n";
}
|
|
571
|
+
|
|
572
|
+
/**
 * Read the id stored in the OUTCOMES_DIR "latest" pointer file.
 *
 * @returns {Promise<string|null>} The trimmed pointer contents, or null when
 *   the pointer file cannot be read.
 */
export async function latestOutcomeId() {
  const pointerFile = path.join(OUTCOMES_DIR, "latest");
  try {
    const contents = await readFile(pointerFile, "utf8");
    return contents.trim();
  } catch {
    // An unreadable pointer is reported as "no latest outcome".
    return null;
  }
}
|
|
579
|
+
|
|
580
|
+
/**
 * Load the stored text receipt for an outcome.
 *
 * @param {string} id - Outcome id, used as the directory name under OUTCOMES_DIR.
 * @returns {Promise<string>} Contents of that outcome's receipt.md.
 * @throws Rejects when the receipt file cannot be read.
 */
export async function renderOutcome(id) {
  const receiptPath = path.join(OUTCOMES_DIR, id, "receipt.md");
  const markdown = await readFile(receiptPath, "utf8");
  return markdown;
}
|
|
584
|
+
|
|
150
585
|
export async function latestMissionId() {
|
|
151
586
|
try {
|
|
152
587
|
return (await readFile(path.join(STORE_DIR, "latest"), "utf8")).trim();
|
|
@@ -528,6 +963,13 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
|
|
|
528
963
|
// but-not-identical turns, which plain hashing never catches.
|
|
529
964
|
const loopEnabled = (process.env.AIM_LOOP_DETECT ?? "on").toLowerCase() !== "off";
|
|
530
965
|
const shapeHistory = [];
|
|
966
|
+
// Response signatures aligned with shapeHistory (the observation each prior
|
|
967
|
+
// prompt produced). Lets the loop detector tell circling from convergence:
|
|
968
|
+
// similar prompts only count as a loop when the response did not move either.
|
|
969
|
+
// Each entry is a mutable holder { sig } so the slot for an in-flight turn can
|
|
970
|
+
// be captured by reference and filled once its upstream response returns, even
|
|
971
|
+
// if concurrent turns push new entries or shift() trims the array meanwhile.
|
|
972
|
+
const responseHistory = [];
|
|
531
973
|
const SHAPE_HISTORY_MAX = 12;
|
|
532
974
|
const server = http.createServer(async (request, response) => {
|
|
533
975
|
const started = Date.now();
|
|
@@ -551,15 +993,32 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
|
|
|
551
993
|
|
|
552
994
|
const bodyText = await readRequestBody(request);
|
|
553
995
|
const requestBody = safeJson(bodyText) ?? {};
|
|
554
|
-
// Loop signal: compare this request's shape against the recent run.
|
|
996
|
+
// Loop signal: compare this request's shape against the recent run. The
|
|
997
|
+
// response signatures gate prompt-similarity so a converging run (similar
|
|
998
|
+
// prompts, but the error/output is changing) is not flagged as circling.
|
|
555
999
|
let loop = null;
|
|
1000
|
+
let currentShape = null;
|
|
1001
|
+
let responseSlot = null; // holder for THIS turn's response signature
|
|
556
1002
|
if (loopEnabled) {
|
|
557
1003
|
const shape = requestShapeText(requestBody);
|
|
558
1004
|
if (shape) {
|
|
559
|
-
|
|
560
|
-
|
|
1005
|
+
currentShape = shape;
|
|
1006
|
+
const result = detectLoop(shape, shapeHistory, {
|
|
1007
|
+
responseSignatures: responseHistory.map((h) => h.sig),
|
|
1008
|
+
currentResponseSignature: responseHistory.length ? responseHistory[responseHistory.length - 1].sig : null
|
|
1009
|
+
});
|
|
1010
|
+
loop = {
|
|
1011
|
+
looping: result.looping,
|
|
1012
|
+
repeats: result.repeats,
|
|
1013
|
+
similarity: result.similarity,
|
|
1014
|
+
responseMoved: result.responseMoved,
|
|
1015
|
+
truth: "calculated"
|
|
1016
|
+
};
|
|
561
1017
|
shapeHistory.push(shape);
|
|
1018
|
+
responseSlot = { sig: "" }; // filled by reference once upstream returns
|
|
1019
|
+
responseHistory.push(responseSlot);
|
|
562
1020
|
if (shapeHistory.length > SHAPE_HISTORY_MAX) shapeHistory.shift();
|
|
1021
|
+
if (responseHistory.length > SHAPE_HISTORY_MAX) responseHistory.shift();
|
|
563
1022
|
}
|
|
564
1023
|
}
|
|
565
1024
|
const budget = readBudget();
|
|
@@ -639,6 +1098,8 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
|
|
|
639
1098
|
if (gatewayMode === "mock") {
|
|
640
1099
|
const responseBody = mockCompletion(requestBody, url.pathname);
|
|
641
1100
|
const responseText = JSON.stringify(responseBody);
|
|
1101
|
+
// Record before unblocking the client so a concurrent next turn sees it.
|
|
1102
|
+
if (responseSlot) responseSlot.sig = responseSignature(responseBody);
|
|
642
1103
|
send(response, 200, responseText, "application/json; charset=utf-8");
|
|
643
1104
|
await appendGatewayEvent({
|
|
644
1105
|
at: new Date().toISOString(),
|
|
@@ -685,13 +1146,14 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
|
|
|
685
1146
|
body: forwardBody
|
|
686
1147
|
});
|
|
687
1148
|
const responseText = await upstreamResponse.text();
|
|
1149
|
+
const responseBody = safeJson(responseText) ?? {};
|
|
1150
|
+
// Record before unblocking the client so a concurrent next turn sees it.
|
|
1151
|
+
if (responseSlot) responseSlot.sig = responseSignature(responseBody);
|
|
688
1152
|
response.writeHead(upstreamResponse.status, {
|
|
689
1153
|
"content-type": upstreamResponse.headers.get("content-type") ?? "application/json",
|
|
690
1154
|
"cache-control": "no-store"
|
|
691
1155
|
});
|
|
692
1156
|
response.end(responseText);
|
|
693
|
-
|
|
694
|
-
const responseBody = safeJson(responseText) ?? {};
|
|
695
1157
|
await appendGatewayEvent({
|
|
696
1158
|
at: new Date().toISOString(),
|
|
697
1159
|
path: url.pathname,
|
|
@@ -761,7 +1223,7 @@ export async function startGateway({ port = 8792, mock = false } = {}) {
|
|
|
761
1223
|
// it down afterward. Upstream is pinned from the CURRENT env before the child's
|
|
762
1224
|
// base URLs are rewritten, so the gateway proxies to the real provider, not to
|
|
763
1225
|
// itself.
|
|
764
|
-
async function startEphemeralGateway({ mock = false } = {}) {
|
|
1226
|
+
export async function startEphemeralGateway({ mock = false } = {}) {
|
|
765
1227
|
await ensureStore();
|
|
766
1228
|
const upstream = {
|
|
767
1229
|
openaiKey: process.env.AIM_UPSTREAM_API_KEY ?? process.env.OPENAI_API_KEY,
|
|
@@ -1572,7 +2034,12 @@ const MODEL_PRICES = [
|
|
|
1572
2034
|
{ match: ["gpt-4.1-mini"], inputPerMillion: 0.4, outputPerMillion: 1.6, cacheReadPerMillion: 0.1, provider: "openai" },
|
|
1573
2035
|
{ match: ["gpt-4.1"], inputPerMillion: 2, outputPerMillion: 8, cacheReadPerMillion: 0.5, provider: "openai" },
|
|
1574
2036
|
{ match: ["gpt-4o-mini"], inputPerMillion: 0.15, outputPerMillion: 0.6, cacheReadPerMillion: 0.075, provider: "openai" },
|
|
1575
|
-
{ match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" }
|
|
2037
|
+
{ match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" },
|
|
2038
|
+
// DeepSeek (api-docs.deepseek.com/quick_start/pricing). OpenAI-compatible API,
|
|
2039
|
+
// so the same gateway prices and caps it with no extra setup. deepseek-chat /
|
|
2040
|
+
// deepseek-reasoner are the non-thinking / thinking modes of deepseek-v4-flash.
|
|
2041
|
+
{ match: ["deepseek-v4-pro"], inputPerMillion: 0.435, outputPerMillion: 0.87, cacheReadPerMillion: 0.003625, provider: "deepseek" },
|
|
2042
|
+
{ match: ["deepseek-v4-flash", "deepseek-chat", "deepseek-reasoner", "deepseek"], inputPerMillion: 0.14, outputPerMillion: 0.28, cacheReadPerMillion: 0.0028, provider: "deepseek" }
|
|
1576
2043
|
];
|
|
1577
2044
|
|
|
1578
2045
|
function estimateApiCost(usage, model) {
|