runcap 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,11 +7,13 @@ import path from "node:path";
7
7
  import process from "node:process";
8
8
  import { syncRun } from "./cloud.mjs";
9
9
  import { sendAlert } from "./alerts.mjs";
10
- import { compressRequestBody, estimateTokens, requestShapeText, detectLoop } from "./compressor.mjs";
10
+ import { compressRequestBody, estimateTokens, requestShapeText, detectLoop, responseSignature } from "./compressor.mjs";
11
+ import { evaluatePolicyVerdict, policyMeta, formatPolicyBlock } from "./policy.mjs";
11
12
 
12
13
  const STORE_DIR = ".runcap";
13
14
  const MISSIONS_DIR = path.join(STORE_DIR, "missions");
14
15
  const PLANS_DIR = path.join(STORE_DIR, "plans");
16
+ const OUTCOMES_DIR = path.join(STORE_DIR, "outcomes");
15
17
  const FUEL_FILE = path.join(STORE_DIR, "fuel.json");
16
18
  const GATEWAY_EVENTS_FILE = path.join(STORE_DIR, "gateway-events.jsonl");
17
19
  const BUDGET_FILE = path.join(STORE_DIR, "budget.json");
@@ -147,6 +149,439 @@ export async function runMission({ command, label, fuelBefore, autoGateway = fal
147
149
  };
148
150
  }
149
151
 
152
// Verified Outcome Cost: run an agent on one task, then run a verification
// command, and only count the money as "delivered" if verification passes.
// The unit the rest of the industry ignores: dollars per VERIFIED task, not
// dollars per token. Reuses the same gateway cost, git-diff, and truth-label
// machinery as runMission, so every number on the receipt is observed or
// calculated, never guessed.
//
// Options:
//   task     - non-empty task description, copied onto the receipt.
//   verify   - non-empty shell command string; exit code 0 means "passed".
//   command  - non-empty argv array for the agent; command[0] is recorded as
//              the program.
//   label    - mission label passed to runMission (defaults to "outcome").
//   mock     - forwarded to runMission.
//   guard    - freeze a task contract before the run and grade verification
//              integrity after it. Forced on whenever a policy is supplied.
//   protect / allow - extra protected paths / allowed paths for the guard.
//   policy   - optional policy object; when present the receipt gains a
//              `policy` section built from policyMeta + evaluatePolicyVerdict.
//   capUsd   - optional per-mission spend cap; must be a non-negative number.
//
// Resolves to { id, receipt, summary } where summary is the rendered receipt
// text. Throws Error when a required option is missing or capUsd is invalid.
export async function runOutcome({ task, verify, command, label, mock = false, guard = false, protect = [], allow = [], policy = null, capUsd = null }) {
  if (!task || !task.trim()) throw new Error("runOutcome: a --task description is required.");
  if (!verify || !verify.trim()) throw new Error("runOutcome: a --verify command is required (e.g. \"npm test && npm run build\").");
  if (!Array.isArray(command) || command.length === 0) throw new Error("runOutcome: an agent command after `--` is required.");
  // The cap is written verbatim into the budget env below. Reject values that
  // cannot be a spend ceiling (NaN, negative) instead of handing them to the
  // gateway unchecked.
  if (capUsd != null) {
    const capNumber = Number(capUsd);
    if (Number.isNaN(capNumber) || capNumber < 0) {
      throw new Error("runOutcome: capUsd must be a non-negative number of dollars.");
    }
  }
  // A policy-bound mission is always guarded: the verdict leans on the integrity
  // grade, so trusting an unguarded verifier would let a tampered pass score PASS.
  const guarded = policy ? true : guard;
  await ensureStore();
  await mkdir(OUTCOMES_DIR, { recursive: true });

  // Declared before use (the original relied on hoisting from below the
  // return). It closes over the validated options above.
  async function runOutcomeInner() {
    const windowMs = budgetWindowMs();
    const spentBefore = (await readGatewaySummary({ windowMs })).estimatedCostUsd;
    const cap = readBudget();
    // Snapshot the ledger length BEFORE the run so we attribute events to this run
    // by log position, not wall clock. Two runs in the same second would otherwise
    // overlap a time window and double-count each other's calls and models.
    const eventCountBefore = (await readGatewayEvents()).length;

    // Guard: freeze a Task Contract BEFORE the agent touches anything. Verifying
    // the result is meaningless if the agent can edit the verifier; so we hash the
    // verifier files, snapshot package scripts, record the baseline commit, and
    // confirm the task actually fails today (a pass on an already-green tree
    // proves nothing the agent did). The mission's cwd is not known until after
    // the run, so the contract is frozen against the current process cwd.
    const guardCwd = process.cwd();
    const contract = guarded ? await freezeTaskContract({ cwd: guardCwd, verify, protect, allow }) : null;

    // 1. Run the agent through the cap gateway so the spend is real and recorded.
    const mission = await runMission({ command, label: label ?? "outcome", autoGateway: true, mock });
    const missionRecord = await readMission(mission.id);

    // 2. Cost actually spent on this run, measured from the gateway ledger delta.
    const summaryAfter = await readGatewaySummary({ windowMs });
    const spentAfter = summaryAfter.estimatedCostUsd;
    const actualCostUsd = Number(Math.max(0, spentAfter - spentBefore).toFixed(6));
    // Models/calls the run actually hit: exactly the events appended during it.
    const runEvents = (await readGatewayEvents()).slice(eventCountBefore);
    const models = [...new Set(runEvents.map((e) => e.model).filter((m) => m && m !== "unknown"))];
    const llmCalls = runEvents.filter((e) => e.status >= 200 && e.status < 300 && e.usage).length;
    // Did the gateway block a call to stay under the cap? A budget_guard 429 means
    // the mission hit its ceiling - a policy-graded mission BLOCKS on it.
    const budgetGuardTripped = runEvents.some((e) => e.status === 429 && e.truth === "budget_guard");
    const costTruth = llmCalls > 0
      ? "calculated_from_provider_usage_and_sourced_price_table"
      : "no_llm_calls_through_gateway";

    // 3. Verify: run the user's verification command. Its exit code is the oracle.
    const verifyResult = await runShell(verify, missionRecord.cwd);
    const verifyPassed = verifyResult.exitCode === 0;

    // 4. Did the agent actually change anything? A pass on a no-op is not delivery.
    const changedFiles = missionRecord.diffEvidence.changedFiles;
    const producedDiff = changedFiles.length > 0;
    const outcome = verifyPassed ? "VERIFIED" : "UNVERIFIED";
    const verifiedOutcomeCostUsd = verifyPassed ? actualCostUsd : null;

    // 5. Guard: did the agent pass the check FAIRLY? Re-hash the verifier, look
    // for tampering, and grade the verification's trustworthiness on a 4-level
    // scale instead of a binary pass.
    const integrity = guarded
      ? await checkVerificationIntegrity({ contract, cwd: missionRecord.cwd, changedFiles, verifyPassed, verify })
      : null;

    const receipt = {
      schema: policy ? "runcap.outcome-receipt/v0.3" : (guarded ? "runcap.outcome-receipt/v0.2" : "runcap.outcome-receipt/v0.1"),
      id: mission.id,
      generatedAt: new Date().toISOString(),
      task,
      agent: { command, program: command[0] },
      models,
      verify: {
        command: verify,
        exitCode: verifyResult.exitCode,
        passed: verifyPassed,
        truth: "observed_exit_code"
      },
      cost: {
        plannedCapUsd: cap,
        actualCostUsd,
        verifiedOutcomeCostUsd,
        moneySpentWithoutVerifiedDeliveryUsd: verifyPassed ? 0 : actualCostUsd,
        llmCalls,
        budgetGuardTripped,
        truth: costTruth
      },
      work: {
        agentExitCode: missionRecord.exitCode,
        agentDurationMs: missionRecord.durationMs,
        verifyDurationMs: verifyResult.durationMs,
        changedFiles,
        changedFileCount: changedFiles.length,
        producedDiff,
        retries: { value: null, truth: "not_tracked_v0.1" },
        truth: "observed_from_git_and_exit_code"
      },
      outcome,
      verificationIntegrity: integrity,
      costScope: {
        measured: "observed_llm_calls_through_gateway_only",
        note: "Verified Outcome Cost is the LLM spend that bought the result. It does NOT include subscriptions, CI minutes, sandbox compute, or human review time. For full agent economics, divide total spend across N attempts by strongly-verified outcomes (Expected Verified Outcome Cost, needs N>=5)."
      },
      missionReport: path.join(MISSIONS_DIR, mission.id, "report.md")
    };

    // Policy-bound mission: stamp the receipt with who/what + the rules (and the
    // hash of the exact policy text), then grade an overall PASS/BLOCKED verdict.
    // The hash lets a reviewer confirm which rules were in force for this run.
    if (policy) {
      receipt.policy = { ...policyMeta(policy), ...evaluatePolicyVerdict(receipt, policy.policy) };
    }

    const outcomeDir = path.join(OUTCOMES_DIR, mission.id);
    await mkdir(outcomeDir, { recursive: true });
    await writeFile(path.join(outcomeDir, "receipt.json"), JSON.stringify(receipt, null, 2));
    // Render once; the same text is written to disk and returned to the caller.
    const summary = formatOutcomeReceipt(receipt);
    await writeFile(path.join(outcomeDir, "receipt.md"), summary);
    await writeFile(path.join(OUTCOMES_DIR, "latest"), mission.id);

    return { id: mission.id, receipt, summary };
  }

  // Per-mission hard cap: override the gateway's budget env for the duration of
  // THIS run so only this mission's spend counts against capUsd, then restore.
  // The gateway reads readBudget()/budgetWindowMs() per request, so this reuses
  // the existing budget_guard 429 enforcement with no new budget code.
  const prevBudgetEnv = process.env.AIM_DAILY_BUDGET_USD;
  const prevWindowEnv = process.env.AIM_BUDGET_WINDOW;
  if (capUsd != null) {
    process.env.AIM_DAILY_BUDGET_USD = String(capUsd);
    process.env.AIM_BUDGET_WINDOW = "session";
  }
  try {
    return await runOutcomeInner();
  } finally {
    // Restore the caller's env exactly, including "was not set at all".
    if (capUsd != null) {
      if (prevBudgetEnv === undefined) delete process.env.AIM_DAILY_BUDGET_USD;
      else process.env.AIM_DAILY_BUDGET_USD = prevBudgetEnv;
      if (prevWindowEnv === undefined) delete process.env.AIM_BUDGET_WINDOW;
      else process.env.AIM_BUDGET_WINDOW = prevWindowEnv;
    }
  }
}
301
+
302
// Run a verification command string through the shell so operators can write
// natural pipelines like "npm test && npm run build". Output streams live to
// this process's stdout/stderr and is also captured.
//
// commandString - the shell command line (run via `cmd /c` on win32, `sh -c`
//                 everywhere else).
// cwd           - working directory for the child process.
//
// Always resolves (never rejects) to { stdout, stderr, exitCode, durationMs }.
// A spawn failure resolves with exitCode 127 and the error message appended to
// stderr; a child that closes without an exit code (e.g. killed by a signal)
// resolves with exitCode 1.
async function runShell(commandString, cwd) {
  const started = Date.now();
  const onWindows = process.platform === "win32";
  const shell = onWindows ? "cmd" : "sh";
  const shellArgs = onWindows ? ["/c", commandString] : ["-c", commandString];
  return await new Promise((resolve) => {
    const child = spawn(shell, shellArgs, { cwd, env: { ...process.env, AIM_WRAPPED: "1" }, shell: false });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level: a multi-byte UTF-8 character split across two
    // pipe chunks is then reassembled instead of becoming U+FFFD garbage, which
    // is what per-chunk Buffer#toString() produced.
    child.stdout?.setEncoding("utf8");
    child.stderr?.setEncoding("utf8");
    child.stdout?.on("data", (text) => { stdout += text; process.stdout.write(text); });
    child.stderr?.on("data", (text) => { stderr += text; process.stderr.write(text); });
    // "error" and "close" can both fire; the promise keeps whichever resolves first.
    child.on("error", (error) => resolve({ stdout, stderr: stderr + `\n${error.message}`, exitCode: 127, durationMs: Date.now() - started }));
    child.on("close", (exitCode) => resolve({ stdout, stderr, exitCode: exitCode ?? 1, durationMs: Date.now() - started }));
  });
}
318
+
319
+ // --- Verification Integrity (runcap outcome --guard) ---------------------
320
+ // The honest hole in outcome v0.1: it trusts the verifier. An agent can turn a
321
+ // test green without doing the work - delete the test, rewrite the assertion,
322
+ // repoint the `npm test` script, disable strict mode, mock the real API. The
323
+ // guard freezes a contract before the run and re-checks it after, so a pass is
324
+ // graded on a 4-level trust scale instead of a binary VERIFIED.
325
+
326
// Paths the guard treats as protected by default. Despite the name these are
// regular expressions (not glob strings), tested against repo-relative paths
// that use forward slashes. Each one anchors on the start of the path or a
// directory separator so that e.g. "contest/" does not match the "test/" rule.
const DEFAULT_PROTECTED_GLOBS = [
  // Test files: *.test.js / *.test.ts / .mjs / .cjs / .jsx / .tsx variants.
  /(^|\/)[^/]*\.test\.[mc]?[jt]sx?$/,
  // Spec files with the same extension family.
  /(^|\/)[^/]*\.spec\.[mc]?[jt]sx?$/,
  // Anything under a __tests__ directory.
  /(^|\/)__tests__\//,
  // Anything under a test/ or tests/ directory.
  /(^|\/)tests?\//,
  // Package manifests (they hold the scripts a verify command may invoke).
  /(^|\/)package\.json$/,
  // tsconfig.json and tsconfig.*.json variants.
  /(^|\/)tsconfig[^/]*\.json$/,
  // Test-runner configuration files.
  /(^|\/)jest\.config\./,
  /(^|\/)vitest\.config\./
];
336
+
337
// Pull the concrete file paths a verify command names so we can hash them and
// detect edits. We can't statically parse an arbitrary shell pipeline, so we
// take a deliberately simple, honest approach: any whitespace token that looks
// like a path (contains "." or "/") and exists on disk is treated as a
// verifier file.
//
// Each token is tried first with only one surrounding quote stripped from each
// end, then again with attached shell punctuation removed, so that
// `sh ./verify.sh; echo ok` or `(sh 'verify.sh')` still yields verify.sh.
// Without the second attempt such a file was silently left out of the contract
// and could be edited unnoticed.
//
// verify - the verify command string.
// cwd    - directory that relative tokens are resolved against.
// Returns absolute paths, de-duplicated, in order of first appearance.
function verifierFilesFrom(verify, cwd) {
  const found = new Set();
  for (const raw of verify.split(/\s+/).filter(Boolean)) {
    const unquoted = raw.replace(/^["']|["']$/g, "");
    const stripped = unquoted.replace(/^[("'`]+|[;&|)"'`]+$/g, "");
    // Exact token first, so every path that matched before still matches.
    for (const candidate of new Set([unquoted, stripped])) {
      if (!/[./]/.test(candidate)) continue;
      const abs = path.isAbsolute(candidate) ? candidate : path.join(cwd, candidate);
      if (existsSync(abs)) {
        found.add(abs);
        break;
      }
    }
  }
  return [...found];
}
352
+
353
// SHA-256 of a file's bytes as a lowercase hex string, or null when the file
// cannot be read (missing, a directory, permission denied, ...).
function hashFile(absPath) {
  try {
    const bytes = readFileSync(absPath);
    const hasher = createHash("sha256");
    hasher.update(bytes);
    return hasher.digest("hex");
  } catch {
    return null;
  }
}
360
+
361
// Snapshot the "scripts" section of <cwd>/package.json.
// Returns null when package.json is missing, unreadable, or empty; returns the
// scripts value when the parsed manifest has a truthy one; otherwise {}.
// NOTE(review): safeJson is defined elsewhere in this file - presumably it
// yields a falsy value for unparsable text, which lands in the {} branch.
function packageScriptsOf(cwd) {
  const manifestText = readOptionalSync(path.join(cwd, "package.json"));
  if (!manifestText) return null;
  const manifest = safeJson(manifestText);
  if (manifest && manifest.scripts) return manifest.scripts;
  return {};
}
367
+
368
// Read a file as UTF-8 text, treating any read failure as "not there":
// returns the contents, or null if the read throws for any reason.
function readOptionalSync(file) {
  let text = null;
  try {
    text = readFileSync(file, "utf8");
  } catch {
    // Best effort by design: a missing or unreadable file yields null.
  }
  return text;
}
375
+
376
// Capture the "before" picture that the integrity check later compares
// against: the current HEAD commit, a hash of every file the verify command
// names, the package.json scripts, the caller's protect/allow lists, and
// whether the verify command passes on the untouched tree.
//
// cwd     - directory the git query, file hashing and baseline verify run in.
// verify  - the verify command string.
// protect - extra protected paths, stored as protectedPaths.
// allow   - allowed paths, stored as allowedPaths.
// Resolves to a "runcap.task-contract/v0.1" object.
async function freezeTaskContract({ cwd, verify, protect, allow }) {
  const headResult = await git(["rev-parse", "HEAD"], cwd);
  let baselineCommit = null;
  if (!headResult.error) baselineCommit = headResult.text;

  // Paths are stored relative to cwd; a file that could not be hashed keeps
  // sha256: null.
  const verifierFiles = [];
  for (const absolutePath of verifierFilesFrom(verify, cwd)) {
    verifierFiles.push({
      path: path.relative(cwd, absolutePath),
      sha256: hashFile(absolutePath)
    });
  }

  const packageScripts = packageScriptsOf(cwd);

  // Does the task actually fail today? A verify that already passes on the
  // baseline tree proves the agent did nothing. Run it before the agent moves.
  const { exitCode: baselineExitCode } = await runShell(verify, cwd);

  return {
    schema: "runcap.task-contract/v0.1",
    frozenAt: new Date().toISOString(),
    cwd,
    baselineCommit,
    verifyCommand: verify,
    verifierFiles,
    packageScripts,
    protectedPaths: protect,
    allowedPaths: allow,
    baselineVerify: { exitCode: baselineExitCode, passed: baselineExitCode === 0 }
  };
}
402
+
403
// True when relPath is protected: it equals one of the caller's extra
// protected entries, lives under one of them (entry treated as a directory
// prefix), or matches any of the default protected patterns.
function isProtected(relPath, extraProtected) {
  for (const entry of extraProtected) {
    if (relPath === entry) return true;
    // Ensure the prefix ends with a single "/" so "src" cannot match "srcx/...".
    const directoryPrefix = entry.replace(/\/?$/, "/");
    if (relPath.startsWith(directoryPrefix)) return true;
  }
  for (const pattern of DEFAULT_PROTECTED_GLOBS) {
    if (pattern.test(relPath)) return true;
  }
  return false;
}
407
+
408
// True when relPath is inside the allowed scope. An empty allow-list means
// "no restriction". Otherwise the path must equal an entry or sit beneath it
// (entry treated as a directory prefix).
function withinAllowed(relPath, allowed) {
  if (allowed.length === 0) return true;
  for (const entry of allowed) {
    if (relPath === entry) return true;
    // Ensure the prefix ends with a single "/" so "src" cannot match "srcx/...".
    const directoryPrefix = entry.replace(/\/?$/, "/");
    if (relPath.startsWith(directoryPrefix)) return true;
  }
  return false;
}
412
+
413
// Grade how far a verify result can be trusted by comparing the post-run state
// against the contract frozen before the run.
//
// contract     - object produced by freezeTaskContract.
// cwd          - directory the post-run files are read from.
// changedFiles - repo-relative paths the run changed.
// verifyPassed - whether the post-run verify command exited 0.
// verify       - the verify command string (re-run in the clean-room check).
//
// Resolves to a "runcap.verification-integrity/v0.1" object whose status is
// one of UNVERIFIED, VERIFIER_COMPROMISED, VERIFIED_STRONG, VERIFIED_WEAK,
// together with every check performed and the ids of the failed ones.
async function checkVerificationIntegrity({ contract, cwd, changedFiles, verifyPassed, verify }) {
  const violations = [];
  const checks = [];
  // Every check is logged; a failed one also lands in `violations`.
  const record = (id, ok, detail) => {
    checks.push({ id, ok, detail, truth: "calculated_from_observed_state" });
    if (!ok) violations.push(id);
  };

  // 1. Verifier files unchanged (hash match).
  for (const vf of contract.verifierFiles) {
    const now = hashFile(path.join(cwd, vf.path));
    // No baseline hash means nothing to compare against; logged as a non-violation.
    if (vf.sha256 === null) {
      record(`verifier_file_unreadable:${vf.path}`, true, "could not hash at freeze time");
      continue;
    }
    record(`verifier_file_unchanged:${vf.path}`, now === vf.sha256, now === null ? "verifier file deleted after run" : (now === vf.sha256 ? "hash matches" : "verifier file edited after run"));
  }

  // 2. package.json scripts unchanged (can't repoint `npm test` at `true`).
  if (contract.packageScripts) {
    const after = packageScriptsOf(cwd);
    const same = JSON.stringify(after) === JSON.stringify(contract.packageScripts);
    record("package_scripts_unchanged", same, same ? "scripts identical" : "package.json scripts changed during run");
  }

  // 3. No protected/test file touched, and changes stay within allowed scope.
  for (const f of changedFiles) {
    if (isProtected(f, contract.protectedPaths)) {
      record(`protected_path_untouched:${f}`, false, "agent modified a protected/test/config path");
    }
    if (!withinAllowed(f, contract.allowedPaths)) {
      record(`within_allowed_scope:${f}`, false, "change is outside the allowed paths");
    }
  }

  // 4. The task actually failed before the run (otherwise a pass is meaningless).
  const baseFailed = contract.baselineVerify && contract.baselineVerify.passed === false;
  record("baseline_failed_before_run", !!baseFailed, baseFailed ? "verify failed on the baseline tree (the task was real)" : "verify already passed before the agent ran - the pass proves nothing");

  // 5. Re-run verify against the baseline commit in a clean checkout: does the
  // pass survive without the agent's uncommitted working-tree state? This
  // catches a green that only exists because of untracked/uncommitted hacks.
  // The skip reason states why the check did not run: previously it always
  // said "no baseline commit", even when the real reason was a failed verify.
  // NOTE(review): when there is no baseline commit this check is not recorded
  // at all, so a run can still grade VERIFIED_STRONG without a clean-room pass.
  let cleanRoom = {
    ran: false,
    passed: null,
    detail: verifyPassed ? "skipped (no baseline commit)" : "skipped (verification did not pass)"
  };
  if (verifyPassed && contract.baselineCommit) {
    cleanRoom = await verifyInCleanWorktree({ cwd, baselineCommit: contract.baselineCommit, verify, changedFiles });
    record("verify_survives_clean_checkout", cleanRoom.passed === true, cleanRoom.detail);
  }

  // Grade. Tampering with the verifier is categorically worse than a weak pass:
  // it means the green light itself is untrustworthy.
  const verifierTampered = checks.some((c) => !c.ok && (c.id.startsWith("verifier_file_unchanged:") || c.id.startsWith("protected_path_untouched:") || c.id === "package_scripts_unchanged"));

  let status;
  if (!verifyPassed) {
    status = "UNVERIFIED";
  } else if (verifierTampered) {
    status = "VERIFIER_COMPROMISED";
  } else if (violations.length === 0) {
    status = "VERIFIED_STRONG";
  } else {
    status = "VERIFIED_WEAK";
  }

  const reason = {
    UNVERIFIED: "Verification did not pass.",
    VERIFIER_COMPROMISED: "Verification passed, but the verifier itself was modified during the run. The green light cannot be trusted.",
    VERIFIED_STRONG: "Verification passed and the verifier was untouched: tests/scripts unchanged, changes in scope, the task really failed before, and the pass survives a clean checkout.",
    VERIFIED_WEAK: "Verification passed and the verifier was untouched, but at least one strong condition was not met (e.g. baseline failure not reproduced, or pass not reproduced in a clean checkout)."
  }[status];

  return {
    schema: "runcap.verification-integrity/v0.1",
    status,
    reason,
    contract: {
      baselineCommit: contract.baselineCommit,
      verifierFiles: contract.verifierFiles.map((f) => f.path),
      protectedPaths: contract.protectedPaths,
      allowedPaths: contract.allowedPaths,
      baselineVerifyPassed: contract.baselineVerify ? contract.baselineVerify.passed : null
    },
    cleanRoom,
    checks,
    violations
  };
}
493
+
494
// Re-run the verify command from the baseline commit in a throwaway worktree,
// then copy in only the agent's changed files. If the pass came from real edits
// to allowed files it survives; if it came from uncommitted local junk it dies.
//
// cwd            - repository directory; git runs here and changed files are
//                  read from here.
// baselineCommit - commit the throwaway worktree is checked out at.
// verify         - verify command string to run inside the worktree.
// changedFiles   - repo-relative paths to copy over the baseline.
//
// Resolves to { ran, passed, detail }. When the worktree cannot be created,
// ran is false and passed is null.
async function verifyInCleanWorktree({ cwd, baselineCommit, verify, changedFiles }) {
  // Absolute path: `git worktree add` runs with `cwd` as its working
  // directory, while mkdir/writeFile/runShell resolve relative paths against
  // process.cwd(). A relative path therefore pointed at two different
  // directories whenever the two differed.
  const tmpBase = path.resolve(STORE_DIR, "cleanroom");
  await mkdir(tmpBase, { recursive: true });
  // Random suffix so concurrent or repeated checks never share a worktree.
  const wt = path.join(tmpBase, `wt-${createHash("sha1").update(`${baselineCommit}${Date.now()}${Math.random()}`).digest("hex").slice(0, 8)}`);
  const add = await git(["worktree", "add", "--detach", wt, baselineCommit], cwd);
  if (add.error) {
    return { ran: false, passed: null, detail: `clean-worktree check skipped: ${add.error}` };
  }
  try {
    // Bring the agent's changed files into the clean baseline so we test the
    // work, not the agent's whole dirty tree.
    for (const rel of changedFiles) {
      const src = path.join(cwd, rel);
      const dst = path.join(wt, rel);
      try {
        await mkdir(path.dirname(dst), { recursive: true });
        await writeFile(dst, await readFile(src));
      } catch {
        // Best effort: a file that cannot be read (e.g. the agent deleted it)
        // is skipped, so the baseline version stays in the worktree.
        // NOTE(review): deletions are therefore not mirrored into the clean room.
      }
    }
    const result = await runShell(verify, wt);
    return {
      ran: true,
      passed: result.exitCode === 0,
      detail: result.exitCode === 0
        ? "pass reproduced from baseline + changed files in a clean checkout"
        : "pass did NOT reproduce in a clean checkout (green depended on uncommitted local state)"
    };
  } finally {
    // Always remove the worktree, even when verify throws.
    await git(["worktree", "remove", "--force", wt], cwd);
  }
}
528
+
529
// Render an outcome receipt object as plain text, one fact per line, ending
// with a trailing newline. Sections appear in this order: run summary,
// outcome and cost, verification integrity (only when present), policy block
// (only when present).
function formatOutcomeReceipt(r) {
  // Dollar amounts that are null/undefined print as "n/a".
  const usd = (amount) => (amount === null || amount === undefined ? "n/a" : fmtUsd(amount));
  const seconds = (ms) => (ms / 1000).toFixed(1);
  const { cost, work, verify: verification } = r;

  const modelList = r.models.length ? r.models.join(", ") : "none (no LLM calls through gateway)";
  const plannedCap = cost.plannedCapUsd === null ? "no cap set" : usd(cost.plannedCapUsd);
  const changedList = work.changedFileCount ? " (" + work.changedFiles.join(", ") + ")" : "";
  const verifyWord = verification.passed ? "PASSED" : "FAILED";

  const lines = [];
  lines.push("Runcap outcome receipt");
  lines.push("======================");
  lines.push(`Task: ${r.task}`);
  lines.push(`Agent: ${r.agent.command.join(" ")}`);
  lines.push(`Model(s): ${modelList}`);
  lines.push(`Planned cap: ${plannedCap}`);
  lines.push(`Actual cost: ${usd(cost.actualCostUsd)} (${cost.llmCalls} priced LLM call(s), truth: ${cost.truth})`);
  lines.push(`Agent runtime: ${seconds(work.agentDurationMs)}s exit ${work.agentExitCode}`);
  lines.push(`Verify runtime: ${seconds(work.verifyDurationMs)}s`);
  lines.push(`Retries: not tracked (v0.1)`);
  lines.push(`Changed files: ${work.changedFileCount}${changedList}`);
  lines.push(`Verification: \`${verification.command}\``);
  lines.push(`Verify result: ${verifyWord} (exit ${verification.exitCode}, truth: observed)`);
  lines.push("");
  lines.push(`Outcome: ${r.outcome}`);

  if (r.outcome !== "VERIFIED") {
    lines.push(`Verified Outcome Cost: N/A (verification did not pass)`);
    lines.push(`Money spent without verified delivery: ${usd(cost.moneySpentWithoutVerifiedDeliveryUsd)}`);
  } else {
    lines.push(`Verified Outcome Cost: ${usd(cost.verifiedOutcomeCostUsd)} (money that bought a verified result)`);
  }

  const integrity = r.verificationIntegrity;
  if (integrity) {
    lines.push("", `Verification integrity: ${integrity.status}`, ` ${integrity.reason}`);
    if (integrity.violations.length) {
      lines.push(` Failed checks (${integrity.violations.length}):`);
      for (const check of integrity.checks) {
        if (!check.ok) lines.push(` - ${check.id}: ${check.detail}`);
      }
    }
    if (integrity.cleanRoom && integrity.cleanRoom.ran) {
      lines.push(` Clean-checkout re-verify: ${integrity.cleanRoom.passed ? "PASSED" : "FAILED"} (${integrity.cleanRoom.detail})`);
    }
  }

  if (r.policy) {
    lines.push("");
    lines.push(...formatPolicyBlock(r.policy));
  }
  return lines.join("\n") + "\n";
}
571
+
572
// Id of the most recent outcome, read from the "latest" pointer file in the
// outcomes directory. Resolves to the trimmed file contents, or null when the
// pointer cannot be read (for example before any outcome has been recorded).
export async function latestOutcomeId() {
  try {
    const pointerFile = path.join(OUTCOMES_DIR, "latest");
    const contents = await readFile(pointerFile, "utf8");
    return contents.trim();
  } catch {
    return null;
  }
}
579
+
580
// Load the stored markdown receipt for the outcome with the given id.
// Resolves to the file's text; rejects with the underlying read error when
// the receipt does not exist.
export async function renderOutcome(id) {
  const receiptPath = path.join(OUTCOMES_DIR, id, "receipt.md");
  const markdown = await readFile(receiptPath, "utf8");
  return markdown;
}
584
+
150
585
  export async function latestMissionId() {
151
586
  try {
152
587
  return (await readFile(path.join(STORE_DIR, "latest"), "utf8")).trim();
@@ -528,6 +963,13 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
528
963
  // but-not-identical turns, which plain hashing never catches.
529
964
  const loopEnabled = (process.env.AIM_LOOP_DETECT ?? "on").toLowerCase() !== "off";
530
965
  const shapeHistory = [];
966
+ // Response signatures aligned with shapeHistory (the observation each prior
967
+ // prompt produced). Lets the loop detector tell circling from convergence:
968
+ // similar prompts only count as a loop when the response did not move either.
969
+ // Each entry is a mutable holder { sig } so the slot for an in-flight turn can
970
+ // be captured by reference and filled once its upstream response returns, even
971
+ // if concurrent turns push new entries or shift() trims the array meanwhile.
972
+ const responseHistory = [];
531
973
  const SHAPE_HISTORY_MAX = 12;
532
974
  const server = http.createServer(async (request, response) => {
533
975
  const started = Date.now();
@@ -551,15 +993,32 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
551
993
 
552
994
  const bodyText = await readRequestBody(request);
553
995
  const requestBody = safeJson(bodyText) ?? {};
554
- // Loop signal: compare this request's shape against the recent run.
996
+ // Loop signal: compare this request's shape against the recent run. The
997
+ // response signatures gate prompt-similarity so a converging run (similar
998
+ // prompts, but the error/output is changing) is not flagged as circling.
555
999
  let loop = null;
1000
+ let currentShape = null;
1001
+ let responseSlot = null; // holder for THIS turn's response signature
556
1002
  if (loopEnabled) {
557
1003
  const shape = requestShapeText(requestBody);
558
1004
  if (shape) {
559
- const result = detectLoop(shape, shapeHistory);
560
- loop = { looping: result.looping, repeats: result.repeats, similarity: result.similarity, truth: "calculated" };
1005
+ currentShape = shape;
1006
+ const result = detectLoop(shape, shapeHistory, {
1007
+ responseSignatures: responseHistory.map((h) => h.sig),
1008
+ currentResponseSignature: responseHistory.length ? responseHistory[responseHistory.length - 1].sig : null
1009
+ });
1010
+ loop = {
1011
+ looping: result.looping,
1012
+ repeats: result.repeats,
1013
+ similarity: result.similarity,
1014
+ responseMoved: result.responseMoved,
1015
+ truth: "calculated"
1016
+ };
561
1017
  shapeHistory.push(shape);
1018
+ responseSlot = { sig: "" }; // filled by reference once upstream returns
1019
+ responseHistory.push(responseSlot);
562
1020
  if (shapeHistory.length > SHAPE_HISTORY_MAX) shapeHistory.shift();
1021
+ if (responseHistory.length > SHAPE_HISTORY_MAX) responseHistory.shift();
563
1022
  }
564
1023
  }
565
1024
  const budget = readBudget();
@@ -639,6 +1098,8 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
639
1098
  if (gatewayMode === "mock") {
640
1099
  const responseBody = mockCompletion(requestBody, url.pathname);
641
1100
  const responseText = JSON.stringify(responseBody);
1101
+ // Record before unblocking the client so a concurrent next turn sees it.
1102
+ if (responseSlot) responseSlot.sig = responseSignature(responseBody);
642
1103
  send(response, 200, responseText, "application/json; charset=utf-8");
643
1104
  await appendGatewayEvent({
644
1105
  at: new Date().toISOString(),
@@ -685,13 +1146,14 @@ function createGatewayServer({ port = 8792, mock = false, upstream = {} } = {})
685
1146
  body: forwardBody
686
1147
  });
687
1148
  const responseText = await upstreamResponse.text();
1149
+ const responseBody = safeJson(responseText) ?? {};
1150
+ // Record before unblocking the client so a concurrent next turn sees it.
1151
+ if (responseSlot) responseSlot.sig = responseSignature(responseBody);
688
1152
  response.writeHead(upstreamResponse.status, {
689
1153
  "content-type": upstreamResponse.headers.get("content-type") ?? "application/json",
690
1154
  "cache-control": "no-store"
691
1155
  });
692
1156
  response.end(responseText);
693
-
694
- const responseBody = safeJson(responseText) ?? {};
695
1157
  await appendGatewayEvent({
696
1158
  at: new Date().toISOString(),
697
1159
  path: url.pathname,
@@ -761,7 +1223,7 @@ export async function startGateway({ port = 8792, mock = false } = {}) {
761
1223
  // it down afterward. Upstream is pinned from the CURRENT env before the child's
762
1224
  // base URLs are rewritten, so the gateway proxies to the real provider, not to
763
1225
  // itself.
764
- async function startEphemeralGateway({ mock = false } = {}) {
1226
+ export async function startEphemeralGateway({ mock = false } = {}) {
765
1227
  await ensureStore();
766
1228
  const upstream = {
767
1229
  openaiKey: process.env.AIM_UPSTREAM_API_KEY ?? process.env.OPENAI_API_KEY,
@@ -1572,7 +2034,12 @@ const MODEL_PRICES = [
1572
2034
  { match: ["gpt-4.1-mini"], inputPerMillion: 0.4, outputPerMillion: 1.6, cacheReadPerMillion: 0.1, provider: "openai" },
1573
2035
  { match: ["gpt-4.1"], inputPerMillion: 2, outputPerMillion: 8, cacheReadPerMillion: 0.5, provider: "openai" },
1574
2036
  { match: ["gpt-4o-mini"], inputPerMillion: 0.15, outputPerMillion: 0.6, cacheReadPerMillion: 0.075, provider: "openai" },
1575
- { match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" }
2037
+ { match: ["gpt-4o"], inputPerMillion: 2.5, outputPerMillion: 10, cacheReadPerMillion: 1.25, provider: "openai" },
2038
+ // DeepSeek (api-docs.deepseek.com/quick_start/pricing). OpenAI-compatible API,
2039
+ // so the same gateway prices and caps it with no extra setup. deepseek-chat /
2040
+ // deepseek-reasoner are the non-thinking / thinking modes of deepseek-v4-flash.
2041
+ { match: ["deepseek-v4-pro"], inputPerMillion: 0.435, outputPerMillion: 0.87, cacheReadPerMillion: 0.003625, provider: "deepseek" },
2042
+ { match: ["deepseek-v4-flash", "deepseek-chat", "deepseek-reasoner", "deepseek"], inputPerMillion: 0.14, outputPerMillion: 0.28, cacheReadPerMillion: 0.0028, provider: "deepseek" }
1576
2043
  ];
1577
2044
 
1578
2045
  function estimateApiCost(usage, model) {