@desplega.ai/agent-swarm 1.71.2 → 1.72.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/README.md +3 -2
  2. package/openapi.json +994 -62
  3. package/package.json +2 -1
  4. package/src/be/budget-admission.ts +121 -0
  5. package/src/be/budget-refusal-notify.ts +145 -0
  6. package/src/be/db.ts +488 -5
  7. package/src/be/migrations/044_provider_meta.sql +2 -0
  8. package/src/be/migrations/046_budgets_and_pricing.sql +87 -0
  9. package/src/be/migrations/047_session_costs_cost_source.sql +16 -0
  10. package/src/cli.tsx +22 -1
  11. package/src/commands/claude-managed-setup.ts +687 -0
  12. package/src/commands/codex-login.ts +1 -1
  13. package/src/commands/runner.ts +175 -28
  14. package/src/commands/templates.ts +10 -6
  15. package/src/http/budgets.ts +219 -0
  16. package/src/http/index.ts +6 -0
  17. package/src/http/integrations.ts +134 -0
  18. package/src/http/poll.ts +161 -3
  19. package/src/http/pricing.ts +245 -0
  20. package/src/http/session-data.ts +54 -6
  21. package/src/http/tasks.ts +23 -2
  22. package/src/prompts/base-prompt.ts +103 -73
  23. package/src/prompts/session-templates.ts +43 -0
  24. package/src/providers/claude-adapter.ts +3 -1
  25. package/src/providers/claude-managed-adapter.ts +871 -0
  26. package/src/providers/claude-managed-models.ts +117 -0
  27. package/src/providers/claude-managed-swarm-events.ts +77 -0
  28. package/src/providers/codex-adapter.ts +3 -1
  29. package/src/providers/codex-skill-resolver.ts +10 -0
  30. package/src/providers/codex-swarm-events.ts +20 -161
  31. package/src/providers/devin-adapter.ts +894 -0
  32. package/src/providers/devin-api.ts +207 -0
  33. package/src/providers/devin-playbooks.ts +91 -0
  34. package/src/providers/devin-skill-resolver.ts +113 -0
  35. package/src/providers/index.ts +10 -1
  36. package/src/providers/pi-mono-adapter.ts +3 -1
  37. package/src/providers/swarm-events-shared.ts +262 -0
  38. package/src/providers/types.ts +26 -1
  39. package/src/tests/base-prompt.test.ts +199 -0
  40. package/src/tests/budget-admission.test.ts +339 -0
  41. package/src/tests/budget-claim-gate.test.ts +288 -0
  42. package/src/tests/budget-refusal-notification.test.ts +324 -0
  43. package/src/tests/budgets-routes.test.ts +331 -0
  44. package/src/tests/claude-managed-adapter.test.ts +1301 -0
  45. package/src/tests/claude-managed-setup.test.ts +325 -0
  46. package/src/tests/devin-adapter.test.ts +677 -0
  47. package/src/tests/devin-api.test.ts +339 -0
  48. package/src/tests/integrations-http.test.ts +211 -0
  49. package/src/tests/migration-046-budgets.test.ts +327 -0
  50. package/src/tests/pricing-routes.test.ts +315 -0
  51. package/src/tests/prompt-template-remaining.test.ts +4 -0
  52. package/src/tests/prompt-template-session.test.ts +2 -2
  53. package/src/tests/provider-adapter.test.ts +1 -1
  54. package/src/tests/runner-budget-refused.test.ts +271 -0
  55. package/src/tests/session-costs-codex-recompute.test.ts +386 -0
  56. package/src/tools/poll-task.ts +13 -2
  57. package/src/tools/task-action.ts +92 -2
  58. package/src/tools/templates.ts +29 -0
  59. package/src/types.ts +116 -0
  60. package/src/utils/budget-backoff.ts +34 -0
  61. package/src/utils/credentials.ts +4 -0
  62. package/src/utils/provider-metadata.ts +9 -0
@@ -0,0 +1,271 @@
1
+ // Phase 4 — worker dispatch tests for `budget_refused` triggers.
2
+ //
3
+ // The full runner poll loop in `src/commands/runner.ts:2926+` is hard to
4
+ // unit-test directly (it boots adapters, opens HTTP, etc.). The back-off
5
+ // computation is therefore extracted into a pure helper —
6
+ // `computeBudgetBackoffMs` — which we exercise here in isolation. We also
7
+ // run a small in-test simulation of the loop's *back-off-state machine*
8
+ // (the `consecutiveBudgetRefusals` counter + reset semantics) against a
9
+ // stubbed `pollForTrigger` to assert the behaviors mandated by the plan:
10
+ //
11
+ // - back-off doubles per consecutive refusal up to the 5-minute cap;
12
+ // - any non-refused trigger resets the counter (next refusal restarts
13
+ // at base interval);
14
+ // - refusals do *not* increment whatever empty-poll counter the loop
15
+ // maintains (we simulate one alongside the back-off counter and assert
16
+ // it stays at 0);
17
+ // - the structured log payload passes through `scrubSecrets` at egress.
18
+
19
+ import { describe, expect, mock, test } from "bun:test";
20
+ import { BUDGET_BACKOFF_CAP_MS, computeBudgetBackoffMs } from "../utils/budget-backoff";
21
+ import { scrubSecrets } from "../utils/secret-scrubber";
22
+
23
+ // ─── computeBudgetBackoffMs ────────────────────────────────────────────────
24
+
25
// Unit coverage for the pure back-off helper: the doubling schedule, the
// 5-minute ceiling, and how non-positive refusal counts are handled.
describe("computeBudgetBackoffMs", () => {
  test("doubles per consecutive refusal starting at basePollMs", () => {
    const base = 2000;
    // [consecutive refusals, expected sleep in ms]
    const schedule: Array<[number, number]> = [
      [1, 2000],
      [2, 4000],
      [3, 8000],
      [4, 16_000],
      [5, 32_000],
      [6, 64_000],
      [7, 128_000],
      [8, 256_000],
    ];
    for (const [refusals, expectedMs] of schedule) {
      expect(computeBudgetBackoffMs(refusals, base)).toBe(expectedMs);
    }
  });

  test("caps at 5 minutes regardless of how many refusals", () => {
    const base = 2000;
    // The 9th refusal would be 2000 * 2^8 = 512000, which exceeds the 300000 cap.
    for (const refusals of [9, 20, 1000]) {
      expect(computeBudgetBackoffMs(refusals, base)).toBe(BUDGET_BACKOFF_CAP_MS);
    }
  });

  test("first refusal sleeps exactly basePollMs (no doubling yet)", () => {
    for (const base of [100, 5000]) {
      expect(computeBudgetBackoffMs(1, base)).toBe(base);
    }
  });

  test("BUDGET_BACKOFF_CAP_MS is exactly 5 minutes", () => {
    const fiveMinutesMs = 5 * 60 * 1000;
    expect(BUDGET_BACKOFF_CAP_MS).toBe(fiveMinutesMs);
  });

  test("guards against pathological non-positive inputs", () => {
    // A zero or negative count yields the same sleep as a first refusal.
    for (const refusals of [0, -5]) {
      expect(computeBudgetBackoffMs(refusals, 2000)).toBe(2000);
    }
  });
});
61
+
62
+ // ─── back-off state machine simulation ─────────────────────────────────────
63
+ //
64
+ // Re-implements the relevant slice of the poll loop so we can assert the
65
+ // counter semantics without booting the full runner. If you change the
66
+ // behavior in `runner.ts`, mirror it here. The logic must stay byte-equal
67
+ // to the block in `src/commands/runner.ts` (search for
68
+ // `consecutiveBudgetRefusals` there).
69
+
70
/** Trigger kinds the simulated poll may hand back. */
type LoopTriggerType =
  | "task_assigned"
  | "task_offered"
  | "unread_mentions"
  | "pool_tasks_available"
  | "channel_activity"
  | "budget_refused";

/**
 * Minimal trigger shape consumed by the simulated poll loop. Only `type`
 * is required; the remaining fields are copied as-is into the structured
 * log payload when `type` is `"budget_refused"`.
 */
interface LoopTrigger {
  type: LoopTriggerType;
  /** Which budget caused the refusal. */
  cause?: "agent" | "global";
  // Spend/budget figures — presumably USD amounts; confirm against the server payload.
  agentSpend?: number;
  agentBudget?: number;
  globalSpend?: number;
  globalBudget?: number;
  /** Reset time as a string (the fixture in this file uses an ISO-8601 timestamp). */
  resetAt?: string;
}
85
+
86
/** Snapshot of the simulated loop's state once every trigger has been consumed. */
interface SimResult {
  /** One entry per `budget_refused` trigger: the sleep computed for it, in arrival order. */
  backoffSleeps: number[];
  /** Value of the consecutive-refusal counter when the simulation ended. */
  consecutiveBudgetRefusals: number;
  /** Separate empty-poll counter; refusals must leave it untouched. */
  emptyPollCount: number;
  /**
   * Log lines associated with the back-off branch. The tests in this file
   * read emitted lines from the `log` callback rather than from this field.
   */
  logLines: string[];
  /** How many triggers went down the normal (non-refused) dispatch branch. */
  dispatchedTriggers: number;
}
98
+
99
/**
 * Mirrors the back-off slice of the runner poll loop.
 *
 * @param triggers - What `pollForTrigger` returns on consecutive iterations
 *   (`null` = no trigger inside the long-poll window).
 * @param basePollMs - Base interval handed to `computeBudgetBackoffMs`.
 * @param log - Receives each scrubbed back-off log line as it is produced.
 * @returns Final counters, the per-refusal sleeps, and the emitted log lines.
 *   `logLines` holds exactly the lines passed to `log`, in the same order.
 */
function simulatePollLoop(
  triggers: Array<LoopTrigger | null>,
  basePollMs: number,
  log: (line: string) => void,
): SimResult {
  let consecutiveBudgetRefusals = 0;
  let emptyPollCount = 0;
  const backoffSleeps: number[] = [];
  // Collected here so the returned `logLines` reflects what was actually
  // emitted instead of always being empty.
  const logLines: string[] = [];
  let dispatchedTriggers = 0;

  for (const trigger of triggers) {
    if (trigger) {
      if (trigger.type === "budget_refused") {
        consecutiveBudgetRefusals++;
        const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, basePollMs);
        const refusalPayload = JSON.stringify({
          event: "budget_refused",
          cause: trigger.cause,
          agentSpend: trigger.agentSpend,
          agentBudget: trigger.agentBudget,
          globalSpend: trigger.globalSpend,
          globalBudget: trigger.globalBudget,
          resetAt: trigger.resetAt,
          consecutiveRefusals: consecutiveBudgetRefusals,
          backoffMs,
        });
        // The payload is scrubbed before it leaves this function.
        const line = `[role] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`;
        logLines.push(line);
        log(line);
        backoffSleeps.push(backoffMs);
        // `continue` — DO NOT increment empty-poll count.
        continue;
      }
      // Any non-refused trigger ends the refusal streak.
      consecutiveBudgetRefusals = 0;
      dispatchedTriggers++;
    } else {
      // Empty poll — bumps the empty-poll counter but does not reset
      // back-off state (refusals are about budget, not silence).
      emptyPollCount++;
    }
  }

  return {
    backoffSleeps,
    consecutiveBudgetRefusals,
    emptyPollCount,
    logLines,
    dispatchedTriggers,
  };
}
152
+
153
+ // ─── Behavior tests against the simulated loop ─────────────────────────────
154
+
155
// Fixture: an agent-level refusal. The recorded spend (0.05) is above the
// agent budget (0.01); no global figures are supplied.
const REFUSAL: LoopTrigger = {
  type: "budget_refused",
  cause: "agent",
  agentBudget: 0.01,
  agentSpend: 0.05,
  resetAt: "2026-04-29T00:00:00.000Z",
};

// Fixture: an ordinary trigger that takes the normal dispatch branch.
const TASK: LoopTrigger = {
  type: "task_assigned",
};
164
+
165
// Behavioural checks of the counter semantics, driven through `simulatePollLoop`.
describe("poll-loop back-off state machine", () => {
  const BASE_POLL_MS = 2000;

  /** Runs the simulator and captures every line it passes to the log callback. */
  const runLoop = (sequence: Array<LoopTrigger | null>) => {
    const lines: string[] = [];
    const result = simulatePollLoop(sequence, BASE_POLL_MS, (line) => {
      lines.push(line);
    });
    return { result, lines };
  };

  test("doubles up to but not past 5 min on a long refusal streak", () => {
    // Nine refusals in a row: 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s, then the 300s cap.
    const streak = Array.from({ length: 9 }, (): LoopTrigger | null => REFUSAL);
    const { result } = runLoop(streak);
    const expectedSleeps = [
      2000,
      4000,
      8000,
      16_000,
      32_000,
      64_000,
      128_000,
      256_000,
      BUDGET_BACKOFF_CAP_MS,
    ];
    expect(result.backoffSleeps).toEqual(expectedSleeps);
    // No individual sleep may exceed the cap.
    result.backoffSleeps.forEach((sleepMs) => {
      expect(sleepMs).toBeLessThanOrEqual(BUDGET_BACKOFF_CAP_MS);
    });
    expect(result.consecutiveBudgetRefusals).toBe(9);
  });

  test("resets to 0 after a non-refused trigger; subsequent refusal restarts at base", () => {
    // refusal, refusal, task, refusal → sleeps of 2000, 4000, then 2000 again.
    const { result } = runLoop([REFUSAL, REFUSAL, TASK, REFUSAL]);
    expect(result.backoffSleeps).toEqual([2000, 4000, 2000]);
    expect(result.dispatchedTriggers).toBe(1);
    // Only the single refusal after the reset counts toward the final streak.
    expect(result.consecutiveBudgetRefusals).toBe(1);
  });

  test("empty-poll counter is unchanged across refusals", () => {
    // Five refusals and no nulls: the empty-poll counter has nothing to count.
    const { result } = runLoop([REFUSAL, REFUSAL, REFUSAL, REFUSAL, REFUSAL]);
    expect(result.emptyPollCount).toBe(0);
    expect(result.backoffSleeps).toHaveLength(5);
  });

  test("empty polls (null triggers) bump empty-poll counter but not back-off", () => {
    // null, null, refusal, null → three empty polls, one refusal.
    const { result } = runLoop([null, null, REFUSAL, null]);
    expect(result.emptyPollCount).toBe(3);
    expect(result.backoffSleeps).toEqual([2000]);
  });

  test("structured refusal log goes through scrubSecrets at egress", () => {
    // The simulator builds its line with `scrubSecrets(...)`; here we check
    // the content of the emitted line.
    const { result, lines } = runLoop([REFUSAL]);

    expect(result.backoffSleeps).toEqual([2000]);
    expect(lines).toHaveLength(1);
    const [line = ""] = lines;
    expect(line).toContain("budget_refused");
    expect(line).toContain("backing off 2000ms");
    // The fixture holds no secret-shaped tokens, so the payload fields are
    // expected to appear unchanged in the output.
    const expectedFragments = [
      '"cause":"agent"',
      '"resetAt":"2026-04-29T00:00:00.000Z"',
      '"consecutiveRefusals":1',
      '"backoffMs":2000',
    ];
    for (const fragment of expectedFragments) {
      expect(line).toContain(fragment);
    }
  });

  test("scrubSecrets is invoked with the structured payload (call signature check)", () => {
    // Wrap the real scrubber in a bun `mock` so calls and arguments are
    // recorded while the scrubbing behavior itself is unchanged.
    const spy = mock((s: string | null | undefined) => scrubSecrets(s));

    // Build the payload the same way the simulator's refusal branch does.
    const consecutiveRefusals = 1;
    const backoffMs = computeBudgetBackoffMs(consecutiveRefusals, BASE_POLL_MS);
    const refusalPayload = JSON.stringify({
      event: "budget_refused",
      cause: REFUSAL.cause,
      agentSpend: REFUSAL.agentSpend,
      agentBudget: REFUSAL.agentBudget,
      globalSpend: REFUSAL.globalSpend,
      globalBudget: REFUSAL.globalBudget,
      resetAt: REFUSAL.resetAt,
      consecutiveRefusals,
      backoffMs,
    });
    const scrubbed = spy(refusalPayload);

    expect(spy).toHaveBeenCalledTimes(1);
    expect(spy.mock.calls[0]![0]).toBe(refusalPayload);
    // String in, string out; nothing in the fixture should be redacted.
    expect(scrubbed).toBe(refusalPayload);
  });
});
@@ -0,0 +1,386 @@
1
+ // Phase 6: Codex USD recompute on POST /api/session-costs.
2
+ //
3
+ // When the worker reports `provider='codex'` and DB pricing rows exist for
4
+ // all three token classes at the lookup time, the API recomputes
5
+ // `totalCostUsd` from tokens × DB prices and tags the row as
6
+ // `costSource='pricing-table'`. If any class is missing a row, fall back to
7
+ // the worker-reported value with `costSource='harness'`.
8
+ // Claude / pi paths always trust harness USD (`costSource='harness'`).
9
+
10
+ import { afterAll, afterEach, beforeAll, describe, expect, test } from "bun:test";
11
+ import { unlink } from "node:fs/promises";
12
+ import {
13
+ createServer as createHttpServer,
14
+ type IncomingMessage,
15
+ type Server,
16
+ type ServerResponse,
17
+ } from "node:http";
18
+ import { closeDb, createAgent, getDb, initDb, insertPricingRow } from "../be/db";
19
+ import { handleCore } from "../http/core";
20
+ import { handleSessionData } from "../http/session-data";
21
+ import { getPathSegments, parseQueryParams } from "../http/utils";
22
+
23
// SQLite file used only by this suite; it is removed (with its -wal/-shm
// siblings) before the suite starts and again after it finishes.
const TEST_DB_PATH = "./test-session-costs-codex-recompute.sqlite";

// Key the test server is created with and that `authedFetch` sends as a Bearer token.
const API_KEY = "test-codex-recompute-secret";
25
+
26
/**
 * Deletes the file at `path` together with its `-wal` and `-shm` siblings.
 * Files that do not exist are skipped; any other unlink failure is rethrown.
 */
async function removeDbFiles(path: string): Promise<void> {
  const targets = [path, `${path}-wal`, `${path}-shm`];
  for (const target of targets) {
    try {
      await unlink(target);
    } catch (error) {
      const { code } = error as NodeJS.ErrnoException;
      // A missing file is the expected case on a clean checkout.
      if (code === "ENOENT") continue;
      throw error;
    }
  }
}
35
+
36
/**
 * Starts `server` listening on port 0 and resolves with the port it was
 * actually bound to.
 *
 * @throws Error("no port") when the server reports no address or a string address.
 */
async function listen(server: Server): Promise<number> {
  await new Promise<void>((ready) => {
    server.listen(0, ready);
  });
  const bound = server.address();
  if (bound !== null && typeof bound === "object") {
    return bound.port;
  }
  throw new Error("no port");
}
42
+
43
/**
 * Builds an HTTP server that routes each request first through `handleCore`
 * and then through `handleSessionData`, answering 404 when neither handles it.
 *
 * @param apiKey - Forwarded to `handleCore` on every request.
 */
function createTestServer(apiKey: string): Server {
  const onRequest = async (req: IncomingMessage, res: ServerResponse): Promise<void> => {
    const myAgentId = req.headers["x-agent-id"] as string | undefined;

    // `handleCore` goes first; a truthy result means the request is finished.
    if (await handleCore(req, res, myAgentId, apiKey)) return;

    const segments = getPathSegments(req.url || "");
    const query = parseQueryParams(req.url || "");
    const served = await handleSessionData(req, res, segments, query, myAgentId);
    if (served) return;

    res.writeHead(404);
    res.end("Not Found");
  };

  return createHttpServer(onRequest);
}
57
+
58
// Suite-wide state, assigned in `beforeAll`.
let server: Server;
let port: number;
let testAgent: { id: string };

// Start from a clean database file, register one agent, and bring the HTTP server up.
beforeAll(async () => {
  await removeDbFiles(TEST_DB_PATH);
  initDb(TEST_DB_PATH);
  testAgent = createAgent({ name: "codex-test", isLead: false, status: "idle" });
  server = createTestServer(API_KEY);
  port = await listen(server);
});

// Stop the server before closing the DB, then delete the database files.
afterAll(async () => {
  await new Promise<void>((closed) => {
    server.close(() => closed());
  });
  closeDb();
  await removeDbFiles(TEST_DB_PATH);
});

// Reset per-test data. Statements run in this order:
//   1. drop every recorded session cost;
//   2. drop pricing rows with effective_from > 0 (rows the tests insert;
//      presumably seed rows sit at effective_from = 0 — confirm against the migration);
//   3. drop any remaining pricing rows for the synthetic test model.
afterEach(() => {
  const db = getDb();
  const cleanupStatements = [
    "DELETE FROM session_costs",
    "DELETE FROM pricing WHERE effective_from > 0",
    "DELETE FROM pricing WHERE model = 'codex-test-synth'",
  ];
  for (const sql of cleanupStatements) {
    db.prepare(sql).run();
  }
});
84
+
85
/**
 * Issues a request against the local test server with the suite's Bearer
 * token and a JSON content type.
 *
 * @param path - Path (including any query string) appended to `http://localhost:<port>`.
 * @param init - Extra fetch options. Headers supplied here override the
 *   defaults and may be given in any `HeadersInit` form (plain object,
 *   `Headers` instance, or `[name, value]` pairs).
 * @returns The pending fetch response.
 */
function authedFetch(path: string, init: RequestInit = {}): Promise<Response> {
  const headers = new Headers({
    Authorization: `Bearer ${API_KEY}`,
    "Content-Type": "application/json",
  });
  // Object-spreading `init.headers` only works for plain objects: a `Headers`
  // instance spreads to nothing and a tuple array spreads to index keys.
  // Normalising through `Headers` keeps caller-supplied values in every form.
  new Headers(init.headers).forEach((value, name) => {
    headers.set(name, value);
  });
  return fetch(`http://localhost:${port}${path}`, { ...init, headers });
}
95
+
96
/** Where a stored USD figure came from, as reported in the response. */
type CostSource = "harness" | "pricing-table";

/** The subset of the created cost record that these tests inspect. */
interface CreatedCost {
  id: string;
  totalCostUsd: number;
  costSource: CostSource;
  model: string;
}

/** Response body of a successful `POST /api/session-costs`, as read by this suite. */
interface CreatedCostResponse {
  success: boolean;
  cost: CreatedCost;
}
105
+
106
+ describe("Phase 6 — POST /api/session-costs: Codex USD recompute", () => {
107
test("provider=codex with all three pricing rows present → recompute uses DB prices, costSource='pricing-table'", async () => {
  // Mid-range custom rates in USD per million tokens, inserted in this order.
  const rates = [
    ["input", 2.0],
    ["cached_input", 0.2],
    ["output", 10.0],
  ] as const;
  for (const [tokenClass, pricePerMillionUsd] of rates) {
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass,
      effectiveFrom: 1,
      pricePerMillionUsd,
    });
  }

  const payload = {
    sessionId: "codex-recompute-1",
    agentId: testAgent.id,
    // Deliberately absurd worker-reported value; the API should replace it.
    totalCostUsd: 999.99,
    inputTokens: 1_000_000, // total input, cached portion included
    cacheReadTokens: 200_000, // cached portion
    outputTokens: 500_000,
    model: "codex-test-synth",
    provider: "codex",
    durationMs: 1_000,
    numTurns: 1,
  };
  const response = await authedFetch(`/api/session-costs`, {
    method: "POST",
    body: JSON.stringify(payload),
  });
  expect(response.status).toBe(201);

  const { cost } = (await response.json()) as CreatedCostResponse;
  expect(cost.costSource).toBe("pricing-table");
  // uncached input = 1_000_000 - 200_000 = 800_000
  // total = (800_000 * 2.0 + 200_000 * 0.2 + 500_000 * 10.0) / 1_000_000
  //       = (1_600_000 + 40_000 + 5_000_000) / 1_000_000 = 6.64
  expect(cost.totalCostUsd).toBeCloseTo(6.64, 5);
});
155
+
156
test("provider=codex but a token class is missing → falls back to worker value, costSource='harness'", async () => {
  // Seed only input and cached_input; the absent output row should force the fallback.
  const partialRates = [
    ["input", 2.0],
    ["cached_input", 0.2],
  ] as const;
  for (const [tokenClass, pricePerMillionUsd] of partialRates) {
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass,
      effectiveFrom: 1,
      pricePerMillionUsd,
    });
  }

  const payload = {
    sessionId: "codex-fallback-1",
    agentId: testAgent.id,
    totalCostUsd: 1.23,
    inputTokens: 100,
    outputTokens: 50,
    model: "codex-test-synth",
    provider: "codex",
    durationMs: 1_000,
    numTurns: 1,
  };
  const response = await authedFetch(`/api/session-costs`, {
    method: "POST",
    body: JSON.stringify(payload),
  });
  expect(response.status).toBe(201);

  const { cost } = (await response.json()) as CreatedCostResponse;
  expect(cost.costSource).toBe("harness");
  // The worker-reported figure must come back untouched.
  expect(cost.totalCostUsd).toBe(1.23);
});
193
+
194
test("provider=claude records harness USD as-is regardless of DB pricing rows", async () => {
  // Claude costs are expected to pass through unchanged.
  const payload = {
    sessionId: "claude-passthrough-1",
    agentId: testAgent.id,
    totalCostUsd: 7.77,
    inputTokens: 100,
    outputTokens: 50,
    model: "sonnet-4",
    provider: "claude",
    durationMs: 1_000,
    numTurns: 1,
  };
  const response = await authedFetch(`/api/session-costs`, {
    method: "POST",
    body: JSON.stringify(payload),
  });
  expect(response.status).toBe(201);

  const { cost } = (await response.json()) as CreatedCostResponse;
  expect(cost.costSource).toBe("harness");
  expect(cost.totalCostUsd).toBe(7.77);
});
215
+
216
test("provider=pi records harness USD as-is regardless of DB pricing rows", async () => {
  // Same pass-through expectation as the claude case, for the pi provider.
  const payload = {
    sessionId: "pi-passthrough-1",
    agentId: testAgent.id,
    totalCostUsd: 0.42,
    inputTokens: 10,
    outputTokens: 5,
    model: "openrouter/google/gemini-3-flash-preview",
    provider: "pi",
    durationMs: 1_000,
    numTurns: 1,
  };
  const response = await authedFetch(`/api/session-costs`, {
    method: "POST",
    body: JSON.stringify(payload),
  });
  expect(response.status).toBe(201);

  const { cost } = (await response.json()) as CreatedCostResponse;
  expect(cost.costSource).toBe("harness");
  expect(cost.totalCostUsd).toBe(0.42);
});
236
+
237
test("provider field omitted → no recompute, costSource='harness' (back-compat)", async () => {
  // Legacy call shape: no `provider`, no `model`, no token counts.
  const legacyPayload = {
    sessionId: "legacy-1",
    agentId: testAgent.id,
    totalCostUsd: 1.0,
    durationMs: 1_000,
    numTurns: 1,
  };
  const response = await authedFetch(`/api/session-costs`, {
    method: "POST",
    body: JSON.stringify(legacyPayload),
  });
  expect(response.status).toBe(201);

  const { cost } = (await response.json()) as CreatedCostResponse;
  expect(cost.costSource).toBe("harness");
  expect(cost.totalCostUsd).toBe(1.0);
});
254
+
255
// Checks that the price used for a recompute is chosen by the session
// cost's `createdAt`, not by whichever pricing row is newest.
describe("historical correctness — older session_cost createdAt picks older effective_from", () => {
  // Anchor T0 well in the past so newer rows can be placed around it.
  const T0 = 1_700_000_000_000; // 2023-11-14ish when read as epoch milliseconds
  const PRICE_A = 1.0;
  const PRICE_B = 2.0;

  /**
   * Seeds price A for `input` at T0, plus zero-priced `cached_input` and
   * `output` rows at the same instant so all three classes resolve.
   */
  function seedBaselinePricing(): void {
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass: "input",
      effectiveFrom: T0,
      pricePerMillionUsd: PRICE_A,
    });
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass: "cached_input",
      effectiveFrom: T0,
      pricePerMillionUsd: 0,
    });
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass: "output",
      effectiveFrom: T0,
      pricePerMillionUsd: 0,
    });
  }

  /** Adds a newer `input` row (price B) that takes effect at T0+100. */
  function seedNewerInputPrice(): void {
    insertPricingRow({
      provider: "codex",
      model: "codex-test-synth",
      tokenClass: "input",
      effectiveFrom: T0 + 100,
      pricePerMillionUsd: PRICE_B,
    });
  }

  /** Posts a 1M-input-token codex cost stamped with the given `createdAt`. */
  function postCost(opts: { sessionId: string; createdAt: number }) {
    return authedFetch(`/api/session-costs`, {
      method: "POST",
      body: JSON.stringify({
        sessionId: opts.sessionId,
        agentId: testAgent.id,
        totalCostUsd: 999.99, // worker-reported, expected to be overwritten
        inputTokens: 1_000_000, // 1M input total
        cacheReadTokens: 0, // no cache for simplicity
        outputTokens: 0,
        model: "codex-test-synth",
        provider: "codex",
        createdAt: opts.createdAt,
        durationMs: 1_000,
        numTurns: 1,
      }),
    });
  }

  /**
   * Posts a cost and returns the parsed body. The status is asserted first,
   * as the sibling tests do, so a rejected request fails with a clear
   * message instead of a property access on an unexpected body.
   */
  async function postAndParse(opts: {
    sessionId: string;
    createdAt: number;
  }): Promise<CreatedCostResponse> {
    const res = await postCost(opts);
    expect(res.status).toBe(201);
    return (await res.json()) as CreatedCostResponse;
  }

  test("createdAt = T0+1 → uses price A (the only row at that time)", async () => {
    seedBaselinePricing();

    const body = await postAndParse({ sessionId: "hist-1", createdAt: T0 + 1 });
    expect(body.cost.costSource).toBe("pricing-table");
    expect(body.cost.totalCostUsd).toBeCloseTo(1.0, 5); // 1M * 1.0 / 1M
  });

  test("createdAt = T0+200 with new row at T0+100 → uses price B", async () => {
    seedBaselinePricing();
    // Newer input row supersedes A from T0+100 onward.
    seedNewerInputPrice();

    const body = await postAndParse({ sessionId: "hist-2", createdAt: T0 + 200 });
    expect(body.cost.costSource).toBe("pricing-table");
    expect(body.cost.totalCostUsd).toBeCloseTo(2.0, 5); // 1M * 2.0 / 1M
  });

  test("createdAt = T0+50 with new row at T0+100 → STILL uses price A (older effective_from)", async () => {
    seedBaselinePricing();
    // Newer row exists, but the session_cost is older than T0+100.
    seedNewerInputPrice();

    const body = await postAndParse({ sessionId: "hist-3", createdAt: T0 + 50 });
    expect(body.cost.costSource).toBe("pricing-table");
    // Older effective_from = T0 wins because session_cost.createdAt = T0+50 < T0+100.
    expect(body.cost.totalCostUsd).toBeCloseTo(1.0, 5);
  });
});
386
+ });
@@ -67,6 +67,13 @@ export const registerPollTaskTool = (server: McpServer) => {
67
67
  const agentId = requestInfo.agentId;
68
68
  const now = new Date();
69
69
  const maxTime = addMinutes(now, MAX_POLL_DURATION_MS / 60000);
70
+ // Phase 3 (D-R3): when a budget refusal occurs, the empty-poll counter
71
+ // must NOT advance — refused ≠ empty. The MCP `poll-task` tool is NOT
72
+ // gated by `canClaim` in V1 (per plan §"What We're NOT Doing" — D-R1),
73
+ // so this is structural / forward-compat plumbing: future revisions
74
+ // that gate poll-task flip this to true at the refusal site instead of
75
+ // touching the bookkeeping path below.
76
+ const wasBudgetRefused: boolean = false;
70
77
 
71
78
  const agent = getAgentById(agentId);
72
79
  if (!agent) {
@@ -174,8 +181,12 @@ export const registerPollTaskTool = (server: McpServer) => {
174
181
 
175
182
  const waitedForSeconds = Math.round((Date.now() - now.getTime()) / 1000);
176
183
 
177
- // Increment empty poll count and check if agent should exit
178
- const newCount = incrementEmptyPollCount(agentId);
184
+ // Increment empty poll count and check if agent should exit.
185
+ // Refused empty (D-R3) — skip bookkeeping when a budget refusal
186
+ // occurred during this poll window.
187
+ const newCount = wasBudgetRefused
188
+ ? (getAgentById(agentId)?.emptyPollCount ?? 0)
189
+ : incrementEmptyPollCount(agentId);
179
190
  const shouldExit = newCount >= MAX_EMPTY_POLLS;
180
191
 
181
192
  // If no task was found within the time limit