npm - @desplega.ai/agent-swarm - Versions diffs - 1.71.2 → 1.72.0 - Mend

@desplega.ai/agent-swarm 1.71.2 → 1.72.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/README.md +3 -2
package/openapi.json +994 -62
package/package.json +2 -1
package/src/be/budget-admission.ts +121 -0
package/src/be/budget-refusal-notify.ts +145 -0
package/src/be/db.ts +488 -5
package/src/be/migrations/044_provider_meta.sql +2 -0
package/src/be/migrations/046_budgets_and_pricing.sql +87 -0
package/src/be/migrations/047_session_costs_cost_source.sql +16 -0
package/src/cli.tsx +22 -1
package/src/commands/claude-managed-setup.ts +687 -0
package/src/commands/codex-login.ts +1 -1
package/src/commands/runner.ts +175 -28
package/src/commands/templates.ts +10 -6
package/src/http/budgets.ts +219 -0
package/src/http/index.ts +6 -0
package/src/http/integrations.ts +134 -0
package/src/http/poll.ts +161 -3
package/src/http/pricing.ts +245 -0
package/src/http/session-data.ts +54 -6
package/src/http/tasks.ts +23 -2
package/src/prompts/base-prompt.ts +103 -73
package/src/prompts/session-templates.ts +43 -0
package/src/providers/claude-adapter.ts +3 -1
package/src/providers/claude-managed-adapter.ts +871 -0
package/src/providers/claude-managed-models.ts +117 -0
package/src/providers/claude-managed-swarm-events.ts +77 -0
package/src/providers/codex-adapter.ts +3 -1
package/src/providers/codex-skill-resolver.ts +10 -0
package/src/providers/codex-swarm-events.ts +20 -161
package/src/providers/devin-adapter.ts +894 -0
package/src/providers/devin-api.ts +207 -0
package/src/providers/devin-playbooks.ts +91 -0
package/src/providers/devin-skill-resolver.ts +113 -0
package/src/providers/index.ts +10 -1
package/src/providers/pi-mono-adapter.ts +3 -1
package/src/providers/swarm-events-shared.ts +262 -0
package/src/providers/types.ts +26 -1
package/src/tests/base-prompt.test.ts +199 -0
package/src/tests/budget-admission.test.ts +339 -0
package/src/tests/budget-claim-gate.test.ts +288 -0
package/src/tests/budget-refusal-notification.test.ts +324 -0
package/src/tests/budgets-routes.test.ts +331 -0
package/src/tests/claude-managed-adapter.test.ts +1301 -0
package/src/tests/claude-managed-setup.test.ts +325 -0
package/src/tests/devin-adapter.test.ts +677 -0
package/src/tests/devin-api.test.ts +339 -0
package/src/tests/integrations-http.test.ts +211 -0
package/src/tests/migration-046-budgets.test.ts +327 -0
package/src/tests/pricing-routes.test.ts +315 -0
package/src/tests/prompt-template-remaining.test.ts +4 -0
package/src/tests/prompt-template-session.test.ts +2 -2
package/src/tests/provider-adapter.test.ts +1 -1
package/src/tests/runner-budget-refused.test.ts +271 -0
package/src/tests/session-costs-codex-recompute.test.ts +386 -0
package/src/tools/poll-task.ts +13 -2
package/src/tools/task-action.ts +92 -2
package/src/tools/templates.ts +29 -0
package/src/types.ts +116 -0
package/src/utils/budget-backoff.ts +34 -0
package/src/utils/credentials.ts +4 -0
package/src/utils/provider-metadata.ts +9 -0

package/src/tests/runner-budget-refused.test.ts ADDED Viewed

@@ -0,0 +1,271 @@
+// Phase 4 — worker dispatch tests for `budget_refused` triggers.
+//
+// The full runner poll loop in `src/commands/runner.ts:2926+` is hard to
+// unit-test directly (it boots adapters, opens HTTP, etc.). The back-off
+// computation is therefore extracted into a pure helper —
+// `computeBudgetBackoffMs` — which we exercise here in isolation. We also
+// run a small in-test simulation of the loop's *back-off-state machine*
+// (the `consecutiveBudgetRefusals` counter + reset semantics) against a
+// stubbed `pollForTrigger` to assert the behaviors mandated by the plan:
+//
+//   - back-off doubles per consecutive refusal up to the 5-minute cap;
+//   - any non-refused trigger resets the counter (next refusal restarts
+//     at base interval);
+//   - refusals do *not* increment whatever empty-poll counter the loop
+//     maintains (we simulate one alongside the back-off counter and assert
+//     it stays at 0);
+//   - the structured log payload passes through `scrubSecrets` at egress.
+import { describe, expect, mock, test } from "bun:test";
+import { BUDGET_BACKOFF_CAP_MS, computeBudgetBackoffMs } from "../utils/budget-backoff";
+import { scrubSecrets } from "../utils/secret-scrubber";
+// ─── computeBudgetBackoffMs ────────────────────────────────────────────────
+describe("computeBudgetBackoffMs", () => {
+  // Poll interval shared by every case that does not need a custom base.
+  const BASE_MS = 2000;
+  test("doubles per consecutive refusal starting at basePollMs", () => {
+    // Entry i is the expected sleep for refusal #(i + 1).
+    const expectedSleeps = [2000, 4000, 8000, 16_000, 32_000, 64_000, 128_000, 256_000];
+    expectedSleeps.forEach((expectedMs, index) => {
+      expect(computeBudgetBackoffMs(index + 1, BASE_MS)).toBe(expectedMs);
+    });
+  });
+  test("caps at 5 minutes regardless of how many refusals", () => {
+    // Refusal #9 would be 2000 * 2^8 = 512000, which exceeds the 300000 cap.
+    for (const streak of [9, 20, 1000]) {
+      expect(computeBudgetBackoffMs(streak, BASE_MS)).toBe(BUDGET_BACKOFF_CAP_MS);
+    }
+  });
+  test("first refusal sleeps exactly basePollMs (no doubling yet)", () => {
+    for (const basePollMs of [100, 5000]) {
+      expect(computeBudgetBackoffMs(1, basePollMs)).toBe(basePollMs);
+    }
+  });
+  test("BUDGET_BACKOFF_CAP_MS is exactly 5 minutes", () => {
+    const fiveMinutesMs = 5 * 60 * 1000;
+    expect(BUDGET_BACKOFF_CAP_MS).toBe(fiveMinutesMs);
+  });
+  test("guards against pathological non-positive inputs", () => {
+    // A streak of 0 or below is expected to behave like the first refusal.
+    for (const streak of [0, -5]) {
+      expect(computeBudgetBackoffMs(streak, BASE_MS)).toBe(BASE_MS);
+    }
+  });
+});
+// ─── back-off state machine simulation ─────────────────────────────────────
+//
+// Re-implements the relevant slice of the poll loop so we can assert the
+// counter semantics without booting the full runner. If you change the
+// behavior in `runner.ts`, mirror it here. The logic must stay byte-equal
+// to the block in `src/commands/runner.ts` (search for
+// `consecutiveBudgetRefusals` there).
+/** Trigger kinds the simulated loop distinguishes; only `budget_refused` backs off. */
+type LoopTriggerType =
+  | "task_assigned"
+  | "task_offered"
+  | "unread_mentions"
+  | "pool_tasks_available"
+  | "channel_activity"
+  | "budget_refused";
+/**
+ * Minimal stand-in for what the stubbed `pollForTrigger` yields. The optional
+ * fields are the refusal details that `simulatePollLoop` copies into its
+ * logged JSON payload.
+ */
+interface LoopTrigger {
+  type: LoopTriggerType;
+  cause?: "agent" | "global";
+  agentSpend?: number;
+  agentBudget?: number;
+  globalSpend?: number;
+  globalBudget?: number;
+  resetAt?: string;
+}
+/** Outcome of one `simulatePollLoop` run. */
+interface SimResult {
+  /** Sleeps recorded on each `budget_refused` outcome, in order. */
+  backoffSleeps: number[];
+  /** Value of the refusal-streak counter when the trigger sequence ended. */
+  consecutiveBudgetRefusals: number;
+  /** Independent empty-poll counter — must NOT be bumped by refusals. */
+  emptyPollCount: number;
+  /** Every line handed to `log` by the back-off branch, in emission order. */
+  logLines: string[];
+  /** Number of times the "dispatch normally" branch was taken. */
+  dispatchedTriggers: number;
+}
+/**
+ * Mirrors the back-off slice of the runner poll loop.
+ *
+ * @param triggers - What `pollForTrigger` returns on consecutive iterations
+ *   (null = no trigger inside the long-poll window).
+ * @param basePollMs - Base interval passed to `computeBudgetBackoffMs`.
+ * @param log - Sink that receives each scrubbed refusal log line.
+ * @returns Final counters, the back-off sleeps that would have been taken,
+ *   and the log lines that were emitted.
+ */
+function simulatePollLoop(
+  triggers: Array<LoopTrigger | null>,
+  basePollMs: number,
+  log: (line: string) => void,
+): SimResult {
+  let consecutiveBudgetRefusals = 0;
+  let emptyPollCount = 0;
+  const backoffSleeps: number[] = [];
+  // Kept alongside the `log` callback so `SimResult.logLines` reflects what
+  // was actually emitted instead of always being empty.
+  const logLines: string[] = [];
+  let dispatchedTriggers = 0;
+  for (const trigger of triggers) {
+    if (trigger) {
+      if (trigger.type === "budget_refused") {
+        consecutiveBudgetRefusals++;
+        const backoffMs = computeBudgetBackoffMs(consecutiveBudgetRefusals, basePollMs);
+        const refusalPayload = JSON.stringify({
+          event: "budget_refused",
+          cause: trigger.cause,
+          agentSpend: trigger.agentSpend,
+          agentBudget: trigger.agentBudget,
+          globalSpend: trigger.globalSpend,
+          globalBudget: trigger.globalBudget,
+          resetAt: trigger.resetAt,
+          consecutiveRefusals: consecutiveBudgetRefusals,
+          backoffMs,
+        });
+        // Only the JSON payload goes through the scrubber; the prefix is static.
+        const line = `[role] budget_refused — backing off ${backoffMs}ms: ${scrubSecrets(refusalPayload)}`;
+        logLines.push(line);
+        log(line);
+        backoffSleeps.push(backoffMs);
+        // `continue` — DO NOT increment empty-poll count.
+        continue;
+      }
+      // Any non-refused trigger ends the refusal streak.
+      consecutiveBudgetRefusals = 0;
+      dispatchedTriggers++;
+    } else {
+      // Empty poll — bumps the empty-poll counter but does not reset
+      // back-off state (refusals are about budget, not silence).
+      emptyPollCount++;
+    }
+  }
+  return {
+    backoffSleeps,
+    consecutiveBudgetRefusals,
+    emptyPollCount,
+    logLines,
+    dispatchedTriggers,
+  };
+}
+// ─── Behavior tests against the simulated loop ─────────────────────────────
+// Fixture: an agent-level refusal carrying the detail fields the loop logs.
+const REFUSAL: LoopTrigger = {
+  type: "budget_refused",
+  cause: "agent",
+  agentSpend: 0.05,
+  agentBudget: 0.01,
+  resetAt: "2026-04-29T00:00:00.000Z",
+};
+// Fixture: any non-refused trigger; it takes the normal dispatch branch.
+const TASK: LoopTrigger = { type: "task_assigned" };
+describe("poll-loop back-off state machine", () => {
+  const BASE_POLL_MS = 2000;
+  /** Runs the simulator at the shared base interval and captures emitted log lines. */
+  const runLoop = (sequence: Array<LoopTrigger | null>) => {
+    const lines: string[] = [];
+    const result = simulatePollLoop(sequence, BASE_POLL_MS, (line) => lines.push(line));
+    return { result, lines };
+  };
+  test("doubles up to but not past 5 min on a long refusal streak", () => {
+    // Nine refusals in a row: 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s, then the 300s cap.
+    const streak: Array<LoopTrigger | null> = Array.from({ length: 9 }, () => REFUSAL);
+    const { result } = runLoop(streak);
+    const expectedSleeps = [
+      2000,
+      4000,
+      8000,
+      16_000,
+      32_000,
+      64_000,
+      128_000,
+      256_000,
+      BUDGET_BACKOFF_CAP_MS,
+    ];
+    expect(result.backoffSleeps).toEqual(expectedSleeps);
+    // No recorded sleep may exceed the cap.
+    result.backoffSleeps.forEach((sleepMs) => {
+      expect(sleepMs).toBeLessThanOrEqual(BUDGET_BACKOFF_CAP_MS);
+    });
+    expect(result.consecutiveBudgetRefusals).toBe(9);
+  });
+  test("resets to 0 after a non-refused trigger; subsequent refusal restarts at base", () => {
+    // The task in the middle ends the streak, so the last refusal sleeps the base interval.
+    const { result } = runLoop([REFUSAL, REFUSAL, TASK, REFUSAL]);
+    expect(result.backoffSleeps).toEqual([2000, 4000, 2000]);
+    expect(result.dispatchedTriggers).toBe(1);
+    // Only the single refusal after the reset is still counted.
+    expect(result.consecutiveBudgetRefusals).toBe(1);
+  });
+  test("empty-poll counter is unchanged across refusals", () => {
+    // Five consecutive refusals and no empty polls: the empty-poll counter stays at 0.
+    const { result } = runLoop([REFUSAL, REFUSAL, REFUSAL, REFUSAL, REFUSAL]);
+    expect(result.emptyPollCount).toBe(0);
+    expect(result.backoffSleeps).toHaveLength(5);
+  });
+  test("empty polls (null triggers) bump empty-poll counter but not back-off", () => {
+    // Three nulls around one refusal: three empty polls, one base-interval sleep.
+    const { result } = runLoop([null, null, REFUSAL, null]);
+    expect(result.emptyPollCount).toBe(3);
+    expect(result.backoffSleeps).toEqual([2000]);
+  });
+  test("structured refusal log goes through scrubSecrets at egress", () => {
+    // `scrubSecrets` is not replaced here (that would need `mock.module`), so
+    // this pins the shape of the emitted line. The simulator builds that line
+    // from `scrubSecrets(payload)`; the fixture holds no secret-shaped tokens,
+    // so the payload is expected to come through unchanged.
+    const { result, lines } = runLoop([REFUSAL]);
+    expect(result.backoffSleeps).toEqual([2000]);
+    expect(lines).toHaveLength(1);
+    const [line] = lines;
+    const expectedFragments = [
+      "budget_refused",
+      "backing off 2000ms",
+      '"cause":"agent"',
+      '"resetAt":"2026-04-29T00:00:00.000Z"',
+      '"consecutiveRefusals":1',
+      '"backoffMs":2000',
+    ];
+    for (const fragment of expectedFragments) {
+      expect(line).toContain(fragment);
+    }
+  });
+  test("scrubSecrets is invoked with the structured payload (call signature check)", () => {
+    // Wrap the real scrubber in a bun `mock` so the call count and argument
+    // can be inspected without changing what it returns.
+    const spy = mock((input: string | null | undefined) => scrubSecrets(input));
+    // Build the payload the same way the simulated back-off branch does.
+    const consecutiveRefusals = 1;
+    const backoffMs = computeBudgetBackoffMs(consecutiveRefusals, 2000);
+    const refusalPayload = JSON.stringify({
+      event: "budget_refused",
+      cause: REFUSAL.cause,
+      agentSpend: REFUSAL.agentSpend,
+      agentBudget: REFUSAL.agentBudget,
+      globalSpend: REFUSAL.globalSpend,
+      globalBudget: REFUSAL.globalBudget,
+      resetAt: REFUSAL.resetAt,
+      consecutiveRefusals,
+      backoffMs,
+    });
+    const scrubbed = spy(refusalPayload);
+    expect(spy).toHaveBeenCalledTimes(1);
+    expect(spy.mock.calls[0]?.[0]).toBe(refusalPayload);
+    // The fixture contains nothing secret-shaped, so the payload round-trips.
+    expect(scrubbed).toBe(refusalPayload);
+  });
+});

package/src/tests/session-costs-codex-recompute.test.ts ADDED Viewed

@@ -0,0 +1,386 @@
+// Phase 6: Codex USD recompute on POST /api/session-costs.
+//
+// When the worker reports `provider='codex'` and DB pricing rows exist for
+// all three token classes at the lookup time, the API recomputes
+// `totalCostUsd` from tokens × DB prices and tags the row as
+// `costSource='pricing-table'`. If any class is missing a row, fall back to
+// the worker-reported value with `costSource='harness'`.
+// Claude / pi paths always trust harness USD (`costSource='harness'`).
+import { afterAll, afterEach, beforeAll, describe, expect, test } from "bun:test";
+import { unlink } from "node:fs/promises";
+import {
+  createServer as createHttpServer,
+  type IncomingMessage,
+  type Server,
+  type ServerResponse,
+} from "node:http";
+import { closeDb, createAgent, getDb, initDb, insertPricingRow } from "../be/db";
+import { handleCore } from "../http/core";
+import { handleSessionData } from "../http/session-data";
+import { getPathSegments, parseQueryParams } from "../http/utils";
+const TEST_DB_PATH = "./test-session-costs-codex-recompute.sqlite";
+const API_KEY = "test-codex-recompute-secret";
+/**
+ * Deletes the SQLite file at `path` plus its `-wal` and `-shm` companions,
+ * one after another. A file that does not exist is skipped; any other
+ * failure is rethrown.
+ */
+async function removeDbFiles(path: string): Promise<void> {
+  const candidates = ["", "-wal", "-shm"].map((suffix) => path + suffix);
+  for (const file of candidates) {
+    try {
+      await unlink(file);
+    } catch (error) {
+      const { code } = error as NodeJS.ErrnoException;
+      if (code === "ENOENT") continue;
+      throw error;
+    }
+  }
+}
+/**
+ * Starts `server` on an OS-assigned port (port 0) and resolves with that port.
+ *
+ * Rejects if the server emits `error` before it starts listening, so a failed
+ * bind fails the suite instead of leaving the promise pending forever.
+ *
+ * @throws Error("no port") when the bound address is not a TCP address object.
+ */
+async function listen(server: Server): Promise<number> {
+  await new Promise<void>((resolve, reject) => {
+    server.once("error", reject);
+    server.listen(0, () => {
+      // Listening succeeded; later errors must not hit an already-settled promise.
+      server.off("error", reject);
+      resolve();
+    });
+  });
+  const addr = server.address();
+  if (!addr || typeof addr === "string") throw new Error("no port");
+  return addr.port;
+}
+/**
+ * Builds an HTTP server that tries `handleCore` first, then
+ * `handleSessionData`, and answers 404 "Not Found" when neither reports the
+ * request as handled. The agent id is read from the `x-agent-id` header.
+ */
+function createTestServer(apiKey: string): Server {
+  const onRequest = async (req: IncomingMessage, res: ServerResponse): Promise<void> => {
+    const myAgentId = req.headers["x-agent-id"] as string | undefined;
+    if (await handleCore(req, res, myAgentId, apiKey)) return;
+    const served = await handleSessionData(
+      req,
+      res,
+      getPathSegments(req.url || ""),
+      parseQueryParams(req.url || ""),
+      myAgentId,
+    );
+    if (served) return;
+    res.writeHead(404);
+    res.end("Not Found");
+  };
+  return createHttpServer(onRequest);
+}
+// Suite-wide state, assigned once in `beforeAll`.
+let server: Server;
+let port: number;
+let testAgent: { id: string };
+beforeAll(async () => {
+  // Start from a clean database file, then register one agent and boot the server.
+  await removeDbFiles(TEST_DB_PATH);
+  initDb(TEST_DB_PATH);
+  testAgent = createAgent({ name: "codex-test", isLead: false, status: "idle" });
+  server = createTestServer(API_KEY);
+  port = await listen(server);
+});
+afterAll(async () => {
+  // Stop accepting connections before closing the DB and deleting its files.
+  await new Promise<void>((resolve) => {
+    server.close(() => resolve());
+  });
+  closeDb();
+  await removeDbFiles(TEST_DB_PATH);
+});
+afterEach(() => {
+  const db = getDb();
+  const cleanupStatements = [
+    // Cost rows written by the test that just ran.
+    "DELETE FROM session_costs",
+    // Pricing rows with a positive effective_from (every row these tests insert).
+    "DELETE FROM pricing WHERE effective_from > 0",
+    // Any remaining rows for the synthetic model, whatever their effective_from.
+    "DELETE FROM pricing WHERE model = 'codex-test-synth'",
+  ];
+  for (const sql of cleanupStatements) {
+    db.prepare(sql).run();
+  }
+});
+/**
+ * `fetch` against the test server with the bearer token and a JSON content
+ * type preset. Header entries spread from `init.headers` come last, so they
+ * override the presets.
+ */
+function authedFetch(path: string, init: RequestInit = {}): Promise<Response> {
+  const url = `http://localhost:${port}${path}`;
+  const headers = {
+    Authorization: `Bearer ${API_KEY}`,
+    "Content-Type": "application/json",
+    ...(init.headers ?? {}),
+  };
+  return fetch(url, { ...init, headers });
+}
+/** The two `costSource` tags these tests assert on. */
+type CostSource = "harness" | "pricing-table";
+/** The subset of the POST /api/session-costs response body that the tests read. */
+interface CreatedCostResponse {
+  success: boolean;
+  cost: {
+    id: string;
+    totalCostUsd: number;
+    costSource: CostSource;
+    model: string;
+  };
+}
+describe("Phase 6 — POST /api/session-costs: Codex USD recompute", () => {
+  // Synthetic model name; `afterEach` deletes every pricing row that uses it.
+  const SYNTH_MODEL = "codex-test-synth";
+  /** Inserts one codex pricing row for the synthetic model. */
+  function seedCodexPrice(
+    tokenClass: "input" | "cached_input" | "output",
+    effectiveFrom: number,
+    pricePerMillionUsd: number,
+  ): void {
+    insertPricingRow({
+      provider: "codex",
+      model: SYNTH_MODEL,
+      tokenClass,
+      effectiveFrom,
+      pricePerMillionUsd,
+    });
+  }
+  /** POSTs a session cost for the test agent; `fields` supplies everything case-specific. */
+  function postSessionCost(fields: Record<string, unknown>): Promise<Response> {
+    return authedFetch(`/api/session-costs`, {
+      method: "POST",
+      body: JSON.stringify({
+        agentId: testAgent.id,
+        durationMs: 1_000,
+        numTurns: 1,
+        ...fields,
+      }),
+    });
+  }
+  /** Asserts the row was created (HTTP 201) and returns the `cost` object from the body. */
+  async function readCreatedCost(res: Response): Promise<CreatedCostResponse["cost"]> {
+    expect(res.status).toBe(201);
+    const body = (await res.json()) as CreatedCostResponse;
+    return body.cost;
+  }
+  test("provider=codex with all three pricing rows present → recompute uses DB prices, costSource='pricing-table'", async () => {
+    // Mid-range custom rates: input=2.0/M, cached=0.2/M, output=10.0/M
+    seedCodexPrice("input", 1, 2.0);
+    seedCodexPrice("cached_input", 1, 0.2);
+    seedCodexPrice("output", 1, 10.0);
+    const res = await postSessionCost({
+      sessionId: "codex-recompute-1",
+      // Worker-reported value the API is expected to OVERWRITE.
+      totalCostUsd: 999.99,
+      inputTokens: 1_000_000, // 1M total input
+      cacheReadTokens: 200_000, // 200k cached
+      outputTokens: 500_000, // 500k output
+      model: SYNTH_MODEL,
+      provider: "codex",
+    });
+    const cost = await readCreatedCost(res);
+    expect(cost.costSource).toBe("pricing-table");
+    // uncached = 1_000_000 - 200_000 = 800_000
+    // cost = (800_000 * 2.0 + 200_000 * 0.2 + 500_000 * 10.0) / 1_000_000
+    //      = (1_600_000 + 40_000 + 5_000_000) / 1_000_000 = 6.64
+    expect(cost.totalCostUsd).toBeCloseTo(6.64, 5);
+  });
+  test("provider=codex but a token class is missing → falls back to worker value, costSource='harness'", async () => {
+    // Only seed input + cached_input. Missing output forces fallback.
+    seedCodexPrice("input", 1, 2.0);
+    seedCodexPrice("cached_input", 1, 0.2);
+    const res = await postSessionCost({
+      sessionId: "codex-fallback-1",
+      totalCostUsd: 1.23,
+      inputTokens: 100,
+      outputTokens: 50,
+      model: SYNTH_MODEL,
+      provider: "codex",
+    });
+    const cost = await readCreatedCost(res);
+    expect(cost.costSource).toBe("harness");
+    // Worker value preserved verbatim.
+    expect(cost.totalCostUsd).toBe(1.23);
+  });
+  // Non-codex providers: the worker-reported USD must come back untouched.
+  const passthroughCases = [
+    {
+      provider: "claude",
+      sessionId: "claude-passthrough-1",
+      totalCostUsd: 7.77,
+      inputTokens: 100,
+      outputTokens: 50,
+      model: "sonnet-4",
+    },
+    {
+      provider: "pi",
+      sessionId: "pi-passthrough-1",
+      totalCostUsd: 0.42,
+      inputTokens: 10,
+      outputTokens: 5,
+      model: "openrouter/google/gemini-3-flash-preview",
+    },
+  ];
+  for (const passthrough of passthroughCases) {
+    test(`provider=${passthrough.provider} records harness USD as-is regardless of DB pricing rows`, async () => {
+      const res = await postSessionCost(passthrough);
+      const cost = await readCreatedCost(res);
+      expect(cost.costSource).toBe("harness");
+      expect(cost.totalCostUsd).toBe(passthrough.totalCostUsd);
+    });
+  }
+  test("provider field omitted → no recompute, costSource='harness' (back-compat)", async () => {
+    // No `provider` field at all (legacy call shape). Expect harness path.
+    const res = await postSessionCost({ sessionId: "legacy-1", totalCostUsd: 1.0 });
+    const cost = await readCreatedCost(res);
+    expect(cost.costSource).toBe("harness");
+    expect(cost.totalCostUsd).toBe(1.0);
+  });
+  describe("historical correctness — older session_cost createdAt picks older effective_from", () => {
+    // Anchor T0 well in the past so we can place newer rows around it.
+    const T0 = 1_700_000_000_000; // 2023-11-14ish
+    const PRICE_A = 1.0;
+    const PRICE_B = 2.0;
+    /** Seeds all three token classes at T0: input at price A, the other two at 0. */
+    function seedBaselineAtT0(): void {
+      seedCodexPrice("input", T0, PRICE_A);
+      seedCodexPrice("cached_input", T0, 0);
+      seedCodexPrice("output", T0, 0);
+    }
+    /** Posts a 1M-input-token codex cost stamped with the given `createdAt`. */
+    function postCost(opts: { sessionId: string; createdAt: number }): Promise<Response> {
+      return postSessionCost({
+        sessionId: opts.sessionId,
+        totalCostUsd: 999.99, // worker-reported, expected to be overwritten
+        inputTokens: 1_000_000, // 1M input total
+        cacheReadTokens: 0, // no cache for simplicity
+        outputTokens: 0,
+        model: SYNTH_MODEL,
+        provider: "codex",
+        createdAt: opts.createdAt,
+      });
+    }
+    test("createdAt = T0+1 → uses price A (the only row at that time)", async () => {
+      seedBaselineAtT0();
+      const res = await postCost({ sessionId: "hist-1", createdAt: T0 + 1 });
+      const cost = await readCreatedCost(res);
+      expect(cost.costSource).toBe("pricing-table");
+      expect(cost.totalCostUsd).toBeCloseTo(1.0, 5); // 1M * 1.0 / 1M
+    });
+    test("createdAt = T0+200 with new row at T0+100 → uses price B", async () => {
+      seedBaselineAtT0();
+      // Newer input row supersedes A from T0+100 onward.
+      seedCodexPrice("input", T0 + 100, PRICE_B);
+      const res = await postCost({ sessionId: "hist-2", createdAt: T0 + 200 });
+      const cost = await readCreatedCost(res);
+      expect(cost.costSource).toBe("pricing-table");
+      expect(cost.totalCostUsd).toBeCloseTo(2.0, 5); // 1M * 2.0 / 1M
+    });
+    test("createdAt = T0+50 with new row at T0+100 → STILL uses price A (older effective_from)", async () => {
+      seedBaselineAtT0();
+      // Newer row exists, but the session_cost is older than T0+100.
+      seedCodexPrice("input", T0 + 100, PRICE_B);
+      const res = await postCost({ sessionId: "hist-3", createdAt: T0 + 50 });
+      const cost = await readCreatedCost(res);
+      expect(cost.costSource).toBe("pricing-table");
+      // Older effective_from = T0 wins because session_cost.createdAt = T0+50 < T0+100.
+      expect(cost.totalCostUsd).toBeCloseTo(1.0, 5);
+    });
+  });
+});

package/src/tools/poll-task.ts CHANGED Viewed

@@ -67,6 +67,13 @@ export const registerPollTaskTool = (server: McpServer) => {
       const agentId = requestInfo.agentId;
       const now = new Date();
       const maxTime = addMinutes(now, MAX_POLL_DURATION_MS / 60000);
+      // Phase 3 (D-R3): when a budget refusal occurs, the empty-poll counter
+      // must NOT advance — refused ≠ empty. The MCP `poll-task` tool is NOT
+      // gated by `canClaim` in V1 (per plan §"What We're NOT Doing" — D-R1),
+      // so this is structural / forward-compat plumbing: future revisions
+      // that gate poll-task flip this to true at the refusal site instead of
+      // touching the bookkeeping path below.
+      const wasBudgetRefused: boolean = false;
       const agent = getAgentById(agentId);
       if (!agent) {
@@ -174,8 +181,12 @@ export const registerPollTaskTool = (server: McpServer) => {
       const waitedForSeconds = Math.round((Date.now() - now.getTime()) / 1000);
-      // Increment empty poll count and check if agent should exit
-      const newCount = incrementEmptyPollCount(agentId);
+      // Increment empty poll count and check if agent should exit.
+      // Refused ≠ empty (D-R3) — skip bookkeeping when a budget refusal
+      // occurred during this poll window.
+      const newCount = wasBudgetRefused
+        ? (getAgentById(agentId)?.emptyPollCount ?? 0)
+        : incrementEmptyPollCount(agentId);
       const shouldExit = newCount >= MAX_EMPTY_POLLS;
       // If no task was found within the time limit