@desplega.ai/agent-swarm 1.79.4 → 1.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/openapi.json +98 -19
  2. package/package.json +12 -6
  3. package/src/be/db.ts +101 -30
  4. package/src/be/migrations/063_cost_context_schema_relax.sql +133 -0
  5. package/src/be/pricing-normalize.ts +81 -0
  6. package/src/be/seed-pricing.ts +293 -0
  7. package/src/commands/claude-managed-setup.ts +19 -3
  8. package/src/commands/runner.ts +592 -237
  9. package/src/http/context.ts +6 -2
  10. package/src/http/index.ts +115 -68
  11. package/src/http/session-data.ts +74 -23
  12. package/src/otel-impl.ts +200 -0
  13. package/src/otel.ts +127 -0
  14. package/src/providers/claude-adapter.ts +30 -5
  15. package/src/providers/claude-managed-adapter.ts +43 -17
  16. package/src/providers/claude-managed-pricing.ts +34 -0
  17. package/src/providers/codex-adapter.ts +38 -27
  18. package/src/providers/codex-models.ts +22 -3
  19. package/src/providers/devin-adapter.ts +11 -0
  20. package/src/providers/opencode-adapter.ts +31 -7
  21. package/src/providers/pi-mono-adapter.ts +39 -7
  22. package/src/providers/pricing-sources.md +52 -0
  23. package/src/providers/swarm-events-shared.ts +8 -4
  24. package/src/providers/types.ts +33 -10
  25. package/src/server.ts +6 -0
  26. package/src/tests/claude-managed-adapter.test.ts +17 -3
  27. package/src/tests/claude-managed-setup.test.ts +10 -1
  28. package/src/tests/codex-adapter.test.ts +20 -19
  29. package/src/tests/context-snapshot.test.ts +2 -2
  30. package/src/tests/context-window.test.ts +65 -1
  31. package/src/tests/devin-adapter.test.ts +2 -0
  32. package/src/tests/http/context-routes.test.ts +161 -0
  33. package/src/tests/migration-063-schema-relax.test.ts +109 -0
  34. package/src/tests/opencode-adapter.test.ts +146 -1
  35. package/src/tests/otel-impl-secret-scrubbing.test.ts +33 -0
  36. package/src/tests/pages-view-count.test.ts +30 -5
  37. package/src/tests/providers/codex-cost.test.ts +18 -0
  38. package/src/tests/providers/opencode-cost.test.ts +74 -0
  39. package/src/tests/providers/pi-cost.test.ts +128 -0
  40. package/src/tests/secret-scrubber.test.ts +19 -0
  41. package/src/tests/session-costs-codex-recompute.test.ts +35 -22
  42. package/src/tests/session-costs-model-key-normalize.test.ts +271 -0
  43. package/src/tests/session-costs-recompute-all-providers.test.ts +170 -0
  44. package/src/tests/store-progress-cost.test.ts +6 -1
  45. package/src/tools/store-progress.ts +16 -60
  46. package/src/tools/utils.ts +65 -12
  47. package/src/types.ts +62 -9
  48. package/src/utils/context-window.ts +104 -4
  49. package/src/utils/secret-scrubber.ts +7 -0
@@ -264,7 +264,7 @@ function cleanupAgentsMdSymlink(cwd: string): void {
264
264
  }
265
265
  }
266
266
 
267
- class PiMonoSession implements ProviderSession {
267
+ export class PiMonoSession implements ProviderSession {
268
268
  private listeners: Array<(event: ProviderEvent) => void> = [];
269
269
  private eventQueue: ProviderEvent[] = [];
270
270
  private _sessionId: string | undefined;
@@ -275,6 +275,14 @@ class PiMonoSession implements ProviderSession {
275
275
  private logFileHandle: ReturnType<ReturnType<typeof Bun.file>["writer"]>;
276
276
  /** Track last emitted message text to avoid duplicates across turns */
277
277
  private lastEmittedMessage = "";
278
+ /** Phase 7: wallclock start so we can populate `durationMs` on the cost row. */
279
+ private sessionStartedAt: number = Date.now();
280
+ /**
281
+ * Phase 7: previous output-token total — used to derive per-turn delta for
282
+ * `context_usage.outputTokens` since pi-ai's `getContextUsage()` doesn't
283
+ * surface it directly.
284
+ */
285
+ private prevOutputTokens = 0;
278
286
 
279
287
  constructor(agentSession: AgentSession, config: ProviderSessionConfig, createdSymlink: boolean) {
280
288
  this.agentSession = agentSession;
@@ -282,6 +290,7 @@ class PiMonoSession implements ProviderSession {
282
290
  this.createdSymlink = createdSymlink;
283
291
  this.logFileHandle = Bun.file(config.logFile).writer();
284
292
  this._sessionId = agentSession.sessionId;
293
+ this.sessionStartedAt = Date.now();
285
294
 
286
295
  // Emit session_init immediately
287
296
  this.emit({ type: "session_init", sessionId: this._sessionId, provider: "pi" });
@@ -293,6 +302,18 @@ class PiMonoSession implements ProviderSession {
293
302
  this.completionPromise = this.runSession();
294
303
  }
295
304
 
305
+ /**
306
+ * Canonical model slug for downstream reporting (latestModel, raw_log envelopes).
307
+ * Composes `${provider}/${id}` from the resolved pi-ai model so the UI snapshot
308
+ * lookup matches (e.g. `openrouter/deepseek/deepseek-v4-flash`). Falls back to
309
+ * the configured model string if the session didn't resolve one.
310
+ */
311
+ private reportedModel(): string {
312
+ const m = this.agentSession.model;
313
+ if (m) return `${m.provider}/${m.id}`;
314
+ return this.config.model;
315
+ }
316
+
296
317
  private emit(event: ProviderEvent): void {
297
318
  // Scrub secrets from raw_log / raw_stderr content before egress (log file
298
319
  // write, listener dispatch, downstream session-logs push + pretty-print).
@@ -329,7 +350,7 @@ class PiMonoSession implements ProviderSession {
329
350
  .trim()
330
351
  : String(msg.content || "").trim();
331
352
  if (text && text !== this.lastEmittedMessage) {
332
- const model = this.agentSession.model?.name ?? this.config.model;
353
+ const model = this.reportedModel();
333
354
  this.emit({
334
355
  type: "raw_log",
335
356
  content: JSON.stringify({
@@ -344,21 +365,30 @@ class PiMonoSession implements ProviderSession {
344
365
  this.lastEmittedMessage = text;
345
366
  }
346
367
  }
347
- // Emit context_usage for dashboard tracking
368
+ // Emit context_usage for dashboard tracking.
369
+ // Phase 7: derive `outputTokens` from `SessionStats` delta (pi-ai's
370
+ // `getContextUsage()` doesn't expose per-turn output tokens, but the
371
+ // session-stats counter is monotonic so a delta is correct).
348
372
  const usage = this.agentSession.getContextUsage();
349
373
  if (usage && usage.tokens != null) {
374
+ const stats = this.agentSession.getSessionStats();
375
+ const currOutput = stats?.tokens?.output ?? 0;
376
+ const outputDelta = Math.max(0, currOutput - this.prevOutputTokens);
377
+ this.prevOutputTokens = currOutput;
350
378
  this.emit({
351
379
  type: "context_usage",
352
380
  contextUsedTokens: usage.tokens,
353
381
  contextTotalTokens: usage.contextWindow,
354
382
  contextPercent: usage.percent ?? 0,
355
- outputTokens: 0,
383
+ outputTokens: outputDelta,
384
+ // Phase 9: pi-ai owns the formula — we just relay its number.
385
+ contextFormula: "pi-delegated",
356
386
  });
357
387
  }
358
388
  break;
359
389
  }
360
390
  case "tool_execution_start": {
361
- const model = this.agentSession.model?.name ?? this.config.model;
391
+ const model = this.reportedModel();
362
392
  this.emit({
363
393
  type: "raw_log",
364
394
  content: JSON.stringify({
@@ -489,9 +519,11 @@ class PiMonoSession implements ProviderSession {
489
519
  outputTokens: stats.tokens.output,
490
520
  cacheReadTokens: stats.tokens.cacheRead,
491
521
  cacheWriteTokens: stats.tokens.cacheWrite,
492
- durationMs: 0, // Not directly available from SessionStats
522
+ // Phase 7: real wallclock duration; pi-ai SessionStats doesn't carry
523
+ // one so we track it on this adapter instance.
524
+ durationMs: Date.now() - this.sessionStartedAt,
493
525
  numTurns: stats.userMessages + stats.assistantMessages,
494
- model: this.agentSession.model?.name ?? this.config.model,
526
+ model: this.reportedModel(),
495
527
  isError: false,
496
528
  provider: "pi",
497
529
  };
@@ -0,0 +1,52 @@
1
+ # Pricing sources
2
+
3
+ This page lists the sources that feed the `pricing` table at server boot.
4
+ Operators bumping a rate by hand should also update this file.
5
+
6
+ ## Primary: vendored models.dev snapshot
7
+
8
+ - **Path**: `ui/src/lib/modelsdev-cache.json`
9
+ - **Loaded by**: `src/be/seed-pricing.ts` → `seedPricingFromModelsDev()`,
10
+ called from `src/server.ts` after `initDb`.
11
+ - **Projection rules** (see the same module for code-level detail):
12
+ - Anthropic models → rows under `provider='claude'` AND `provider='claude-managed'`.
13
+ Shortnames (`opus`, `sonnet`, `haiku`) ALSO get rows keyed by the current
14
+ default full id (e.g. `opus → claude-opus-4-7`). Pi-mono uses the same
15
+ shortname forms, so they're projected under `provider='pi'` as well.
16
+ - OpenAI models → rows under `provider='codex'`.
17
+ - OpenRouter models → rows under `provider='opencode'`. Any `google/...`
18
+ row additionally gets projected under `provider='gemini'` (both the
19
+ stripped name and the full `google/...` id) so internal-ai callers find
20
+ a hit either way.
21
+
22
+ - **Refresh procedure** (the only place to update the snapshot):
23
+ - Run `bun run scripts/refresh-modelsdev-pricing.ts` (Phase 2 — adds the
24
+ script). It fetches the latest snapshot from models.dev, diffs against
25
+ the vendored copy, prints a summary, and writes the new file.
26
+ - Commit the regenerated `modelsdev-cache.json` together with a bump
27
+ note in the PR description.
28
+
29
+ ## Manual overrides
30
+
31
+ Two cost components models.dev doesn't carry are encoded in
32
+ `MANUAL_PRICING_OVERRIDES` inside `src/be/seed-pricing.ts`:
33
+
34
+ | Provider | Model | Token class | Rate | Source | Verified |
35
+ |------------------|-------|----------------|--------------|---------------------------------------------------------------------------------|------------|
36
+ | `claude-managed` | `*` | `runtime_hour` | $0.08 / hour | <https://docs.claude.com/en/api/agent-sdk/managed-runtime#pricing> | 2026-04-28 |
37
+ | `devin` | `*` | `acu` | $2.25 / ACU | <https://devin.ai/pricing> | 2026-04-28 |
38
+
39
+ The `pricePerMillionUsd` column carries these as `rate * 1_000_000` so the
40
+ same schema fits — the adapter scales by the underlying unit (hours / ACUs),
41
+ not by tokens. The unit convention is specific to those `token_class` values.
42
+
43
+ ## When a model is missing
44
+
45
+ If `POST /api/session-costs` arrives with a `(provider, model)` pair that has
46
+ no input/output pricing rows at the lookup time, the row is persisted with
47
+ `costSource='unpriced'` (rather than 'harness'). The UI surfaces this as a
48
+ yellow badge.
49
+
50
+ To fix: either add the model to `modelsdev-cache.json` (preferred — the
51
+ upstream snapshot probably needs refreshing) or add a manual override row via
52
+ the existing admin route `POST /api/pricing`.
@@ -167,8 +167,10 @@ export function createSwarmEventHandler(
167
167
 
168
168
  const progressContextUsage = (event: {
169
169
  contextUsedTokens: number;
170
- contextTotalTokens: number;
171
- contextPercent: number;
170
+ // Migration 063: nullable for adapters that can't resolve a window.
171
+ contextTotalTokens: number | null;
172
+ contextPercent: number | null;
173
+ contextFormula?: string;
172
174
  }): void => {
173
175
  if (opts.taskId && shouldRun("context-progress", CONTEXT_THROTTLE_MS)) {
174
176
  fireAndForget(`${opts.apiUrl}/api/tasks/${encodeURIComponent(opts.taskId)}/context`, {
@@ -178,8 +180,9 @@ export function createSwarmEventHandler(
178
180
  eventType: "progress",
179
181
  sessionId: sessionId ?? `${opts.sessionIdFallbackPrefix ?? "session"}-${opts.taskId}`,
180
182
  contextUsedTokens: event.contextUsedTokens,
181
- contextTotalTokens: event.contextTotalTokens,
182
- contextPercent: event.contextPercent,
183
+ contextTotalTokens: event.contextTotalTokens ?? undefined,
184
+ contextPercent: event.contextPercent ?? undefined,
185
+ contextFormula: event.contextFormula,
183
186
  }),
184
187
  });
185
188
  }
@@ -239,6 +242,7 @@ export function createSwarmEventHandler(
239
242
  contextUsedTokens: event.contextUsedTokens,
240
243
  contextTotalTokens: event.contextTotalTokens,
241
244
  contextPercent: event.contextPercent,
245
+ contextFormula: event.contextFormula,
242
246
  });
243
247
  break;
244
248
  }
@@ -7,18 +7,30 @@ export interface CostData {
7
7
  inputTokens?: number;
8
8
  outputTokens?: number;
9
9
  cacheReadTokens?: number;
10
+ /**
11
+ * Migration 063: undefined means "the harness can't report this" (e.g. the
12
+ * Codex SDK has no cache-write field). Zero is reserved for "really zero".
13
+ */
10
14
  cacheWriteTokens?: number;
15
+ /** Migration 063: codex reasoning_output_tokens (and similar) for reasoning models. */
16
+ reasoningOutputTokens?: number;
17
+ /** Migration 063: claude extended-thinking tokens from CLI's `usage.thinking_input_tokens`. */
18
+ thinkingTokens?: number;
11
19
  durationMs: number;
12
- numTurns: number;
20
+ /**
21
+ * Migration 063: nullable — some adapters (claude when `num_turns` is absent)
22
+ * can't honestly report a turn count; null is preferred over a faked 1.
23
+ */
24
+ numTurns: number | null;
13
25
  model: string;
14
26
  isError: boolean;
15
27
  /**
16
- * Phase 6: tells the API which recompute path to use on
17
- * `POST /api/session-costs`. Codex triggers the pricing-table recompute
18
- * (when DB pricing rows exist for all three token classes); Claude / pi
19
- * always trust the harness-reported `totalCostUsd` as-is.
28
+ * Phase 6 (extended migration 063): tells the API which recompute path to
29
+ * use on `POST /api/session-costs`. After Phase 2 the recompute path runs
30
+ * for every provider with seeded pricing rows, so every adapter should
31
+ * populate this field.
20
32
  */
21
- provider?: "claude" | "codex" | "pi" | "opencode";
33
+ provider?: "claude" | "claude-managed" | "codex" | "pi" | "opencode" | "devin";
22
34
  }
23
35
 
24
36
  import type { ProviderName } from "../types";
@@ -43,14 +55,25 @@ export type ProviderEvent =
43
55
  | {
44
56
  type: "context_usage";
45
57
  contextUsedTokens: number;
46
- contextTotalTokens: number;
47
- contextPercent: number;
48
- outputTokens: number;
58
+ // Migration 063: nullable so adapters (e.g. devin without a context API)
59
+ // can emit a snapshot that records cumulative tokens without faking a window.
60
+ contextTotalTokens: number | null;
61
+ // Migration 063: null if contextTotalTokens is missing (no divide-by-zero).
62
+ contextPercent: number | null;
63
+ // Migration 063: null when the adapter can't honestly report output tokens.
64
+ outputTokens: number | null;
65
+ /**
66
+ * Migration 063 — the formula the adapter used to compute
67
+ * contextUsedTokens. See `ContextFormulaSchema` in `src/types.ts` for the
68
+ * canonical value list. Adapters should always populate this going
69
+ * forward; it powers cross-provider apples-to-apples comparison.
70
+ */
71
+ contextFormula?: string;
49
72
  }
50
73
  | {
51
74
  type: "compaction";
52
75
  preCompactTokens: number;
53
- compactTrigger: "auto" | "manual";
76
+ compactTrigger: "auto" | "manual" | "auto-inferred";
54
77
  contextTotalTokens: number;
55
78
  };
56
79
 
package/src/server.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
2
  import pkg from "../package.json";
3
3
  import { initDb } from "./be/db";
4
+ import { seedPricingFromModelsDev } from "./be/seed-pricing";
4
5
  import { registerCancelTaskTool } from "./tools/cancel-task";
5
6
  import { registerContextDiffTool } from "./tools/context-diff";
6
7
  import { registerContextHistoryTool } from "./tools/context-history";
@@ -147,6 +148,11 @@ export function createServer() {
147
148
  // Initialize database with WAL mode
148
149
  // Uses DATABASE_PATH env var for Docker volume compatibility (WAL needs .sqlite, .sqlite-wal, .sqlite-shm on same filesystem)
149
150
  initDb(process.env.DATABASE_PATH);
151
+ // Phase 2: project the vendored models.dev snapshot into the pricing table.
152
+ // Idempotent (INSERT OR IGNORE keyed on PK with effective_from=0); safe to
153
+ // call on every boot. See src/be/seed-pricing.ts for the projection logic
154
+ // and the manual-override constants for runtime-fee / ACU pricing.
155
+ seedPricingFromModelsDev();
150
156
 
151
157
  const server = new McpServer(
152
158
  {
@@ -319,11 +319,14 @@ describe("ClaudeManagedAdapter (Phase 3) — session lifecycle", () => {
319
319
  }
320
320
 
321
321
  // context_usage emitted on span.model_request_end.
322
+ // Phase 5 / Phase 9 unified formula = input + cache_read + cache_write + output.
322
323
  const ctx = emitted.find((e) => e.type === "context_usage");
323
324
  expect(ctx).toBeDefined();
324
325
  if (ctx && ctx.type === "context_usage") {
325
- expect(ctx.contextUsedTokens).toBe(150); // 100 input + 50 output
326
+ expect(ctx.contextUsedTokens).toBe(165); // 100 + 10 + 5 + 50
326
327
  expect(ctx.outputTokens).toBe(50);
328
+ // Phase 9: every snapshot carries the formula tag.
329
+ expect(ctx.contextFormula).toBe("input-cache-output");
327
330
  }
328
331
 
329
332
  // result emitted with accumulated cost. Phase 3 leaves totalCostUsd at 0
@@ -345,6 +348,8 @@ describe("ClaudeManagedAdapter (Phase 3) — session lifecycle", () => {
345
348
  expect(resultEvent.cost.totalCostUsd).toBeGreaterThanOrEqual(0);
346
349
  expect(Number.isFinite(resultEvent.cost.totalCostUsd)).toBe(true);
347
350
  expect(resultEvent.output).toBe("Hello from managed agent");
351
+ // Phase 3 — provider tag is required so the API recompute path engages.
352
+ expect(resultEvent.cost.provider).toBe("claude-managed");
348
353
  }
349
354
 
350
355
  // ProviderResult.
@@ -644,17 +649,24 @@ describe("ClaudeManagedAdapter (Phase 4) — repo provisioning + cost data", ()
644
649
  process.env.ANTHROPIC_API_KEY = "sk-test";
645
650
  process.env.MANAGED_AGENT_ID = "agent_x";
646
651
  process.env.MANAGED_ENVIRONMENT_ID = "env_x";
652
+ // Defensive: vault env vars may leak in from the host .env (Bun auto-loads
653
+ // it); each vault-related test sets exactly what it asserts on.
654
+ delete process.env.MANAGED_GITHUB_TOKEN;
655
+ delete process.env.MANAGED_GITHUB_VAULT_ID;
656
+ delete process.env.MANAGED_MCP_VAULT_ID;
647
657
  });
648
658
 
649
659
  afterAll(() => {
650
660
  rmSync(tmpLogDir, { recursive: true, force: true });
651
661
  delete process.env.MANAGED_GITHUB_TOKEN;
652
662
  delete process.env.MANAGED_GITHUB_VAULT_ID;
663
+ delete process.env.MANAGED_MCP_VAULT_ID;
653
664
  });
654
665
 
655
666
  afterEach(() => {
656
667
  delete process.env.MANAGED_GITHUB_TOKEN;
657
668
  delete process.env.MANAGED_GITHUB_VAULT_ID;
669
+ delete process.env.MANAGED_MCP_VAULT_ID;
658
670
  });
659
671
 
660
672
  test("normalizeRepoUrl: passes through https URLs and expands owner/repo shorthand", () => {
@@ -1266,9 +1278,11 @@ describe("ClaudeManagedAdapter (Phase 6) — full happy-path integration", () =>
1266
1278
  const ctxUsage = emitted.find((e) => e.type === "context_usage");
1267
1279
  expect(ctxUsage?.type).toBe("context_usage");
1268
1280
  if (ctxUsage?.type === "context_usage") {
1269
- // 1M input + 200k output = 1.2M used; output = 200k.
1270
- expect(ctxUsage.contextUsedTokens).toBe(1_200_000);
1281
+ // Phase 5 / Phase 9 unified: input + cache_read + cache_write + output.
1282
+ // 1M + 50k + 25k + 200k = 1,275,000.
1283
+ expect(ctxUsage.contextUsedTokens).toBe(1_275_000);
1271
1284
  expect(ctxUsage.outputTokens).toBe(200_000);
1285
+ expect(ctxUsage.contextFormula).toBe("input-cache-output");
1272
1286
  }
1273
1287
 
1274
1288
  // The terminal `result` ProviderEvent — the contract Phase 4 hardened —
@@ -88,13 +88,22 @@ describe("runClaudeManagedSetupFlow — happy path", () => {
88
88
  const agentCallArgs = agentsCreate.mock.calls[0]?.[0] as {
89
89
  name: string;
90
90
  model: string;
91
- tools: Array<{ type: string }>;
91
+ tools: Array<{
92
+ type: string;
93
+ default_config?: { permission_policy?: { type: string } };
94
+ }>;
92
95
  skills: Array<{ type: string; skill_id: string }>;
93
96
  mcp_servers: Array<{ name: string; type: string; url: string }>;
94
97
  };
95
98
  expect(agentCallArgs.name).toBe("swarm-worker");
96
99
  expect(agentCallArgs.model).toBe("claude-sonnet-4-6");
97
100
  expect(agentCallArgs.tools[0]?.type).toBe("agent_toolset_20260401");
101
+ // Headless workers can't approve interactively — both toolsets must be
102
+ // configured with `always_allow` so the sandbox executes tool calls
103
+ // without parking them in `awaiting approval`.
104
+ for (const tool of agentCallArgs.tools) {
105
+ expect(tool.default_config?.permission_policy?.type).toBe("always_allow");
106
+ }
98
107
  expect(agentCallArgs.skills.map((s) => s.skill_id)).toEqual([
99
108
  "skill_work-on-task",
100
109
  "skill_create-pr",
@@ -191,16 +191,16 @@ describe("CodexSession event mapping", () => {
191
191
  expect(messages[0].content).toBe("Hello from codex");
192
192
  }
193
193
 
194
- // context_usage event fired with the *uncached + output* peak proxy
195
- // (input=100, cached=25, output=50 → uncached=75 → peak=125)
196
- // contextPercent is on a 0-100 scale (claude/pi convention).
194
+ // Phase 9: unified `input + output` formula (Codex `input_tokens` already
195
+ // includes cached input, so we don't add cache_read separately).
196
+ // input=100 + output=50 → contextUsed=150.
197
197
  const contextUsage = emitted.find((e) => e.type === "context_usage");
198
198
  expect(contextUsage).toBeDefined();
199
199
  if (contextUsage && contextUsage.type === "context_usage") {
200
- expect(contextUsage.contextUsedTokens).toBe(125);
200
+ expect(contextUsage.contextUsedTokens).toBe(150);
201
201
  expect(contextUsage.contextTotalTokens).toBe(200_000);
202
- // 125 / 200_000 × 100 = 0.0625
203
- expect(contextUsage.contextPercent).toBeCloseTo((125 / 200_000) * 100, 6);
202
+ expect(contextUsage.contextPercent).toBeCloseTo((150 / 200_000) * 100, 6);
203
+ expect(contextUsage.contextFormula).toBe("input-cache-output");
204
204
  }
205
205
 
206
206
  // result event is final and non-error, with cost computed from token counts
@@ -225,14 +225,15 @@ describe("CodexSession event mapping", () => {
225
225
  expect(result.sessionId).toBe("thread-abc");
226
226
  });
227
227
 
228
- test("chatty turn: peakContextPercent uses uncached + output, not raw input_tokens", async () => {
229
- // Reproduces the verify-plan finding: a chatty turn where the SDK reports
230
- // input_tokens far in excess of the model's context window because the
231
- // total represents the SUM of every prompt across all model invocations
232
- // in the turn (with cache reuses billed at every roundtrip). Without the
233
- // peak-proxy fix this would clamp `contextPercent` to 1.0 even though no
234
- // single model call hit the limit. Use realistic numbers from the actual
235
- // E2E lead transcript captured during verification.
228
+ test("Phase 9: chatty turn clamps contextPercent to 100% under the unified formula", async () => {
229
+ // Phase 9 deliberately swapped Codex's per-adapter peak-proxy formula
230
+ // (`(input - cached) + output`) for the unified `input + output` formula
231
+ // shared with every other provider. The trade-off: a chatty Codex turn
232
+ // where `input_tokens` is the SUM across every model call in the turn
233
+ // — over-reports compared to the peak-proxy variant. The clamp at 100%
234
+ // keeps the gauge sensible; downstream consumers reading the new
235
+ // `contextFormula='input-cache-output'` tag know it's apples-to-apples
236
+ // across providers. Numbers below are from the verify-plan transcript.
236
237
  const agentMsg: AgentMessageItem = {
237
238
  id: "msg-1",
238
239
  type: "agent_message",
@@ -262,12 +263,12 @@ describe("CodexSession event mapping", () => {
262
263
  const contextUsage = emitted.find((e) => e.type === "context_usage");
263
264
  expect(contextUsage).toBeDefined();
264
265
  if (contextUsage && contextUsage.type === "context_usage") {
265
- // peak proxy = (357142 - 278912) + 2156 = 78230 + 2156 = 80386
266
- expect(contextUsage.contextUsedTokens).toBe(80386);
266
+ // Phase 9 unified: input + output = 357142 + 2156 = 359298 (above 200k).
267
+ expect(contextUsage.contextUsedTokens).toBe(359298);
267
268
  expect(contextUsage.contextTotalTokens).toBe(200_000);
268
- // 80386 / 200000 × 100 = 40.193 — on the 0-100 scale, NOT clamped to 100
269
- expect(contextUsage.contextPercent).toBeCloseTo(40.193, 2);
270
- expect(contextUsage.contextPercent).toBeLessThan(100);
269
+ // Above 100% raw → clamped to exactly 100.
270
+ expect(contextUsage.contextPercent).toBe(100);
271
+ expect(contextUsage.contextFormula).toBe("input-cache-output");
271
272
  }
272
273
 
273
274
  // Cost still uses the full input_tokens — billing semantics are
@@ -82,7 +82,7 @@ describe("Context Snapshots", () => {
82
82
 
83
83
  // The summary should preserve the last known context usage, not null/0
84
84
  const summary = getContextSummaryByTaskId(taskId);
85
- expect(summary.totalContextTokensUsed).toBe(80000);
85
+ expect(summary.peakContextTokens).toBe(80000);
86
86
  expect(summary.contextWindowSize).toBe(200000);
87
87
  expect(summary.peakContextPercent).toBe(40);
88
88
  });
@@ -113,7 +113,7 @@ describe("Context Snapshots", () => {
113
113
  });
114
114
 
115
115
  const summary = getContextSummaryByTaskId(task2.id);
116
- expect(summary.totalContextTokensUsed).toBe(60000);
116
+ expect(summary.peakContextTokens).toBe(60000);
117
117
  expect(summary.contextWindowSize).toBe(200000);
118
118
  });
119
119
 
@@ -1,8 +1,15 @@
1
1
  import { describe, expect, test } from "bun:test";
2
- import { computeContextUsed, getContextWindowSize } from "../utils/context-window";
2
+ import {
3
+ CONTEXT_FORMULA,
4
+ clampContextPercent,
5
+ computeContextUsed,
6
+ computeContextUsedUnified,
7
+ getContextWindowSize,
8
+ } from "../utils/context-window";
3
9
 
4
10
  describe("getContextWindowSize", () => {
5
11
  test("returns 1M for opus models", () => {
12
+ expect(getContextWindowSize("claude-opus-4-7")).toBe(1_000_000);
6
13
  expect(getContextWindowSize("claude-opus-4-6")).toBe(1_000_000);
7
14
  expect(getContextWindowSize("opus")).toBe(1_000_000);
8
15
  });
@@ -26,6 +33,20 @@ describe("getContextWindowSize", () => {
26
33
  test("returns default entry value", () => {
27
34
  expect(getContextWindowSize("default")).toBe(200_000);
28
35
  });
36
+
37
+ test("Phase 4: dated full ids resolve via date-suffix stripping", () => {
38
+ // The regression this fixes: pre-Phase 4 these all fell to the 200k
39
+ // default, wildly understating opus/sonnet 4.x context.
40
+ expect(getContextWindowSize("claude-sonnet-4-6-20251004")).toBe(1_000_000);
41
+ expect(getContextWindowSize("claude-opus-4-7-20251201")).toBe(1_000_000);
42
+ expect(getContextWindowSize("claude-haiku-4-5-20251001")).toBe(200_000);
43
+ });
44
+
45
+ test("Phase 4: legacy 3.x family ids resolve", () => {
46
+ expect(getContextWindowSize("claude-3-5-sonnet")).toBe(200_000);
47
+ expect(getContextWindowSize("claude-3-5-sonnet-20241022")).toBe(200_000);
48
+ expect(getContextWindowSize("claude-3-opus")).toBe(200_000);
49
+ });
29
50
  });
30
51
 
31
52
  describe("computeContextUsed", () => {
@@ -64,3 +85,46 @@ describe("computeContextUsed", () => {
64
85
  ).toBe(5000);
65
86
  });
66
87
  });
88
+
89
+ describe("computeContextUsedUnified (Phase 9 unified formula)", () => {
90
+ test("sums input + cache_read + cache_create + output", () => {
91
+ expect(
92
+ computeContextUsedUnified({
93
+ inputTokens: 1000,
94
+ cacheReadTokens: 200,
95
+ cacheCreateTokens: 300,
96
+ outputTokens: 500,
97
+ }),
98
+ ).toBe(2000);
99
+ });
100
+
101
+ test("treats missing/null fields as zero", () => {
102
+ expect(computeContextUsedUnified({})).toBe(0);
103
+ expect(computeContextUsedUnified({ inputTokens: 100, outputTokens: null })).toBe(100);
104
+ });
105
+ });
106
+
107
+ describe("clampContextPercent (Phase 9)", () => {
108
+ test("returns the clamped percent for valid inputs", () => {
109
+ expect(clampContextPercent(50_000, 200_000)).toBe(25);
110
+ expect(clampContextPercent(0, 200_000)).toBe(0);
111
+ });
112
+
113
+ test("clamps to [0, 100]", () => {
114
+ expect(clampContextPercent(500_000, 200_000)).toBe(100);
115
+ expect(clampContextPercent(-10, 200_000)).toBe(0);
116
+ });
117
+
118
+ test("returns null for missing/zero/negative total (no divide-by-zero NaN)", () => {
119
+ expect(clampContextPercent(100, 0)).toBeNull();
120
+ expect(clampContextPercent(100, null)).toBeNull();
121
+ expect(clampContextPercent(100, undefined)).toBeNull();
122
+ expect(clampContextPercent(100, -1)).toBeNull();
123
+ });
124
+ });
125
+
126
+ describe("CONTEXT_FORMULA constant", () => {
127
+ test("is 'input-cache-output' so adapters stamp the same value on snapshots", () => {
128
+ expect(CONTEXT_FORMULA).toBe("input-cache-output");
129
+ });
130
+ });
@@ -598,6 +598,8 @@ describe("CostData mapping", () => {
598
598
  expect(resultEvent.cost.model).toBe("devin");
599
599
  expect(resultEvent.cost.inputTokens).toBe(0);
600
600
  expect(resultEvent.cost.outputTokens).toBe(0);
601
+ // Phase 3 — provider tag is required so the API recompute path engages.
602
+ expect(resultEvent.cost.provider).toBe("devin");
601
603
  }
602
604
  });
603
605