@desplega.ai/agent-swarm 1.79.3 → 1.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/openapi.json +98 -19
  2. package/package.json +12 -6
  3. package/src/be/db.ts +101 -30
  4. package/src/be/migrations/063_cost_context_schema_relax.sql +133 -0
  5. package/src/be/pricing-normalize.ts +81 -0
  6. package/src/be/seed-pricing.ts +293 -0
  7. package/src/commands/claude-managed-setup.ts +19 -3
  8. package/src/commands/runner.ts +592 -237
  9. package/src/http/context.ts +6 -2
  10. package/src/http/index.ts +115 -68
  11. package/src/http/session-data.ts +74 -23
  12. package/src/otel-impl.ts +200 -0
  13. package/src/otel.ts +127 -0
  14. package/src/providers/claude-adapter.ts +30 -5
  15. package/src/providers/claude-managed-adapter.ts +43 -17
  16. package/src/providers/claude-managed-pricing.ts +34 -0
  17. package/src/providers/codex-adapter.ts +38 -27
  18. package/src/providers/codex-models.ts +22 -3
  19. package/src/providers/devin-adapter.ts +11 -0
  20. package/src/providers/opencode-adapter.ts +31 -7
  21. package/src/providers/pi-mono-adapter.ts +39 -7
  22. package/src/providers/pricing-sources.md +52 -0
  23. package/src/providers/swarm-events-shared.ts +8 -4
  24. package/src/providers/types.ts +33 -10
  25. package/src/server.ts +6 -0
  26. package/src/tests/claude-managed-adapter.test.ts +17 -3
  27. package/src/tests/claude-managed-setup.test.ts +10 -1
  28. package/src/tests/codex-adapter.test.ts +20 -19
  29. package/src/tests/context-snapshot.test.ts +2 -2
  30. package/src/tests/context-window.test.ts +65 -1
  31. package/src/tests/devin-adapter.test.ts +2 -0
  32. package/src/tests/http/context-routes.test.ts +161 -0
  33. package/src/tests/migration-063-schema-relax.test.ts +109 -0
  34. package/src/tests/opencode-adapter.test.ts +146 -1
  35. package/src/tests/otel-impl-secret-scrubbing.test.ts +33 -0
  36. package/src/tests/pages-view-count.test.ts +30 -5
  37. package/src/tests/providers/codex-cost.test.ts +18 -0
  38. package/src/tests/providers/opencode-cost.test.ts +74 -0
  39. package/src/tests/providers/pi-cost.test.ts +128 -0
  40. package/src/tests/secret-scrubber.test.ts +19 -0
  41. package/src/tests/session-costs-codex-recompute.test.ts +35 -22
  42. package/src/tests/session-costs-model-key-normalize.test.ts +271 -0
  43. package/src/tests/session-costs-recompute-all-providers.test.ts +170 -0
  44. package/src/tests/store-progress-cost.test.ts +6 -1
  45. package/src/tools/store-progress.ts +16 -60
  46. package/src/tools/utils.ts +65 -12
  47. package/src/types.ts +62 -9
  48. package/src/utils/context-window.ts +104 -4
  49. package/src/utils/secret-scrubber.ts +7 -0
package/src/otel.ts ADDED
@@ -0,0 +1,127 @@
1
+ export type AttributeValue = string | number | boolean | string[] | number[] | boolean[];
2
+ export type Attributes = Record<string, AttributeValue | undefined>;
3
+
4
+ type SpanStatus = {
5
+ code: number;
6
+ message?: string;
7
+ };
8
+
9
+ export type SwarmSpan = {
10
+ setAttribute: (key: string, value: AttributeValue) => SwarmSpan;
11
+ setAttributes: (attributes: Attributes) => SwarmSpan;
12
+ addEvent: (name: string, attributes?: Attributes) => SwarmSpan;
13
+ recordException: (error: unknown) => void;
14
+ setStatus: (status: SpanStatus) => SwarmSpan;
15
+ end: () => void;
16
+ };
17
+
18
+ const enabled = Boolean(process.env.OTEL_EXPORTER_OTLP_ENDPOINT);
19
+
20
+ const NOOP_SPAN: SwarmSpan = {
21
+ setAttribute: () => NOOP_SPAN,
22
+ setAttributes: () => NOOP_SPAN,
23
+ addEvent: () => NOOP_SPAN,
24
+ recordException: () => {},
25
+ setStatus: () => NOOP_SPAN,
26
+ end: () => {},
27
+ };
28
+
29
+ let initialized = false;
30
+ let realWithSpan:
31
+ | (<T>(
32
+ name: string,
33
+ fn: (span: SwarmSpan) => Promise<T> | T,
34
+ attributes?: Attributes,
35
+ ) => Promise<T>)
36
+ | undefined;
37
+ let realStartSpan: ((name: string, attributes?: Attributes) => SwarmSpan) | undefined;
38
+ let realWithRemoteContext:
39
+ | (<T>(carrier: Record<string, unknown>, fn: () => Promise<T> | T) => Promise<T>)
40
+ | undefined;
41
+ let realWithSpanContext: (<T>(span: SwarmSpan, fn: () => T) => T) | undefined;
42
+ let realInjectTraceContext:
43
+ | ((headers: Record<string, string>) => Record<string, string>)
44
+ | undefined;
45
+ let realShutdown: (() => Promise<void>) | undefined;
46
+
47
+ export function isOtelEnabled(): boolean {
48
+ return enabled;
49
+ }
50
+
51
+ export async function initOtel(serviceRole = process.env.AGENT_ROLE || "api"): Promise<void> {
52
+ if (!enabled || initialized) return;
53
+ initialized = true;
54
+
55
+ try {
56
+ const impl = await import("./otel-impl");
57
+ await impl.boot(serviceRole);
58
+ realWithSpan = impl.withSpan;
59
+ realStartSpan = impl.startSpan;
60
+ realWithRemoteContext = impl.withRemoteContext;
61
+ realWithSpanContext = impl.withSpanContext;
62
+ realInjectTraceContext = impl.injectTraceContext;
63
+ realShutdown = impl.shutdown;
64
+ console.log(
65
+ `[OTel] enabled for ${process.env.OTEL_SERVICE_NAME ?? "agent-swarm"} (${serviceRole})`,
66
+ );
67
+ } catch (error) {
68
+ console.warn(`[OTel] disabled after initialization failure: ${error}`);
69
+ }
70
+ }
71
+
72
+ export async function withSpan<T>(
73
+ name: string,
74
+ fn: (span: SwarmSpan) => Promise<T> | T,
75
+ attributes?: Attributes,
76
+ ): Promise<T> {
77
+ if (!enabled || !realWithSpan) {
78
+ return fn(NOOP_SPAN);
79
+ }
80
+ return realWithSpan(name, fn, attributes);
81
+ }
82
+
83
+ export function startSpan(name: string, attributes?: Attributes): SwarmSpan {
84
+ if (!enabled || !realStartSpan) {
85
+ return NOOP_SPAN;
86
+ }
87
+ return realStartSpan(name, attributes);
88
+ }
89
+
90
+ export function withSpanContext<T>(span: SwarmSpan, fn: () => T): T {
91
+ if (!enabled || !realWithSpanContext) {
92
+ return fn();
93
+ }
94
+ return realWithSpanContext(span, fn);
95
+ }
96
+
97
+ export async function withRemoteContext<T>(
98
+ carrier: Record<string, unknown>,
99
+ fn: () => Promise<T> | T,
100
+ ): Promise<T> {
101
+ if (!enabled || !realWithRemoteContext) {
102
+ return fn();
103
+ }
104
+ return realWithRemoteContext(carrier, fn);
105
+ }
106
+
107
+ export function injectTraceContext(headers: Record<string, string>): Record<string, string> {
108
+ if (!enabled || !realInjectTraceContext) {
109
+ return headers;
110
+ }
111
+ return realInjectTraceContext(headers);
112
+ }
113
+
114
+ export async function shutdownOtel(): Promise<void> {
115
+ if (!realShutdown) return;
116
+ await realShutdown();
117
+ }
118
+
119
+ export function _resetOtelForTests() {
120
+ initialized = false;
121
+ realWithSpan = undefined;
122
+ realStartSpan = undefined;
123
+ realWithRemoteContext = undefined;
124
+ realWithSpanContext = undefined;
125
+ realInjectTraceContext = undefined;
126
+ realShutdown = undefined;
127
+ }
@@ -1,7 +1,12 @@
1
1
  import { readFile, unlink, writeFile } from "node:fs/promises";
2
2
  import { homedir } from "node:os";
3
3
  import { dirname, join } from "node:path";
4
- import { computeContextUsed, getContextWindowSize } from "../utils/context-window";
4
+ import {
5
+ CONTEXT_FORMULA,
6
+ clampContextPercent,
7
+ computeContextUsedUnified,
8
+ getContextWindowSize,
9
+ } from "../utils/context-window";
5
10
  import { validateClaudeCredentials } from "../utils/credentials";
6
11
  import {
7
12
  parseStderrForErrors,
@@ -465,6 +470,10 @@ class ClaudeSession implements ProviderSession {
465
470
  this._sessionId = json.session_id;
466
471
  this.emit({ type: "session_init", sessionId: json.session_id, provider: "claude" });
467
472
  if (json.model) {
473
+ // Phase 4: the CLI's `init.model` reflects the actual model after any
474
+ // backoff/fallback. Update `this.model` so subsequent CostData rows
475
+ // (and the pricing lookup the API runs) use the right rate.
476
+ this.model = json.model;
468
477
  this.contextWindowSize = getContextWindowSize(json.model);
469
478
  }
470
479
  }
@@ -487,6 +496,10 @@ class ClaudeSession implements ProviderSession {
487
496
  output_tokens?: number;
488
497
  cache_read_input_tokens?: number;
489
498
  cache_creation_input_tokens?: number;
499
+ // Phase 4: claude extended-thinking flows surface this — the
500
+ // CLI emits `thinking_input_tokens` when the model produced
501
+ // thinking content during the turn.
502
+ thinking_input_tokens?: number;
490
503
  }
491
504
  | undefined;
492
505
 
@@ -499,8 +512,12 @@ class ClaudeSession implements ProviderSession {
499
512
  outputTokens: usage?.output_tokens ?? 0,
500
513
  cacheReadTokens: usage?.cache_read_input_tokens ?? 0,
501
514
  cacheWriteTokens: usage?.cache_creation_input_tokens ?? 0,
515
+ // Phase 4: surface thinking tokens; previously dropped on the floor.
516
+ thinkingTokens: usage?.thinking_input_tokens ?? 0,
502
517
  durationMs: json.duration_ms || 0,
503
- numTurns: json.num_turns || 1,
518
+ // Phase 4: honest null when the CLI omits num_turns instead of a
519
+ // faked `1` (would have under-counted in dashboards).
520
+ numTurns: json.num_turns ?? null,
504
521
  model: this.model,
505
522
  isError: json.is_error || false,
506
523
  provider: "claude",
@@ -539,18 +556,26 @@ class ClaudeSession implements ProviderSession {
539
556
  }
540
557
  }
541
558
 
542
- // Context usage extraction from assistant message usage
559
+ // Context usage extraction from assistant message usage.
560
+ // Phase 9: unified `input + cache + output` formula across every
561
+ // provider so cross-provider percent comparisons are meaningful.
543
562
  if (json.message.usage) {
544
563
  const usage = json.message.usage;
545
- const contextUsed = computeContextUsed(usage);
564
+ const contextUsed = computeContextUsedUnified({
565
+ inputTokens: usage.input_tokens,
566
+ cacheReadTokens: usage.cache_read_input_tokens,
567
+ cacheCreateTokens: usage.cache_creation_input_tokens,
568
+ outputTokens: usage.output_tokens,
569
+ });
546
570
  const contextTotal = this.contextWindowSize;
547
571
 
548
572
  this.emit({
549
573
  type: "context_usage",
550
574
  contextUsedTokens: contextUsed,
551
575
  contextTotalTokens: contextTotal,
552
- contextPercent: contextTotal > 0 ? (contextUsed / contextTotal) * 100 : 0,
576
+ contextPercent: clampContextPercent(contextUsed, contextTotal) ?? 0,
553
577
  outputTokens: usage.output_tokens ?? 0,
578
+ contextFormula: CONTEXT_FORMULA,
554
579
  });
555
580
  }
556
581
  }
@@ -59,8 +59,15 @@ import type {
59
59
  import type { SkillCreateResponse as Skill } from "@anthropic-ai/sdk/resources/beta/skills";
60
60
 
61
61
  import { checkToolLoop } from "../hooks/tool-loop-detection";
62
+ import {
63
+ CONTEXT_FORMULA,
64
+ clampContextPercent,
65
+ computeContextUsedUnified,
66
+ getContextWindowSize,
67
+ } from "../utils/context-window";
62
68
  import { scrubSecrets } from "../utils/secret-scrubber";
63
69
  import { computeClaudeManagedCostUsd } from "./claude-managed-models";
70
+ import { getRuntimeFeePerHour } from "./claude-managed-pricing";
64
71
  import { createClaudeManagedSwarmEventHandler } from "./claude-managed-swarm-events";
65
72
  import type {
66
73
  CostData,
@@ -113,13 +120,10 @@ const REQUIRED_ENV_VARS = [
113
120
  "MANAGED_ENVIRONMENT_ID",
114
121
  ] as const;
115
122
 
116
- /**
117
- * Default context window for managed Claude sessions when we don't have a
118
- * model-specific override. Sized to match Sonnet 4.x (1M extended-context
119
- * variant). The Phase 4 pricing-table commit will replace this with a
120
- * per-model lookup.
121
- */
122
- const DEFAULT_CONTEXT_TOTAL_TOKENS = 1_000_000;
123
+ // Phase 5: removed the hardcoded `DEFAULT_CONTEXT_TOTAL_TOKENS = 1_000_000`.
124
+ // The adapter now calls `getContextWindowSize(this.model)` from
125
+ // `src/utils/context-window.ts`, which resolves shortnames + dated full ids
126
+ // so haiku-4-5 sessions don't pretend to have a 1M window.
123
127
 
124
128
  /**
125
129
  * Compose the per-session user-message content blocks. Returns two blocks:
@@ -187,6 +191,8 @@ function emptyCost(config: ProviderSessionConfig, model: string): CostData {
187
191
  numTurns: 0,
188
192
  model,
189
193
  isError: false,
194
+ // Phase 3 — tag every emitted CostData so the API's recompute path engages.
195
+ provider: "claude-managed",
190
196
  };
191
197
  }
192
198
 
@@ -374,6 +380,11 @@ class ClaudeManagedSession implements ProviderSession {
374
380
  * 2. Anthropic's $0.08/session-hour runtime fee — billed continuously by
375
381
  * Anthropic regardless of model usage, so we add it here to surface in
376
382
  * the swarm's per-session cost UI.
383
+ *
384
+ * Phase 5: the harness-local USD is still computed here, but the server-side
385
+ * recompute path (`POST /api/session-costs` after Phase 2) will reprice the
386
+ * row against the seeded pricing-table values and tag `costSource='pricing-table'`.
387
+ * The runtime fee comes from the same table now (`token_class='runtime_hour'`).
377
388
  */
378
389
  private snapshotCost(isError: boolean): CostData {
379
390
  const durationMs = Date.now() - this.startedAt;
@@ -384,9 +395,11 @@ class ClaudeManagedSession implements ProviderSession {
384
395
  this.cost.cacheReadTokens ?? 0,
385
396
  this.cost.cacheWriteTokens ?? 0,
386
397
  );
387
- // $0.08 / session-hour. Sandbox runtime is billed by wallclock, so we
388
- // amortize linearly across the session's `durationMs`.
389
- const runtimeFeeUsd = (durationMs / 3_600_000) * 0.08;
398
+ // Phase 5: read the runtime fee via getRuntimeFeePerHour() so the adapter has
399
+ // one source of truth. The helper returns the env override
400
+ // (CLAUDE_MANAGED_RUNTIME_FEE_USD_PER_HOUR) when it is a valid non-negative
401
+ // number, else the historical $0.08/hr constant; it does not query the pricing table.
402
+ const runtimeFeeUsd = (durationMs / 3_600_000) * getRuntimeFeePerHour();
390
403
  return {
391
404
  ...this.cost,
392
405
  durationMs,
@@ -506,12 +519,15 @@ class ClaudeManagedSession implements ProviderSession {
506
519
  // this event. Emit a `compaction` ProviderEvent with the values we
507
520
  // *do* know; consumers that need richer data can subscribe to
508
521
  // `raw_log` for the original payload.
522
+ // Phase 5 — pre-compact tokens are an inferred proxy (running input
523
+ // total); flag the compactTrigger as 'auto-inferred' so downstream
524
+ // dashboards can distinguish a real trigger value from our guess.
509
525
  const _cc = event as BetaManagedAgentsAgentThreadContextCompactedEvent;
510
526
  this.emit({
511
527
  type: "compaction",
512
528
  preCompactTokens: this.cost.inputTokens ?? 0,
513
- compactTrigger: "auto",
514
- contextTotalTokens: DEFAULT_CONTEXT_TOTAL_TOKENS,
529
+ compactTrigger: "auto-inferred",
530
+ contextTotalTokens: getContextWindowSize(this.cost.model),
515
531
  });
516
532
  return { terminal: false, isError: false };
517
533
  }
@@ -524,16 +540,26 @@ class ClaudeManagedSession implements ProviderSession {
524
540
  (this.cost.cacheReadTokens ?? 0) + usage.cache_read_input_tokens;
525
541
  this.cost.cacheWriteTokens =
526
542
  (this.cost.cacheWriteTokens ?? 0) + usage.cache_creation_input_tokens;
527
- this.cost.numTurns += 1;
528
-
529
- const used = (this.cost.inputTokens ?? 0) + (this.cost.outputTokens ?? 0);
530
- const total = DEFAULT_CONTEXT_TOTAL_TOKENS;
543
+ this.cost.numTurns = (this.cost.numTurns ?? 0) + 1;
544
+
545
+ // Phase 5 + Phase 9: unified `input + cache + output` formula AND a
546
+ // per-model window via `getContextWindowSize`. Previously this used
547
+ // a hardcoded 1M window and ignored cache — fine for sonnet/opus,
548
+ // wrong for haiku and any future smaller-window model.
549
+ const used = computeContextUsedUnified({
550
+ inputTokens: this.cost.inputTokens,
551
+ cacheReadTokens: this.cost.cacheReadTokens,
552
+ cacheCreateTokens: this.cost.cacheWriteTokens,
553
+ outputTokens: this.cost.outputTokens,
554
+ });
555
+ const total = getContextWindowSize(this.cost.model);
531
556
  this.emit({
532
557
  type: "context_usage",
533
558
  contextUsedTokens: used,
534
559
  contextTotalTokens: total,
535
- contextPercent: Math.min(100, (used / total) * 100),
560
+ contextPercent: clampContextPercent(used, total),
536
561
  outputTokens: this.cost.outputTokens ?? 0,
562
+ contextFormula: CONTEXT_FORMULA,
537
563
  });
538
564
  return { terminal: false, isError: false };
539
565
  }
@@ -0,0 +1,34 @@
1
+ /**
2
+ * Phase 5 — small adapter-side pricing constants for claude-managed.
3
+ *
4
+ * The API server's pricing table is the canonical store (seeded by
5
+ * `src/be/seed-pricing.ts`). Workers can't touch the DB directly (DB
6
+ * boundary), so the adapter keeps a local constant for the runtime fee
7
+ * and lets the API-side recompute path (Phase 2) override the resulting
8
+ * `totalCostUsd` with the canonical figure. The constant here is what
9
+ * shows up in the worker's local logs before the row hits the server.
10
+ *
11
+ * If/when we plumb pricing through the worker bootstrap (HTTP fetch of
12
+ * `/api/pricing` at session start), this module is the place to swap.
13
+ */
14
+
15
+ /**
16
+ * USD per session-hour for managed claude runtime. Source:
17
+ * https://docs.claude.com/en/api/agent-sdk/managed-runtime#pricing
18
+ * (verified 2026-04-28). Override at runtime via env for ops bumps without
19
+ * a redeploy.
20
+ */
21
+ export const RUNTIME_FEE_USD_PER_HOUR = (() => {
22
+ const raw = process.env.CLAUDE_MANAGED_RUNTIME_FEE_USD_PER_HOUR;
23
+ const n = raw ? Number(raw) : NaN;
24
+ if (Number.isFinite(n) && n >= 0) return n;
25
+ return 0.08;
26
+ })();
27
+
28
+ /**
29
+ * Adapter helper. Always returns a finite number — never crashes the
30
+ * cost snapshot.
31
+ */
32
+ export function getRuntimeFeePerHour(): number {
33
+ return RUNTIME_FEE_USD_PER_HOUR;
34
+ }
@@ -66,6 +66,11 @@ import {
66
66
  type WebSearchItem,
67
67
  } from "@openai/codex-sdk";
68
68
  import { buildRatingsFromLlm, fetchRetrievalsForTask, postRatings } from "../be/memory/raters/llm";
69
+ import {
70
+ CONTEXT_FORMULA,
71
+ clampContextPercent,
72
+ computeContextUsedUnified,
73
+ } from "../utils/context-window";
69
74
  import { summarizeSession as runSummarize } from "../utils/internal-ai";
70
75
  import { scrubSecrets } from "../utils/secret-scrubber";
71
76
  import { type CodexAgentsMdHandle, writeCodexAgentsMd } from "./codex-agents-md";
@@ -523,6 +528,11 @@ class CodexSession implements ProviderSession {
523
528
  const inputTokens = usage?.input_tokens ?? 0;
524
529
  const cachedInputTokens = usage?.cached_input_tokens ?? 0;
525
530
  const outputTokens = usage?.output_tokens ?? 0;
531
+ // Phase 6: Codex SDK surfaces `reasoning_output_tokens` separately from
532
+ // `output_tokens` for reasoning models (gpt-5.3-codex, gpt-5.4 thinking).
533
+ // Pre-fix this number was read into `lastUsage` but never reached
534
+ // `CostData`, so reasoning-heavy sessions silently under-billed.
535
+ const reasoningOutputTokens = usage?.reasoning_output_tokens ?? 0;
526
536
  return {
527
537
  // Runner overrides with its own session id.
528
538
  sessionId: "",
@@ -540,9 +550,12 @@ class CodexSession implements ProviderSession {
540
550
  ),
541
551
  inputTokens,
542
552
  outputTokens,
553
+ reasoningOutputTokens,
543
554
  cacheReadTokens: cachedInputTokens,
544
- // Codex does not distinguish cache writes in its Usage payload.
545
- cacheWriteTokens: 0,
555
+ // Phase 6: undefined (NOT 0). Codex SDK can't honestly report cache
556
+ // writes; leaving it undefined preserves that distinction in the DB
557
+ // instead of mixing genuine zeros with "unknown".
558
+ cacheWriteTokens: undefined,
546
559
  durationMs: Date.now() - this.startedAt,
547
560
  numTurns: this.numTurns,
548
561
  model: this.resolvedModel,
@@ -760,36 +773,34 @@ class CodexSession implements ProviderSession {
760
773
  case "turn.completed": {
761
774
  this.lastUsage = event.usage;
762
775
  if (event.usage) {
763
- // The Codex SDK reports `input_tokens` as the SUM of every prompt
764
- // sent to the model across the entire turn (one `codex exec` call
765
- // can fan out to dozens of model invocations as MCP tools roundtrip
766
- // back and forth). For chatty turns this number routinely exceeds
767
- // the model's context window, even though no single model call did.
776
+ // Phase 9: switch from the codex-specific "peak proxy" formula
777
+ // (`uncached_input + output`) to the unified
778
+ // `input + cache_read + cache_create + output` so cross-provider
779
+ // percent comparisons are meaningful.
768
780
  //
769
- // For peak-context reporting we want a proxy for "the largest
770
- // single-call prompt". We approximate it as the uncached portion
771
- // (cached tokens are reused across calls so they count once toward
772
- // the actual peak), plus the output. This isn't perfect — the SDK
773
- // would have to expose per-call stats for that — but it's far more
774
- // representative than `(input + output) / window` which clamps to
775
- // 1.0 the moment a turn makes any meaningful tool history.
776
- const uncachedInput = Math.max(
777
- 0,
778
- event.usage.input_tokens - event.usage.cached_input_tokens,
779
- );
780
- const peakProxy = uncachedInput + event.usage.output_tokens;
781
- // `contextPercent` is on a 0-100 scale across all providers — claude
782
- // emits `(used / total) * 100`, pi-mono passes through `usage.percent`
783
- // which is already 0-100. The dashboard at
784
- // ui/src/pages/tasks/[id]/page.tsx renders it via `.toFixed(0)`
785
- // expecting an integer percent, so a 0-1 fraction would render as
786
- // "0%" instead of e.g. "40%".
781
+ // Note: Codex's `input_tokens` already includes cached_input_tokens
782
+ // (it's the TOTAL across the turn — see the longer comment that
783
+ // used to live here, preserved in git history). We therefore pass
784
+ // `cacheReadTokens: 0` to avoid double-counting the cached portion.
785
+ // The trade-off the old comment flagged is still real — a chatty
786
+ // turn can over-report because `input_tokens` is the SUM across
787
+ // every model call in the turn — but having the SAME formula
788
+ // everywhere wins over the local optimum. Clamp catches the
789
+ // chatty-turn overshoot at 100%. Old rows tagged 'peak-proxy'
790
+ // remain in `task_context_snapshots`; the UI surfaces both.
791
+ const contextUsed = computeContextUsedUnified({
792
+ inputTokens: event.usage.input_tokens,
793
+ cacheReadTokens: 0,
794
+ cacheCreateTokens: 0,
795
+ outputTokens: event.usage.output_tokens,
796
+ });
787
797
  this.emit({
788
798
  type: "context_usage",
789
- contextUsedTokens: peakProxy,
799
+ contextUsedTokens: contextUsed,
790
800
  contextTotalTokens: this.contextWindow,
791
- contextPercent: Math.min(100, (peakProxy / this.contextWindow) * 100),
801
+ contextPercent: clampContextPercent(contextUsed, this.contextWindow) ?? 0,
792
802
  outputTokens: event.usage.output_tokens,
803
+ contextFormula: CONTEXT_FORMULA,
793
804
  });
794
805
  }
795
806
  break;
@@ -126,12 +126,22 @@ export const CODEX_MODEL_PRICING: Record<CodexModel, CodexModelPricing> = {
126
126
  },
127
127
  };
128
128
 
129
+ /**
130
+ * Phase 6 — one-warning-per-process tracking so unknown models log once
131
+ * instead of spamming the worker log on every turn.
132
+ */
133
+ const _warnedUnknownCodexModels = new Set<string>();
134
+
129
135
  /**
130
136
  * Compute USD cost from a Codex `Usage` payload. The Codex SDK reports
131
137
  * `input_tokens` as the TOTAL input fed to the model across the turn (cached
132
138
  * + uncached), so we subtract `cached_input_tokens` before billing the
133
- * uncached portion at the full rate. Returns 0 for unknown models so we never
134
- * inflate cost on a typo.
139
+ * uncached portion at the full rate.
140
+ *
141
+ * Phase 6: returns 0 for unknown models AND logs a one-time warning, so an
142
+ * operator running `MODEL_OVERRIDE=gpt-future-2027` notices that the worker
143
+ * is silently dropping cost. The server-side recompute path (Phase 2) tags
144
+ * such rows `costSource='unpriced'`, which surfaces as a yellow UI badge.
135
145
  */
136
146
  export function computeCodexCostUsd(
137
147
  model: string,
@@ -140,7 +150,16 @@ export function computeCodexCostUsd(
140
150
  outputTokens: number,
141
151
  ): number {
142
152
  const pricing = CODEX_MODEL_PRICING[model as CodexModel];
143
- if (!pricing) return 0;
153
+ if (!pricing) {
154
+ if (!_warnedUnknownCodexModels.has(model)) {
155
+ _warnedUnknownCodexModels.add(model);
156
+ console.warn(
157
+ `[codex] unpriced model ${JSON.stringify(model)} — adapter cost will report $0; ` +
158
+ "server-side recompute will tag costSource='unpriced' if the pricing table has no rows.",
159
+ );
160
+ }
161
+ return 0;
162
+ }
144
163
  const uncachedInput = Math.max(0, inputTokens - cachedInputTokens);
145
164
  const inputCost = (uncachedInput / 1_000_000) * pricing.inputPerMillion;
146
165
  const cachedCost = (cachedInputTokens / 1_000_000) * pricing.cachedInputPerMillion;
@@ -279,6 +279,12 @@ class DevinSession implements ProviderSession {
279
279
  if (this.settled || this.aborted) return;
280
280
  this.pollCount += 1;
281
281
 
282
+ // Phase 8: Devin's session API does NOT report per-poll context-window
283
+ // info (the model is fully managed by Devin). We deliberately don't emit
284
+ // a synthetic `context_usage` event here — faking one with `contextUsedTokens=0`
285
+ // would be misleading. `peakContextTokens` stays null for devin tasks,
286
+ // which the UI surfaces as "not available" rather than "0".
287
+
282
288
  let response: DevinSessionResponse;
283
289
  try {
284
290
  response = await getSession(this.orgId, this.devinApiKey, this._sessionId!);
@@ -788,6 +794,11 @@ class DevinSession implements ProviderSession {
788
794
  numTurns: this.pollCount,
789
795
  model: "devin",
790
796
  isError,
797
+ // Phase 3 — tag CostData so the API recompute path engages. Devin's
798
+ // pricing is ACU-based (one row under `provider='devin', model='*',
799
+ // token_class='acu'`); the harness USD value above is already correct,
800
+ // but tagging the row exposes its source to the UI badge.
801
+ provider: "devin",
791
802
  };
792
803
  }
793
804
  }
@@ -12,7 +12,11 @@ import { existsSync, mkdirSync } from "node:fs";
12
12
  import { join } from "node:path";
13
13
  import type { AssistantMessage, Config, Event as OpencodeEvent } from "@opencode-ai/sdk";
14
14
  import { createOpencode } from "@opencode-ai/sdk";
15
- import { getContextWindowSize } from "../utils/context-window";
15
+ import {
16
+ CONTEXT_FORMULA,
17
+ clampContextPercent,
18
+ getContextWindowSize,
19
+ } from "../utils/context-window";
16
20
  import { validateOpencodeCredentials } from "../utils/credentials";
17
21
  import { fetchInstalledMcpServers } from "../utils/mcp-server-fetcher";
18
22
  import { scrubSecrets } from "../utils/secret-scrubber";
@@ -104,7 +108,7 @@ function resolvePluginPath(): string {
104
108
  return join(import.meta.dir, "../../plugin/opencode-plugins/agent-swarm.ts");
105
109
  }
106
110
 
107
- class OpencodeSession implements ProviderSession {
111
+ export class OpencodeSession implements ProviderSession {
108
112
  private _sessionId: string;
109
113
  private listeners: Array<(event: ProviderEvent) => void> = [];
110
114
  // Buffer for events emitted before any listener is attached.
@@ -115,6 +119,7 @@ class OpencodeSession implements ProviderSession {
115
119
  // leaving agent_tasks.provider/.model NULL. Buffer + flush on first attach.
116
120
  private pendingEvents: ProviderEvent[] = [];
117
121
  private completionResolve!: (result: ProviderResult) => void;
122
+ // biome-ignore lint/correctness/noUnusedPrivateClassMembers: reserved for future error-propagation paths; symmetric with completionResolve.
118
123
  private completionReject!: (err: Error) => void;
119
124
  private completionPromise: Promise<ProviderResult>;
120
125
  private server: { url: string; close(): void };
@@ -237,6 +242,15 @@ class OpencodeSession implements ProviderSession {
237
242
  case "message.updated": {
238
243
  const msg = ev.properties.info;
239
244
  if (!isAssistantMessage(msg) || msg.sessionID !== this._sessionId) break;
245
+ // Phase 9 fix: opencode fires `message.updated` repeatedly during a single
246
+ // assistant turn (streaming text deltas, tool transitions, etc.) and only
247
+ // populates `tokens`/`cost` on the FINAL update once `time.completed` is
248
+ // set. Accumulating on every event would either no-op (zero tokens) or —
249
+ // if opencode ever back-fills intermediate snapshots — multi-count. Gate
250
+ // the accumulator AND the context emit on the finalized signal so both
251
+ // paths see the same canonical "this turn is done" moment.
252
+ const messageFinalized = msg.time?.completed != null;
253
+ if (!messageFinalized) break;
240
254
  // Accumulate cost from each completed assistant message ("step")
241
255
  this.totalCostUsd += msg.cost;
242
256
  this.inputTokens += msg.tokens?.input ?? 0;
@@ -247,21 +261,31 @@ class OpencodeSession implements ProviderSession {
247
261
  if (!this.model && msg.modelID) this.model = msg.modelID;
248
262
 
249
263
  // Emit context_usage so the runner can POST /api/tasks/:id/context
250
- // (drives the dashboard's context-usage progress bar) and the
251
- // dashboard's activity timeline shows per-turn progress.
264
+ // (drives the dashboard's context-usage progress bar). The runner-side
265
+ // throttle (CONTEXT_THROTTLE_MS = 30s) means the FIRST emit wins for any
266
+ // short task — so this MUST carry real numbers, not the zero-tokens
267
+ // placeholder opencode sends on intermediate streaming updates. The
268
+ // `time.completed` gate above (in the accumulator block) guarantees we
269
+ // only land here for finalized messages.
252
270
  const turnInput = msg.tokens?.input ?? 0;
253
271
  const turnOutput = msg.tokens?.output ?? 0;
254
272
  const turnCacheRead = msg.tokens?.cache?.read ?? 0;
255
273
  const turnCacheWrite = msg.tokens?.cache?.write ?? 0;
256
- const contextUsed = turnInput + turnCacheRead + turnCacheWrite;
274
+ // Phase 8 + Phase 9: unified `input + cache + output` formula instead
275
+ // of the previous `input + cache_read + cache_write` (which omitted
276
+ // output and slightly mis-counted vs every other adapter).
277
+ const contextUsed = turnInput + turnCacheRead + turnCacheWrite + turnOutput;
257
278
  const contextTotal = getContextWindowSize(this.model || msg.modelID || "default");
258
- if (contextTotal > 0) {
279
+ if (contextTotal > 0 && contextUsed > 0) {
259
280
  this.emit({
260
281
  type: "context_usage",
261
282
  contextUsedTokens: contextUsed,
262
283
  contextTotalTokens: contextTotal,
263
- contextPercent: (contextUsed / contextTotal) * 100,
284
+ // Phase 8: clamp so a turn that briefly overshoots (e.g. due to
285
+ // a stale total) doesn't render as a 130% gauge in the UI.
286
+ contextPercent: clampContextPercent(contextUsed, contextTotal) ?? 0,
264
287
  outputTokens: turnOutput,
288
+ contextFormula: CONTEXT_FORMULA,
265
289
  });
266
290
  }
267
291
  break;