npm - @desplega.ai/agent-swarm - Versions diffs - 1.79.4 → 1.80.0 - Mend

@desplega.ai/agent-swarm 1.79.4 → 1.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

package/openapi.json +98 -19
package/package.json +12 -6
package/src/be/db.ts +101 -30
package/src/be/migrations/063_cost_context_schema_relax.sql +133 -0
package/src/be/pricing-normalize.ts +81 -0
package/src/be/seed-pricing.ts +293 -0
package/src/commands/claude-managed-setup.ts +19 -3
package/src/commands/runner.ts +592 -237
package/src/http/context.ts +6 -2
package/src/http/index.ts +115 -68
package/src/http/session-data.ts +74 -23
package/src/otel-impl.ts +200 -0
package/src/otel.ts +127 -0
package/src/providers/claude-adapter.ts +30 -5
package/src/providers/claude-managed-adapter.ts +43 -17
package/src/providers/claude-managed-pricing.ts +34 -0
package/src/providers/codex-adapter.ts +38 -27
package/src/providers/codex-models.ts +22 -3
package/src/providers/devin-adapter.ts +11 -0
package/src/providers/opencode-adapter.ts +31 -7
package/src/providers/pi-mono-adapter.ts +39 -7
package/src/providers/pricing-sources.md +52 -0
package/src/providers/swarm-events-shared.ts +8 -4
package/src/providers/types.ts +33 -10
package/src/server.ts +6 -0
package/src/tests/claude-managed-adapter.test.ts +17 -3
package/src/tests/claude-managed-setup.test.ts +10 -1
package/src/tests/codex-adapter.test.ts +20 -19
package/src/tests/context-snapshot.test.ts +2 -2
package/src/tests/context-window.test.ts +65 -1
package/src/tests/devin-adapter.test.ts +2 -0
package/src/tests/http/context-routes.test.ts +161 -0
package/src/tests/migration-063-schema-relax.test.ts +109 -0
package/src/tests/opencode-adapter.test.ts +146 -1
package/src/tests/otel-impl-secret-scrubbing.test.ts +33 -0
package/src/tests/pages-view-count.test.ts +30 -5
package/src/tests/providers/codex-cost.test.ts +18 -0
package/src/tests/providers/opencode-cost.test.ts +74 -0
package/src/tests/providers/pi-cost.test.ts +128 -0
package/src/tests/secret-scrubber.test.ts +19 -0
package/src/tests/session-costs-codex-recompute.test.ts +35 -22
package/src/tests/session-costs-model-key-normalize.test.ts +271 -0
package/src/tests/session-costs-recompute-all-providers.test.ts +170 -0
package/src/tests/store-progress-cost.test.ts +6 -1
package/src/tools/store-progress.ts +16 -60
package/src/tools/utils.ts +65 -12
package/src/types.ts +62 -9
package/src/utils/context-window.ts +104 -4
package/src/utils/secret-scrubber.ts +7 -0

package/src/tests/session-costs-recompute-all-providers.test.ts ADDED Viewed

@@ -0,0 +1,170 @@
+// Phase 2: POST /api/session-costs recompute fires for every provider with
+// seeded pricing rows — not just codex. Unknown (provider, model) pairs are
+// tagged `costSource='unpriced'`.
+import { afterAll, afterEach, beforeAll, describe, expect, test } from "bun:test";
+import { unlink } from "node:fs/promises";
+import {
+  createServer as createHttpServer,
+  type IncomingMessage,
+  type Server,
+  type ServerResponse,
+} from "node:http";
+import { closeDb, createAgent, getDb, initDb, insertPricingRow } from "../be/db";
+import { handleCore } from "../http/core";
+import { handleSessionData } from "../http/session-data";
+import { getPathSegments, parseQueryParams } from "../http/utils";
+const TEST_DB_PATH = "./test-recompute-all-providers.sqlite";
+const API_KEY = "test-recompute-all";
+/**
+ * Delete the file at `path` together with its `-wal` and `-shm` siblings.
+ * A file that does not exist (ENOENT) is ignored; any other unlink error is
+ * rethrown to the caller.
+ */
+async function removeDbFiles(path: string): Promise<void> {
+  const suffixes = ["", "-wal", "-shm"];
+  for (const suffix of suffixes) {
+    await unlink(`${path}${suffix}`).catch((error: unknown) => {
+      const alreadyGone = (error as NodeJS.ErrnoException).code === "ENOENT";
+      if (!alreadyGone) throw error;
+    });
+  }
+}
+/**
+ * Start `server` listening on port 0 and resolve with the port it was given.
+ *
+ * Rejects if the server emits `error` before it starts listening, so a failed
+ * bind surfaces as a rejected promise instead of a promise that never settles.
+ *
+ * @throws Error("no port") when `server.address()` is not an address object.
+ */
+async function listen(server: Server): Promise<number> {
+  await new Promise<void>((resolve, reject) => {
+    server.once("error", reject);
+    server.listen(0, () => {
+      // Listening succeeded — drop the startup-only error handler.
+      server.off("error", reject);
+      resolve();
+    });
+  });
+  const addr = server.address();
+  if (!addr || typeof addr === "string") throw new Error("no port");
+  return addr.port;
+}
+/**
+ * Build an HTTP server that offers each request to `handleCore` first and to
+ * `handleSessionData` second, answering 404 "Not Found" when neither reports
+ * having handled it. The agent id is read from the `x-agent-id` header.
+ */
+function createTestServer(apiKey: string): Server {
+  const onRequest = async (req: IncomingMessage, res: ServerResponse): Promise<void> => {
+    const myAgentId = req.headers["x-agent-id"] as string | undefined;
+    if (await handleCore(req, res, myAgentId, apiKey)) return;
+    const url = req.url || "";
+    const served = await handleSessionData(
+      req,
+      res,
+      getPathSegments(url),
+      parseQueryParams(url),
+      myAgentId,
+    );
+    if (served) return;
+    res.writeHead(404);
+    res.end("Not Found");
+  };
+  return createHttpServer(onRequest);
+}
+// Fixtures shared by every test in this file; all three are assigned in beforeAll.
+let server: Server;
+let port: number;
+let testAgent: { id: string };
+// Start from a fresh database file, create one agent, then bring the server up.
+beforeAll(async () => {
+  await removeDbFiles(TEST_DB_PATH);
+  initDb(TEST_DB_PATH);
+  testAgent = createAgent({ name: "recompute-all-test", isLead: false, status: "idle" });
+  server = createTestServer(API_KEY);
+  port = await listen(server);
+});
+// Tear down in reverse: stop the server, close the DB, delete the files.
+afterAll(async () => {
+  await new Promise<void>((done) => {
+    server.close(() => done());
+  });
+  closeDb();
+  await removeDbFiles(TEST_DB_PATH);
+});
+afterEach(() => {
+  const db = getDb();
+  // session_costs is emptied outright. For pricing, only rows with
+  // effective_from > 0 (the ones these tests insert) are removed, which
+  // leaves the migration-046 codex seeds alone.
+  const cleanupStatements = [
+    "DELETE FROM session_costs",
+    "DELETE FROM pricing WHERE effective_from > 0",
+  ];
+  for (const sql of cleanupStatements) {
+    db.prepare(sql).run();
+  }
+});
+/**
+ * `fetch` against the local test server with bearer auth and a JSON content type.
+ *
+ * Defaults are `Authorization: Bearer <API_KEY>` and
+ * `Content-Type: application/json`; headers supplied through `init.headers`
+ * override them. Headers are merged via the `Headers` class so every
+ * `HeadersInit` form (plain record, `[name, value]` tuples, `Headers`
+ * instance) is honoured — object spread only worked for plain records.
+ *
+ * @param path request path (plus any query string) appended to `http://localhost:<port>`.
+ * @param init extra `fetch` options; everything except `headers` is passed through unchanged.
+ * @returns the pending `fetch` response.
+ */
+function authedFetch(path: string, init: RequestInit = {}): Promise<Response> {
+  const headers = new Headers({
+    Authorization: `Bearer ${API_KEY}`,
+    "Content-Type": "application/json",
+  });
+  // Caller-supplied headers win over the defaults, matched case-insensitively.
+  new Headers(init.headers).forEach((value, name) => headers.set(name, value));
+  return fetch(`http://localhost:${port}${path}`, { ...init, headers });
+}
+interface CostResponse { // subset of the POST /api/session-costs response body that these tests read
+  success: boolean;
+  cost: {
+    totalCostUsd: number; // USD amount on the returned cost record
+    costSource: "harness" | "pricing-table" | "unpriced"; // where that amount came from
+  };
+}
+/**
+ * Insert one `input` and one `output` pricing row for `(provider, model)`,
+ * both with `effectiveFrom: 1`. Rates are USD per million tokens.
+ *
+ * `provider` is accepted as a plain string and cast to the provider type that
+ * `insertPricingRow` expects, so callers can pass loop variables directly.
+ */
+function seedTwoClassRates(provider: string, model: string, inputRate = 1, outputRate = 10) {
+  type ProviderArg = Parameters<typeof insertPricingRow>[0]["provider"];
+  const ratesByClass = [
+    ["input", inputRate],
+    ["output", outputRate],
+  ] as const;
+  for (const [tokenClass, pricePerMillionUsd] of ratesByClass) {
+    insertPricingRow({
+      provider: provider as ProviderArg,
+      model,
+      tokenClass,
+      effectiveFrom: 1,
+      pricePerMillionUsd,
+    });
+  }
+}
+describe("Phase 2 — POST /api/session-costs recompute fires for every provider", () => {
+  const providers = [
+    "claude",
+    "claude-managed",
+    "codex",
+    "pi",
+    "opencode",
+    "devin",
+    "gemini",
+  ] as const;
+  // One test per provider: seed input/output rates, submit a deliberately
+  // wrong worker-reported cost, and expect the response to carry the
+  // table-derived figure instead.
+  for (const provider of providers) {
+    test(`provider=${provider} with seeded rows → costSource='pricing-table'`, async () => {
+      const model = `${provider}-test-model`;
+      seedTwoClassRates(provider, model, 2, 10);
+      const payload = {
+        sessionId: `${provider}-recompute-1`,
+        agentId: testAgent.id,
+        totalCostUsd: 999.99, // worker-reported; expected to be overwritten
+        inputTokens: 1_000_000, // 1M input
+        outputTokens: 500_000, // 500k output
+        model,
+        provider,
+        durationMs: 1_000,
+        numTurns: 1,
+      };
+      const res = await authedFetch(`/api/session-costs`, {
+        method: "POST",
+        body: JSON.stringify(payload),
+      });
+      expect(res.status).toBe(201);
+      const { cost } = (await res.json()) as CostResponse;
+      expect(cost.costSource).toBe("pricing-table");
+      // 1M @ 2 + 0.5M @ 10 = $2 + $5 = $7
+      expect(cost.totalCostUsd).toBeCloseTo(7.0, 5);
+    });
+  }
+  test("unknown (provider, model) pair → costSource='unpriced', worker value preserved", async () => {
+    // No pricing rows are seeded for this model in this test, so the
+    // worker-reported amount is expected to come back untouched.
+    const workerReportedUsd = 1.23;
+    const payload = {
+      sessionId: "unpriced-1",
+      agentId: testAgent.id,
+      totalCostUsd: workerReportedUsd,
+      inputTokens: 100,
+      outputTokens: 50,
+      model: "gpt-future-2027",
+      provider: "codex",
+      durationMs: 1_000,
+      numTurns: 1,
+    };
+    const res = await authedFetch(`/api/session-costs`, {
+      method: "POST",
+      body: JSON.stringify(payload),
+    });
+    expect(res.status).toBe(201);
+    const { cost } = (await res.json()) as CostResponse;
+    expect(cost.costSource).toBe("unpriced");
+    expect(cost.totalCostUsd).toBe(workerReportedUsd);
+  });
+});

package/src/tests/store-progress-cost.test.ts CHANGED Viewed

@@ -24,7 +24,12 @@ type TestCostData = {
   model?: string;
 };
-describe("store-progress with cost data", () => {
+// Phase 11 NOTE: the `store-progress` MCP tool no longer accepts a `costData`
+// field — adapters are the sole writers of `session_costs`. These tests now
+// exercise the lower-level `createSessionCost` API directly, which the runner
+// still calls via `POST /api/session-costs`. They verify the DB write path
+// hasn't regressed, NOT the tool's input schema.
+describe("createSessionCost direct API (was: store-progress with cost data)", () => {
   let agentId: string;
   let taskId: string;

package/src/tools/store-progress.ts CHANGED Viewed

@@ -3,7 +3,6 @@ import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
 import * as z from "zod";
 import {
   completeTask,
-  createSessionCost,
   createTaskExtended,
   failTask,
   getAgentById,
@@ -24,32 +23,12 @@ import { AgentTaskSchema } from "@/types";
 import "./templates";
 import { validateJsonSchema } from "@/workflows/json-schema-validator";
-// Schema for optional cost data that agents can self-report.
-// In practice the harness adapter (claude/codex/opencode/etc.) is the
-// authoritative source of cost data — it gets written via
-// POST /api/session-costs from the runner. Agents calling store-progress
-// rarely know the real numbers and have been observed echoing the example
-// values from this schema (e.g. model="opus" on a gpt-5-nano run). The
-// handler below silently drops payloads where every numeric field is zero.
-const CostDataSchema = z
-  .object({
-    totalCostUsd: z.number().min(0).describe("Total cost in USD"),
-    inputTokens: z.number().int().min(0).optional().describe("Input tokens used"),
-    outputTokens: z.number().int().min(0).optional().describe("Output tokens used"),
-    cacheReadTokens: z.number().int().min(0).optional().describe("Cache read tokens"),
-    cacheWriteTokens: z.number().int().min(0).optional().describe("Cache write tokens"),
-    durationMs: z.number().int().min(0).optional().describe("Duration in milliseconds"),
-    numTurns: z.number().int().min(1).optional().describe("Number of turns/iterations"),
-    model: z
-      .string()
-      .optional()
-      .describe(
-        "Model identifier reported by the agent (only set if the agent has the real ID; do NOT echo the schema example).",
-      ),
-  })
-  .describe(
-    "Optional self-reported cost data. The harness adapter writes the authoritative cost record automatically — only pass this if you have real, non-zero numbers from a model that doesn't surface usage to the harness.",
-  );
+// Phase 11: the `cost` / `costData` field was removed from this tool's input
+// schema. Adapters (claude/codex/pi/opencode/devin/claude-managed) are the
+// sole writers of `session_costs` rows via `POST /api/session-costs`. Agents
+// calling `store-progress` rarely knew the real numbers and historically
+// echoed the schema example, producing noise rows keyed `mcp-<taskId>-<ts>`
+// that double-counted alongside the harness's authoritative entry.
 export const registerStoreProgressTool = (server: McpServer) => {
   createToolRegistrar(server)(
@@ -72,9 +51,10 @@ export const registerStoreProgressTool = (server: McpServer) => {
           .string()
           .optional()
           .describe("The reason for failure (used when failing)."),
-        costData: CostDataSchema.optional().describe(
-          "Optional cost data for tracking session costs. When provided, a session cost record will be created linked to this task.",
-        ),
+        // Phase 11: `costData` removed. The harness adapter is the sole
+        // writer of `session_costs` (see POST /api/session-costs in the
+        // runner). If a payload still includes the field, `z.object`'s
+        // default handling of unrecognized keys (strip) drops it silently.
       }),
       outputSchema: z.object({
         success: z.boolean(),
@@ -89,7 +69,7 @@ export const registerStoreProgressTool = (server: McpServer) => {
           ),
       }),
     },
-    async ({ taskId, progress, status, output, failureReason, costData }, requestInfo, _meta) => {
+    async ({ taskId, progress, status, output, failureReason }, requestInfo, _meta) => {
       if (!requestInfo.agentId) {
         return {
           content: [
@@ -254,35 +234,11 @@ export const registerStoreProgressTool = (server: McpServer) => {
           }
         }
-        // Store cost data only if the agent provided non-trivial numbers.
-        // Agents observed copying the schema example (e.g. model="opus"
-        // on a gpt-5-nano run) with all-zero token/cost fields, producing
-        // duplicate noise rows in session_costs alongside the harness's
-        // authoritative entry. Drop those silently.
-        const hasRealCost =
-          costData &&
-          (costData.totalCostUsd > 0 ||
-            (costData.inputTokens ?? 0) > 0 ||
-            (costData.outputTokens ?? 0) > 0 ||
-            (costData.cacheReadTokens ?? 0) > 0 ||
-            (costData.cacheWriteTokens ?? 0) > 0);
-        if (hasRealCost && requestInfo.agentId) {
-          createSessionCost({
-            sessionId: `mcp-${taskId}-${Date.now()}`, // Generate unique session ID for MCP-based tasks
-            taskId,
-            agentId: requestInfo.agentId,
-            totalCostUsd: costData.totalCostUsd,
-            inputTokens: costData.inputTokens ?? 0,
-            outputTokens: costData.outputTokens ?? 0,
-            cacheReadTokens: costData.cacheReadTokens ?? 0,
-            cacheWriteTokens: costData.cacheWriteTokens ?? 0,
-            durationMs: costData.durationMs ?? 0,
-            numTurns: costData.numTurns ?? 1,
-            model: costData.model ?? "unknown",
-            isError: status === "failed",
-          });
-        }
+        // Phase 11: removed the per-call `session_costs` insert. The harness
+        // adapter is the sole writer of cost rows now (via the runner's
+        // `POST /api/session-costs`); store-progress historically wrote a
+        // duplicate row keyed `mcp-<taskId>-<ts>` whenever an agent
+        // hallucinated a `costData` payload.
         return {
           success: true,

package/src/tools/utils.ts CHANGED Viewed

@@ -12,6 +12,8 @@ import type {
   ServerRequest,
   ToolAnnotations,
 } from "@modelcontextprotocol/sdk/types.js";
+import { withSpan } from "../otel";
+import { scrubSecrets } from "../utils/secret-scrubber";
 type Meta = RequestHandlerExtra<ServerRequest, ServerNotification>;
@@ -46,6 +48,38 @@ export const getRequestInfo = (req: Meta): RequestInfo => {
   };
 };
+const PREVIEW_LIMIT = 500;
+/**
+ * Render `value` as a short, secret-scrubbed string for use as a span attribute.
+ *
+ * Strings are used as-is; anything else goes through `JSON.stringify`. The
+ * text is passed through `scrubSecrets` before it is truncated. Output longer
+ * than `PREVIEW_LIMIT` UTF-16 code units is cut and suffixed with "...".
+ *
+ * @param value the tool arguments or result content to preview.
+ * @returns `undefined` when `value` is `undefined` or serializes to nothing
+ *   (an empty string, or a value `JSON.stringify` skips); "[unserializable]"
+ *   when serialization or scrubbing throws; otherwise the preview text.
+ */
+function previewValue(value: unknown): string | undefined {
+  if (value === undefined) return undefined;
+  try {
+    const serialized = typeof value === "string" ? value : JSON.stringify(value);
+    if (!serialized) return undefined;
+    const scrubbed = scrubSecrets(serialized);
+    if (scrubbed.length <= PREVIEW_LIMIT) return scrubbed;
+    // Never cut between the two halves of a surrogate pair: if the last kept
+    // code unit is a high surrogate, drop it so the preview stays well-formed.
+    const lastKept = scrubbed.charCodeAt(PREVIEW_LIMIT - 1);
+    const cut = lastKept >= 0xd800 && lastKept <= 0xdbff ? PREVIEW_LIMIT - 1 : PREVIEW_LIMIT;
+    return `${scrubbed.slice(0, cut)}...`;
+  } catch {
+    return "[unserializable]";
+  }
+}
+/**
+ * Attributes attached to an `mcp.tool` span when a tool call starts: the tool
+ * name, the session / agent / source-task ids taken from `requestInfo`, and a
+ * scrubbed preview of the arguments (`undefined` when no args were given).
+ */
+function toolRequestAttributes(name: string, requestInfo: RequestInfo, args?: unknown) {
+  const { sessionId, agentId, sourceTaskId } = requestInfo;
+  const argsPreview = previewValue(args);
+  return {
+    "mcp.tool.name": name,
+    "mcp.session.id": sessionId,
+    "agent.id": agentId,
+    "agentswarm.task.id": sourceTaskId,
+    "agentswarm.tool.args_preview": argsPreview,
+  };
+}
+/**
+ * Attributes describing a finished tool call: how many content items the
+ * result carries (0 when `content` is not an array), whether it is flagged as
+ * an error (a missing flag counts as `false`), and a scrubbed preview of the
+ * content.
+ */
+function toolResultAttributes(result: CallToolResult) {
+  const { content, isError } = result;
+  const contentCount = Array.isArray(content) ? content.length : 0;
+  return {
+    "mcp.tool.result_content_count": contentCount,
+    "mcp.tool.is_error": isError ?? false,
+    "agentswarm.tool.result_preview": previewValue(content),
+  };
+}
 // Infer the input type from the schema
 type InferInput<Args extends undefined | ZodRawShapeCompat | AnySchema> =
   Args extends ZodRawShapeCompat
@@ -104,23 +138,42 @@ export const createToolRegistrar = (server: McpServer) => {
     // When inputSchema is undefined, the MCP SDK calls handler(extra) with a single arg.
     // When inputSchema is defined, it calls handler(args, extra) with two args.
     if (config.inputSchema === undefined) {
-      return server.registerTool(name, config, ((meta: Meta) => {
+      return server.registerTool(name, config, (async (meta: Meta) => {
         const requestInfo = getRequestInfo(meta);
-        return (
-          cb as (requestInfo: RequestInfo, meta: Meta) => CallToolResult | Promise<CallToolResult>
-        )(requestInfo, meta);
+        return withSpan(
+          "mcp.tool",
+          async (span) => {
+            const result = await (
+              cb as (
+                requestInfo: RequestInfo,
+                meta: Meta,
+              ) => CallToolResult | Promise<CallToolResult>
+            )(requestInfo, meta);
+            span.setAttributes(toolResultAttributes(result));
+            return result;
+          },
+          toolRequestAttributes(name, requestInfo),
+        );
       }) as Parameters<typeof server.registerTool>[2]);
     }
-    return server.registerTool(name, config, ((args: InferInput<InputArgs>, meta: Meta) => {
+    return server.registerTool(name, config, (async (args: InferInput<InputArgs>, meta: Meta) => {
       const requestInfo = getRequestInfo(meta);
-      return (
-        cb as (
-          args: InferInput<InputArgs>,
-          requestInfo: RequestInfo,
-          meta: Meta,
-        ) => CallToolResult | Promise<CallToolResult>
-      )(args, requestInfo, meta);
+      return withSpan(
+        "mcp.tool",
+        async (span) => {
+          const result = await (
+            cb as (
+              args: InferInput<InputArgs>,
+              requestInfo: RequestInfo,
+              meta: Meta,
+            ) => CallToolResult | Promise<CallToolResult>
+          )(args, requestInfo, meta);
+          span.setAttributes(toolResultAttributes(result));
+          return result;
+        },
+        toolRequestAttributes(name, requestInfo, args),
+      );
     }) as Parameters<typeof server.registerTool>[2]);
   };
 };

package/src/types.ts CHANGED Viewed

@@ -192,7 +192,10 @@ export const AgentTaskSchema = z.object({
   // Context usage aggregates
   compactionCount: z.number().int().min(0).optional(),
   peakContextPercent: z.number().min(0).max(100).optional(),
-  totalContextTokensUsed: z.number().int().min(0).optional(),
+  // Migration 063: renamed from totalContextTokensUsed. Semantic is now a
+  // monotonic max across the task's snapshots — "high water mark" rather than
+  // "latest known".
+  peakContextTokens: z.number().int().min(0).optional(),
   contextWindowSize: z.number().int().min(0).optional(),
   // Credential tracking
@@ -574,7 +577,9 @@ export const SessionLogSchema = z.object({
 export type SessionLog = z.infer<typeof SessionLogSchema>;
 // Session Cost Types (aggregated cost data per session)
-export const SessionCostSourceSchema = z.enum(["harness", "pricing-table"]);
+// Migration 063 widened the set to include 'unpriced' for cases where the API
+// recompute path couldn't find pricing rows for the (provider, model, token_class).
+export const SessionCostSourceSchema = z.enum(["harness", "pricing-table", "unpriced"]);
 export type SessionCostSource = z.infer<typeof SessionCostSourceSchema>;
 export const SessionCostSchema = z.object({
@@ -587,13 +592,22 @@ export const SessionCostSchema = z.object({
   outputTokens: z.number().int().min(0).default(0),
   cacheReadTokens: z.number().int().min(0).default(0),
   cacheWriteTokens: z.number().int().min(0).default(0),
+  // Migration 063: reasoning_output_tokens from codex turn.completed events.
+  reasoningOutputTokens: z.number().int().min(0).default(0),
+  // Migration 063: thinking_input_tokens from claude extended-thinking flows.
+  thinkingTokens: z.number().int().min(0).default(0),
   durationMs: z.number().int().min(0),
-  numTurns: z.number().int().min(1),
+  // numTurns is nullable — some adapters (e.g. Claude when num_turns is absent)
+  // can't honestly report a turn count. We prefer null over a faked 1.
+  numTurns: z.number().int().min(1).nullable(),
   model: z.string(),
   isError: z.boolean().default(false),
-  // Phase 6: where the recorded totalCostUsd came from. New rows write the
-  // actual source ('pricing-table' when the API recomputed Codex USD from DB
-  // pricing rows, 'harness' otherwise). Defaults to 'harness' for back-compat.
+  // Phase 6 (extended by migration 063): where the recorded totalCostUsd came from.
+  //   'harness'        — value reported by the harness as-is.
+  //   'pricing-table'  — value recomputed by the API from `pricing` rows.
+  //   'unpriced'       — the API tried to recompute but the (provider, model)
+  //                      had no matching pricing rows; totalCostUsd is whatever
+  //                      the worker submitted (often 0).
   costSource: SessionCostSourceSchema.default("harness"),
   createdAt: z.iso.datetime(),
 });
@@ -1381,6 +1395,21 @@ export type McpServerWithInstallInfo = z.infer<typeof McpServerWithInstallInfoSc
 export const ContextSnapshotEventTypeSchema = z.enum(["progress", "compaction", "completion"]);
 export type ContextSnapshotEventType = z.infer<typeof ContextSnapshotEventTypeSchema>;
+// Migration 063: the formula the emitting adapter used to compute a snapshot's
+// contextUsedTokens (stored as `contextFormula`). Lets downstream consumers (UI
+// badges, cross-provider comparisons) tell whether two numbers are commensurable.
+// Values match the inline doc in `src/be/migrations/063_cost_context_schema_relax.sql`.
+export const ContextFormulaSchema = z.enum([
+  "input-cache-output", // unified formula (post-Phase 9): input + cache_read + cache_create + output
+  "input-cache-no-output", // pre-unification claude formula
+  "input-output-no-cache", // pre-unification claude-managed formula
+  "peak-proxy", // pre-unification codex formula
+  "pi-delegated", // numbers come from the pi-ai SDK
+  "harness-reported", // numbers come from a harness API (devin)
+  "unknown", // pre-migration backfill, or the adapter didn't tag the snapshot
+]);
+export type ContextFormula = z.infer<typeof ContextFormulaSchema>;
 export const ContextSnapshotSchema = z.object({
   id: z.uuid(),
   taskId: z.uuid(),
@@ -1396,13 +1425,18 @@ export const ContextSnapshotSchema = z.object({
   eventType: ContextSnapshotEventTypeSchema,
   // Compaction-specific (null for non-compaction)
-  compactTrigger: z.enum(["auto", "manual"]).optional(),
+  compactTrigger: z.enum(["auto", "manual", "auto-inferred"]).optional(),
   preCompactTokens: z.number().int().min(0).optional(),
   // Cumulative counters at this point
   cumulativeInputTokens: z.number().int().min(0).default(0),
   cumulativeOutputTokens: z.number().int().min(0).default(0),
+  // Migration 063 — adapter stamps the formula it used to compute
+  // contextUsedTokens. Optional so old rows / new providers without a tag
+  // don't break, but every adapter should populate this going forward.
+  contextFormula: ContextFormulaSchema.optional(),
   createdAt: z.iso.datetime(),
 });
@@ -1430,10 +1464,29 @@ export const BudgetSchema = z.object({
 });
 export type Budget = z.infer<typeof BudgetSchema>;
-export const PricingProviderSchema = z.enum(["claude", "codex", "pi"]);
+// Migration 063 widened both enums and dropped the SQL CHECKs to match.
+// New providers can land without an accompanying schema migration; Zod is now
+// the single source of truth for what's a valid (provider, token_class) row.
+export const PricingProviderSchema = z.enum([
+  "claude",
+  "claude-managed",
+  "codex",
+  "pi",
+  "opencode",
+  "devin",
+  "gemini",
+]);
 export type PricingProvider = z.infer<typeof PricingProviderSchema>;
-export const PricingTokenClassSchema = z.enum(["input", "cached_input", "output"]);
+export const PricingTokenClassSchema = z.enum([
+  "input",
+  "cached_input",
+  "output",
+  // Migration 063 additions:
+  "cache_write", // claude / claude-managed cache creation
+  "runtime_hour", // claude-managed runtime fee per hour
+  "acu", // devin Agent Compute Unit
+]);
 export type PricingTokenClass = z.infer<typeof PricingTokenClassSchema>;
 export const PricingRowSchema = z.object({

package/src/utils/context-window.ts CHANGED Viewed

@@ -2,31 +2,95 @@
  * Context window size lookup and usage computation utilities.
  *
  * This module is safe for both API and worker code — it has NO database imports.
+ *
+ * Phase 4 + Phase 9 of the cost-tracking plan:
+ *   - `getContextWindowSize` now resolves shortnames, family-versioned ids
+ *     (`claude-sonnet-4-6`), AND dated full ids (`claude-sonnet-4-6-20251004`)
+ *     by stripping the trailing date suffix. Previously the dated form fell
+ *     to the 200k default — wildly wrong for sonnet/opus 4.x.
+ *   - `computeContextUsedUnified` is the canonical formula every adapter
+ *     should use when emitting a `context_usage` event:
+ *       contextUsedTokens = input + cache_read + cache_create + output
+ *     The matching `CONTEXT_FORMULA` constant is what adapters stamp onto
+ *     the snapshot's `contextFormula` field.
+ *   - The legacy `computeContextUsed` stays for back-compat reads but is
+ *     deprecated; new code should use `computeContextUsedUnified`.
+ */
+/**
+ * Phase 9: stamp this onto every `context_usage` event the adapter emits.
+ * Callers that compute their own number for legacy reasons (e.g. pi-mono
+ * delegates to the pi-ai SDK) use a different value — see `ContextFormula`
+ * in `src/types.ts`.
  */
+export const CONTEXT_FORMULA = "input-cache-output" as const;
 const CONTEXT_WINDOW_DEFAULTS: Record<string, number> = {
+  // Anthropic 4.x family
+  "claude-opus-4-7": 1_000_000,
   "claude-opus-4-6": 1_000_000,
+  "claude-opus-4-5": 1_000_000,
+  "claude-opus-4-1": 200_000,
+  "claude-opus-4-0": 200_000,
   "claude-sonnet-4-6": 1_000_000,
+  "claude-sonnet-4-5": 1_000_000,
+  "claude-sonnet-4-0": 200_000,
   "claude-haiku-4-5": 200_000,
+  // Anthropic 3.x family (legacy)
+  "claude-3-7-sonnet": 200_000,
+  "claude-3-5-sonnet": 200_000,
+  "claude-3-5-haiku": 200_000,
+  "claude-3-opus": 200_000,
+  "claude-3-sonnet": 200_000,
+  "claude-3-haiku": 200_000,
+  // Shortnames used by the local-CLI adapter and pi-mono OpenRouter mirror.
   opus: 1_000_000,
   sonnet: 1_000_000,
   haiku: 200_000,
   default: 200_000,
 };
+const DEFAULT_CONTEXT_WINDOW = 200_000;
 /**
- * Look up the context window size (in tokens) for a given model identifier.
- * Falls back to the "default" entry when the model is not explicitly mapped.
+ * Strip a trailing date suffix from a Claude model id so dated full ids
+ * resolve to the same window as the family-versioned id.
+ *
+ * `claude-sonnet-4-6-20251004` → `claude-sonnet-4-6`
+ * `claude-haiku-4-5-20251001`  → `claude-haiku-4-5`
+ *
+ * Only a final `-` followed by exactly eight digits is removed; the digits are
+ * not checked to be a real calendar date, and ids without such a suffix are
+ * returned unchanged. The rule does not depend on what precedes the suffix,
+ * so it applies equally to ids shaped like `claude-3-5-sonnet-<YYYYMMDD>`.
  */
-const DEFAULT_CONTEXT_WINDOW = 200_000;
+function stripAnthropicDateSuffix(model: string): string {
+  return model.replace(/-(\d{8})$/, "");
+}
 export function getContextWindowSize(model: string): number {
-  return CONTEXT_WINDOW_DEFAULTS[model] ?? DEFAULT_CONTEXT_WINDOW;
+  // Fast path: exact match (shortname or family-versioned id).
+  if (CONTEXT_WINDOW_DEFAULTS[model] !== undefined) {
+    return CONTEXT_WINDOW_DEFAULTS[model];
+  }
+  // Dated full id → strip suffix and retry.
+  const stripped = stripAnthropicDateSuffix(model);
+  if (stripped !== model && CONTEXT_WINDOW_DEFAULTS[stripped] !== undefined) {
+    return CONTEXT_WINDOW_DEFAULTS[stripped];
+  }
+  // OpenAI / GPT family — most reasoning models have 200k+; we keep this
+  // conservative and let callers override via models.dev rates if they want.
+  // Specific gpt-5.x context windows are >1M but the local-CLI adapter
+  // generally doesn't surface those; the API recompute path uses the rate
+  // table, not the window. The 200k default keeps the math safe.
+  return DEFAULT_CONTEXT_WINDOW;
 }
 /**
  * Compute the total context tokens used from a Claude API usage object.
  * Sums input_tokens + cache_creation_input_tokens + cache_read_input_tokens.
+ *
+ * @deprecated Phase 9 — use {@link computeContextUsedUnified} instead. This
+ * variant excludes output tokens, which is the wrong number when the goal is
+ * "how full is the model's context window right now."
  */
 export function computeContextUsed(usage: {
   input_tokens?: number | null;
@@ -39,3 +103,39 @@ export function computeContextUsed(usage: {
     (usage.cache_read_input_tokens ?? 0)
   );
 }
+/**
+ * Phase 9: unified "context used" total for `context_usage` events.
+ *
+ * Adds up input, cache-read, cache-create and output token counts; a field
+ * that is missing or `null` contributes 0, so an empty object yields 0.
+ * Cross-provider comparisons (claude vs codex vs pi) are only meaningful when
+ * every adapter uses this same sum.
+ *
+ * Callers should check `contextTotal` separately and emit `null` for
+ * `contextPercent` when the window is unknown.
+ */
+export function computeContextUsedUnified(parts: {
+  inputTokens?: number | null;
+  cacheReadTokens?: number | null;
+  cacheCreateTokens?: number | null;
+  outputTokens?: number | null;
+}): number {
+  const { inputTokens, cacheReadTokens, cacheCreateTokens, outputTokens } = parts;
+  return [inputTokens, cacheReadTokens, cacheCreateTokens, outputTokens]
+    .map((count) => count ?? 0)
+    .reduce((sum, count) => sum + count);
+}
+/**
+ * Phase 9: express `used / total` as a percentage clamped to [0, 100].
+ *
+ * Returns `null` — so callers can show "unknown" instead of NaN/∞ — when
+ * `total` is missing, zero, negative or NaN, or when the computed percentage
+ * is not finite.
+ */
+export function clampContextPercent(used: number, total: number | null | undefined): number | null {
+  if (total == null || !(total > 0)) return null;
+  const percent = (used / total) * 100;
+  if (!Number.isFinite(percent)) return null;
+  const atLeastZero = Math.max(0, percent);
+  return Math.min(100, atLeastZero);
+}