@desplega.ai/agent-swarm 1.79.3 → 1.80.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openapi.json +98 -19
- package/package.json +12 -6
- package/src/be/db.ts +101 -30
- package/src/be/migrations/063_cost_context_schema_relax.sql +133 -0
- package/src/be/pricing-normalize.ts +81 -0
- package/src/be/seed-pricing.ts +293 -0
- package/src/commands/claude-managed-setup.ts +19 -3
- package/src/commands/runner.ts +592 -237
- package/src/http/context.ts +6 -2
- package/src/http/index.ts +115 -68
- package/src/http/session-data.ts +74 -23
- package/src/otel-impl.ts +200 -0
- package/src/otel.ts +127 -0
- package/src/providers/claude-adapter.ts +30 -5
- package/src/providers/claude-managed-adapter.ts +43 -17
- package/src/providers/claude-managed-pricing.ts +34 -0
- package/src/providers/codex-adapter.ts +38 -27
- package/src/providers/codex-models.ts +22 -3
- package/src/providers/devin-adapter.ts +11 -0
- package/src/providers/opencode-adapter.ts +31 -7
- package/src/providers/pi-mono-adapter.ts +39 -7
- package/src/providers/pricing-sources.md +52 -0
- package/src/providers/swarm-events-shared.ts +8 -4
- package/src/providers/types.ts +33 -10
- package/src/server.ts +6 -0
- package/src/tests/claude-managed-adapter.test.ts +17 -3
- package/src/tests/claude-managed-setup.test.ts +10 -1
- package/src/tests/codex-adapter.test.ts +20 -19
- package/src/tests/context-snapshot.test.ts +2 -2
- package/src/tests/context-window.test.ts +65 -1
- package/src/tests/devin-adapter.test.ts +2 -0
- package/src/tests/http/context-routes.test.ts +161 -0
- package/src/tests/migration-063-schema-relax.test.ts +109 -0
- package/src/tests/opencode-adapter.test.ts +146 -1
- package/src/tests/otel-impl-secret-scrubbing.test.ts +33 -0
- package/src/tests/pages-view-count.test.ts +30 -5
- package/src/tests/providers/codex-cost.test.ts +18 -0
- package/src/tests/providers/opencode-cost.test.ts +74 -0
- package/src/tests/providers/pi-cost.test.ts +128 -0
- package/src/tests/secret-scrubber.test.ts +19 -0
- package/src/tests/session-costs-codex-recompute.test.ts +35 -22
- package/src/tests/session-costs-model-key-normalize.test.ts +271 -0
- package/src/tests/session-costs-recompute-all-providers.test.ts +170 -0
- package/src/tests/store-progress-cost.test.ts +6 -1
- package/src/tools/store-progress.ts +16 -60
- package/src/tools/utils.ts +65 -12
- package/src/types.ts +62 -9
- package/src/utils/context-window.ts +104 -4
- package/src/utils/secret-scrubber.ts +7 -0
|
@@ -264,7 +264,7 @@ function cleanupAgentsMdSymlink(cwd: string): void {
|
|
|
264
264
|
}
|
|
265
265
|
}
|
|
266
266
|
|
|
267
|
-
class PiMonoSession implements ProviderSession {
|
|
267
|
+
export class PiMonoSession implements ProviderSession {
|
|
268
268
|
private listeners: Array<(event: ProviderEvent) => void> = [];
|
|
269
269
|
private eventQueue: ProviderEvent[] = [];
|
|
270
270
|
private _sessionId: string | undefined;
|
|
@@ -275,6 +275,14 @@ class PiMonoSession implements ProviderSession {
|
|
|
275
275
|
private logFileHandle: ReturnType<ReturnType<typeof Bun.file>["writer"]>;
|
|
276
276
|
/** Track last emitted message text to avoid duplicates across turns */
|
|
277
277
|
private lastEmittedMessage = "";
|
|
278
|
+
/** Phase 7: wallclock start so we can populate `durationMs` on the cost row. */
|
|
279
|
+
private sessionStartedAt: number = Date.now();
|
|
280
|
+
/**
|
|
281
|
+
* Phase 7: previous output-token total — used to derive per-turn delta for
|
|
282
|
+
* `context_usage.outputTokens` since pi-ai's `getContextUsage()` doesn't
|
|
283
|
+
* surface it directly.
|
|
284
|
+
*/
|
|
285
|
+
private prevOutputTokens = 0;
|
|
278
286
|
|
|
279
287
|
constructor(agentSession: AgentSession, config: ProviderSessionConfig, createdSymlink: boolean) {
|
|
280
288
|
this.agentSession = agentSession;
|
|
@@ -282,6 +290,7 @@ class PiMonoSession implements ProviderSession {
|
|
|
282
290
|
this.createdSymlink = createdSymlink;
|
|
283
291
|
this.logFileHandle = Bun.file(config.logFile).writer();
|
|
284
292
|
this._sessionId = agentSession.sessionId;
|
|
293
|
+
this.sessionStartedAt = Date.now();
|
|
285
294
|
|
|
286
295
|
// Emit session_init immediately
|
|
287
296
|
this.emit({ type: "session_init", sessionId: this._sessionId, provider: "pi" });
|
|
@@ -293,6 +302,18 @@ class PiMonoSession implements ProviderSession {
|
|
|
293
302
|
this.completionPromise = this.runSession();
|
|
294
303
|
}
|
|
295
304
|
|
|
305
|
+
/**
|
|
306
|
+
* Canonical model slug for downstream reporting (latestModel, raw_log envelopes).
|
|
307
|
+
* Composes `${provider}/${id}` from the resolved pi-ai model so the UI snapshot
|
|
308
|
+
* lookup matches (e.g. `openrouter/deepseek/deepseek-v4-flash`). Falls back to
|
|
309
|
+
* the configured model string if the session didn't resolve one.
|
|
310
|
+
*/
|
|
311
|
+
private reportedModel(): string {
|
|
312
|
+
const m = this.agentSession.model;
|
|
313
|
+
if (m) return `${m.provider}/${m.id}`;
|
|
314
|
+
return this.config.model;
|
|
315
|
+
}
|
|
316
|
+
|
|
296
317
|
private emit(event: ProviderEvent): void {
|
|
297
318
|
// Scrub secrets from raw_log / raw_stderr content before egress (log file
|
|
298
319
|
// write, listener dispatch, downstream session-logs push + pretty-print).
|
|
@@ -329,7 +350,7 @@ class PiMonoSession implements ProviderSession {
|
|
|
329
350
|
.trim()
|
|
330
351
|
: String(msg.content || "").trim();
|
|
331
352
|
if (text && text !== this.lastEmittedMessage) {
|
|
332
|
-
const model = this.
|
|
353
|
+
const model = this.reportedModel();
|
|
333
354
|
this.emit({
|
|
334
355
|
type: "raw_log",
|
|
335
356
|
content: JSON.stringify({
|
|
@@ -344,21 +365,30 @@ class PiMonoSession implements ProviderSession {
|
|
|
344
365
|
this.lastEmittedMessage = text;
|
|
345
366
|
}
|
|
346
367
|
}
|
|
347
|
-
// Emit context_usage for dashboard tracking
|
|
368
|
+
// Emit context_usage for dashboard tracking.
|
|
369
|
+
// Phase 7: derive `outputTokens` from `SessionStats` delta (pi-ai's
|
|
370
|
+
// `getContextUsage()` doesn't expose per-turn output tokens, but the
|
|
371
|
+
// session-stats counter is monotonic so a delta is correct).
|
|
348
372
|
const usage = this.agentSession.getContextUsage();
|
|
349
373
|
if (usage && usage.tokens != null) {
|
|
374
|
+
const stats = this.agentSession.getSessionStats();
|
|
375
|
+
const currOutput = stats?.tokens?.output ?? 0;
|
|
376
|
+
const outputDelta = Math.max(0, currOutput - this.prevOutputTokens);
|
|
377
|
+
this.prevOutputTokens = currOutput;
|
|
350
378
|
this.emit({
|
|
351
379
|
type: "context_usage",
|
|
352
380
|
contextUsedTokens: usage.tokens,
|
|
353
381
|
contextTotalTokens: usage.contextWindow,
|
|
354
382
|
contextPercent: usage.percent ?? 0,
|
|
355
|
-
outputTokens:
|
|
383
|
+
outputTokens: outputDelta,
|
|
384
|
+
// Phase 9: pi-ai owns the formula — we just relay its number.
|
|
385
|
+
contextFormula: "pi-delegated",
|
|
356
386
|
});
|
|
357
387
|
}
|
|
358
388
|
break;
|
|
359
389
|
}
|
|
360
390
|
case "tool_execution_start": {
|
|
361
|
-
const model = this.
|
|
391
|
+
const model = this.reportedModel();
|
|
362
392
|
this.emit({
|
|
363
393
|
type: "raw_log",
|
|
364
394
|
content: JSON.stringify({
|
|
@@ -489,9 +519,11 @@ class PiMonoSession implements ProviderSession {
|
|
|
489
519
|
outputTokens: stats.tokens.output,
|
|
490
520
|
cacheReadTokens: stats.tokens.cacheRead,
|
|
491
521
|
cacheWriteTokens: stats.tokens.cacheWrite,
|
|
492
|
-
|
|
522
|
+
// Phase 7: real wallclock duration; pi-ai SessionStats doesn't carry
|
|
523
|
+
// one so we track it on this adapter instance.
|
|
524
|
+
durationMs: Date.now() - this.sessionStartedAt,
|
|
493
525
|
numTurns: stats.userMessages + stats.assistantMessages,
|
|
494
|
-
model: this.
|
|
526
|
+
model: this.reportedModel(),
|
|
495
527
|
isError: false,
|
|
496
528
|
provider: "pi",
|
|
497
529
|
};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# Pricing sources
|
|
2
|
+
|
|
3
|
+
This page lists the sources that feed the `pricing` table at server boot.
|
|
4
|
+
Operators bumping a rate by hand should also update this file.
|
|
5
|
+
|
|
6
|
+
## Primary: vendored models.dev snapshot
|
|
7
|
+
|
|
8
|
+
- **Path**: `ui/src/lib/modelsdev-cache.json`
|
|
9
|
+
- **Loaded by**: `src/be/seed-pricing.ts` → `seedPricingFromModelsDev()`,
|
|
10
|
+
called from `src/server.ts` after `initDb`.
|
|
11
|
+
- **Projection rules** (see the same module for code-level detail):
|
|
12
|
+
- Anthropic models → rows under `provider='claude'` AND `provider='claude-managed'`.
|
|
13
|
+
Shortnames (`opus`, `sonnet`, `haiku`) ALSO get rows keyed by the current
|
|
14
|
+
default full id (e.g. `opus → claude-opus-4-7`). Pi-mono uses the same
|
|
15
|
+
shortname forms, so they're projected under `provider='pi'` as well.
|
|
16
|
+
- OpenAI models → rows under `provider='codex'`.
|
|
17
|
+
- OpenRouter models → rows under `provider='opencode'`. Any `google/...`
|
|
18
|
+
row additionally gets projected under `provider='gemini'` (both the
|
|
19
|
+
stripped name and the full `google/...` id) so internal-ai callers find
|
|
20
|
+
a hit either way.
|
|
21
|
+
|
|
22
|
+
- **Refresh procedure** (the only place to update the snapshot):
|
|
23
|
+
- Run `bun run scripts/refresh-modelsdev-pricing.ts` (Phase 2 — adds the
|
|
24
|
+
script). It fetches the latest snapshot from models.dev, diffs against
|
|
25
|
+
the vendored copy, prints a summary, and writes the new file.
|
|
26
|
+
- Commit the regenerated `modelsdev-cache.json` together with a bump
|
|
27
|
+
note in the PR description.
|
|
28
|
+
|
|
29
|
+
## Manual overrides
|
|
30
|
+
|
|
31
|
+
Two cost components models.dev doesn't carry are encoded in
|
|
32
|
+
`MANUAL_PRICING_OVERRIDES` inside `src/be/seed-pricing.ts`:
|
|
33
|
+
|
|
34
|
+
| Provider | Model | Token class | Rate | Source | Verified |
|
|
35
|
+
|------------------|-------|----------------|--------------|---------------------------------------------------------------------------------|------------|
|
|
36
|
+
| `claude-managed` | `*` | `runtime_hour` | $0.08 / hour | <https://docs.claude.com/en/api/agent-sdk/managed-runtime#pricing> | 2026-04-28 |
|
|
37
|
+
| `devin` | `*` | `acu` | $2.25 / ACU | <https://devin.ai/pricing> | 2026-04-28 |
|
|
38
|
+
|
|
39
|
+
The `pricePerMillionUsd` column carries these as `rate * 1_000_000` so the
|
|
40
|
+
same schema fits — the adapter scales by the underlying unit (hours / ACUs),
|
|
41
|
+
not by tokens. The unit convention is specific to those `token_class` values.
|
|
42
|
+
|
|
43
|
+
## When a model is missing
|
|
44
|
+
|
|
45
|
+
If `POST /api/session-costs` arrives with a `(provider, model)` pair that has
|
|
46
|
+
no input/output pricing rows at lookup time, the row is persisted with
|
|
47
|
+
`costSource='unpriced'` (rather than `'harness'`). The UI surfaces this as a
|
|
48
|
+
yellow badge.
|
|
49
|
+
|
|
50
|
+
To fix: either add the model to `modelsdev-cache.json` (preferred — the
|
|
51
|
+
upstream snapshot probably needs refreshing) or add a manual override row via
|
|
52
|
+
the existing admin route `POST /api/pricing`.
|
|
@@ -167,8 +167,10 @@ export function createSwarmEventHandler(
|
|
|
167
167
|
|
|
168
168
|
const progressContextUsage = (event: {
|
|
169
169
|
contextUsedTokens: number;
|
|
170
|
-
|
|
171
|
-
|
|
170
|
+
// Migration 063: nullable for adapters that can't resolve a context window.
|
|
171
|
+
contextTotalTokens: number | null;
|
|
172
|
+
contextPercent: number | null;
|
|
173
|
+
contextFormula?: string;
|
|
172
174
|
}): void => {
|
|
173
175
|
if (opts.taskId && shouldRun("context-progress", CONTEXT_THROTTLE_MS)) {
|
|
174
176
|
fireAndForget(`${opts.apiUrl}/api/tasks/${encodeURIComponent(opts.taskId)}/context`, {
|
|
@@ -178,8 +180,9 @@ export function createSwarmEventHandler(
|
|
|
178
180
|
eventType: "progress",
|
|
179
181
|
sessionId: sessionId ?? `${opts.sessionIdFallbackPrefix ?? "session"}-${opts.taskId}`,
|
|
180
182
|
contextUsedTokens: event.contextUsedTokens,
|
|
181
|
-
contextTotalTokens: event.contextTotalTokens,
|
|
182
|
-
contextPercent: event.contextPercent,
|
|
183
|
+
contextTotalTokens: event.contextTotalTokens ?? undefined,
|
|
184
|
+
contextPercent: event.contextPercent ?? undefined,
|
|
185
|
+
contextFormula: event.contextFormula,
|
|
183
186
|
}),
|
|
184
187
|
});
|
|
185
188
|
}
|
|
@@ -239,6 +242,7 @@ export function createSwarmEventHandler(
|
|
|
239
242
|
contextUsedTokens: event.contextUsedTokens,
|
|
240
243
|
contextTotalTokens: event.contextTotalTokens,
|
|
241
244
|
contextPercent: event.contextPercent,
|
|
245
|
+
contextFormula: event.contextFormula,
|
|
242
246
|
});
|
|
243
247
|
break;
|
|
244
248
|
}
|
package/src/providers/types.ts
CHANGED
|
@@ -7,18 +7,30 @@ export interface CostData {
|
|
|
7
7
|
inputTokens?: number;
|
|
8
8
|
outputTokens?: number;
|
|
9
9
|
cacheReadTokens?: number;
|
|
10
|
+
/**
|
|
11
|
+
* Migration 063: undefined means "the harness can't report this" (e.g. the
|
|
12
|
+
* Codex SDK has no cache-write field). Zero is reserved for "really zero".
|
|
13
|
+
*/
|
|
10
14
|
cacheWriteTokens?: number;
|
|
15
|
+
/** Migration 063: codex reasoning_output_tokens (and similar) for reasoning models. */
|
|
16
|
+
reasoningOutputTokens?: number;
|
|
17
|
+
/** Migration 063: claude extended-thinking tokens from CLI's `usage.thinking_input_tokens`. */
|
|
18
|
+
thinkingTokens?: number;
|
|
11
19
|
durationMs: number;
|
|
12
|
-
|
|
20
|
+
/**
|
|
21
|
+
* Migration 063: nullable — some adapters (claude when `num_turns` is absent)
|
|
22
|
+
* can't honestly report a turn count; null is preferred over a faked 1.
|
|
23
|
+
*/
|
|
24
|
+
numTurns: number | null;
|
|
13
25
|
model: string;
|
|
14
26
|
isError: boolean;
|
|
15
27
|
/**
|
|
16
|
-
* Phase 6: tells the API which recompute path to
|
|
17
|
-
* `POST /api/session-costs`.
|
|
18
|
-
*
|
|
19
|
-
*
|
|
28
|
+
* Phase 6 (extended in migration 063): tells the API which recompute path to
|
|
29
|
+
* use on `POST /api/session-costs`. After Phase 2 the recompute path runs
|
|
30
|
+
* for every provider with seeded pricing rows, so every adapter should
|
|
31
|
+
* populate this field.
|
|
20
32
|
*/
|
|
21
|
-
provider?: "claude" | "codex" | "pi" | "opencode";
|
|
33
|
+
provider?: "claude" | "claude-managed" | "codex" | "pi" | "opencode" | "devin";
|
|
22
34
|
}
|
|
23
35
|
|
|
24
36
|
import type { ProviderName } from "../types";
|
|
@@ -43,14 +55,25 @@ export type ProviderEvent =
|
|
|
43
55
|
| {
|
|
44
56
|
type: "context_usage";
|
|
45
57
|
contextUsedTokens: number;
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
58
|
+
// Migration 063: nullable so adapters (e.g. devin without a context API)
|
|
59
|
+
// can emit a snapshot that records cumulative tokens without faking a window.
|
|
60
|
+
contextTotalTokens: number | null;
|
|
61
|
+
// Migration 063: null if contextTotalTokens is missing (no divide-by-zero).
|
|
62
|
+
contextPercent: number | null;
|
|
63
|
+
// Migration 063: null when the adapter can't honestly report output tokens.
|
|
64
|
+
outputTokens: number | null;
|
|
65
|
+
/**
|
|
66
|
+
* Migration 063 — the formula the adapter used to compute
|
|
67
|
+
* contextUsedTokens. See `ContextFormulaSchema` in `src/types.ts` for the
|
|
68
|
+
* canonical value list. Adapters should always populate this going
|
|
69
|
+
* forward; it powers cross-provider apples-to-apples comparison.
|
|
70
|
+
*/
|
|
71
|
+
contextFormula?: string;
|
|
49
72
|
}
|
|
50
73
|
| {
|
|
51
74
|
type: "compaction";
|
|
52
75
|
preCompactTokens: number;
|
|
53
|
-
compactTrigger: "auto" | "manual";
|
|
76
|
+
compactTrigger: "auto" | "manual" | "auto-inferred";
|
|
54
77
|
contextTotalTokens: number;
|
|
55
78
|
};
|
|
56
79
|
|
package/src/server.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
2
2
|
import pkg from "../package.json";
|
|
3
3
|
import { initDb } from "./be/db";
|
|
4
|
+
import { seedPricingFromModelsDev } from "./be/seed-pricing";
|
|
4
5
|
import { registerCancelTaskTool } from "./tools/cancel-task";
|
|
5
6
|
import { registerContextDiffTool } from "./tools/context-diff";
|
|
6
7
|
import { registerContextHistoryTool } from "./tools/context-history";
|
|
@@ -147,6 +148,11 @@ export function createServer() {
|
|
|
147
148
|
// Initialize database with WAL mode
|
|
148
149
|
// Uses DATABASE_PATH env var for Docker volume compatibility (WAL needs .sqlite, .sqlite-wal, .sqlite-shm on same filesystem)
|
|
149
150
|
initDb(process.env.DATABASE_PATH);
|
|
151
|
+
// Phase 2: project the vendored models.dev snapshot into the pricing table.
|
|
152
|
+
// Idempotent (INSERT OR IGNORE keyed on PK with effective_from=0); safe to
|
|
153
|
+
// call on every boot. See src/be/seed-pricing.ts for the projection logic
|
|
154
|
+
// and the manual-override constants for runtime-fee / ACU pricing.
|
|
155
|
+
seedPricingFromModelsDev();
|
|
150
156
|
|
|
151
157
|
const server = new McpServer(
|
|
152
158
|
{
|
|
@@ -319,11 +319,14 @@ describe("ClaudeManagedAdapter (Phase 3) — session lifecycle", () => {
|
|
|
319
319
|
}
|
|
320
320
|
|
|
321
321
|
// context_usage emitted on span.model_request_end.
|
|
322
|
+
// Phase 5 / Phase 9 unified formula = input + cache_read + cache_write + output.
|
|
322
323
|
const ctx = emitted.find((e) => e.type === "context_usage");
|
|
323
324
|
expect(ctx).toBeDefined();
|
|
324
325
|
if (ctx && ctx.type === "context_usage") {
|
|
325
|
-
expect(ctx.contextUsedTokens).toBe(
|
|
326
|
+
expect(ctx.contextUsedTokens).toBe(165); // 100 + 10 + 5 + 50
|
|
326
327
|
expect(ctx.outputTokens).toBe(50);
|
|
328
|
+
// Phase 9: every snapshot carries the formula tag.
|
|
329
|
+
expect(ctx.contextFormula).toBe("input-cache-output");
|
|
327
330
|
}
|
|
328
331
|
|
|
329
332
|
// result emitted with accumulated cost. Phase 3 leaves totalCostUsd at 0
|
|
@@ -345,6 +348,8 @@ describe("ClaudeManagedAdapter (Phase 3) — session lifecycle", () => {
|
|
|
345
348
|
expect(resultEvent.cost.totalCostUsd).toBeGreaterThanOrEqual(0);
|
|
346
349
|
expect(Number.isFinite(resultEvent.cost.totalCostUsd)).toBe(true);
|
|
347
350
|
expect(resultEvent.output).toBe("Hello from managed agent");
|
|
351
|
+
// Phase 3 — provider tag is required so the API recompute path engages.
|
|
352
|
+
expect(resultEvent.cost.provider).toBe("claude-managed");
|
|
348
353
|
}
|
|
349
354
|
|
|
350
355
|
// ProviderResult.
|
|
@@ -644,17 +649,24 @@ describe("ClaudeManagedAdapter (Phase 4) — repo provisioning + cost data", ()
|
|
|
644
649
|
process.env.ANTHROPIC_API_KEY = "sk-test";
|
|
645
650
|
process.env.MANAGED_AGENT_ID = "agent_x";
|
|
646
651
|
process.env.MANAGED_ENVIRONMENT_ID = "env_x";
|
|
652
|
+
// Defensive: vault env vars may leak in from the host .env (Bun auto-loads
|
|
653
|
+
// it); each vault-related test sets exactly what it asserts on.
|
|
654
|
+
delete process.env.MANAGED_GITHUB_TOKEN;
|
|
655
|
+
delete process.env.MANAGED_GITHUB_VAULT_ID;
|
|
656
|
+
delete process.env.MANAGED_MCP_VAULT_ID;
|
|
647
657
|
});
|
|
648
658
|
|
|
649
659
|
afterAll(() => {
|
|
650
660
|
rmSync(tmpLogDir, { recursive: true, force: true });
|
|
651
661
|
delete process.env.MANAGED_GITHUB_TOKEN;
|
|
652
662
|
delete process.env.MANAGED_GITHUB_VAULT_ID;
|
|
663
|
+
delete process.env.MANAGED_MCP_VAULT_ID;
|
|
653
664
|
});
|
|
654
665
|
|
|
655
666
|
afterEach(() => {
|
|
656
667
|
delete process.env.MANAGED_GITHUB_TOKEN;
|
|
657
668
|
delete process.env.MANAGED_GITHUB_VAULT_ID;
|
|
669
|
+
delete process.env.MANAGED_MCP_VAULT_ID;
|
|
658
670
|
});
|
|
659
671
|
|
|
660
672
|
test("normalizeRepoUrl: passes through https URLs and expands owner/repo shorthand", () => {
|
|
@@ -1266,9 +1278,11 @@ describe("ClaudeManagedAdapter (Phase 6) — full happy-path integration", () =>
|
|
|
1266
1278
|
const ctxUsage = emitted.find((e) => e.type === "context_usage");
|
|
1267
1279
|
expect(ctxUsage?.type).toBe("context_usage");
|
|
1268
1280
|
if (ctxUsage?.type === "context_usage") {
|
|
1269
|
-
//
|
|
1270
|
-
|
|
1281
|
+
// Phase 5 / Phase 9 unified: input + cache_read + cache_write + output.
|
|
1282
|
+
// 1M + 50k + 25k + 200k = 1,275,000.
|
|
1283
|
+
expect(ctxUsage.contextUsedTokens).toBe(1_275_000);
|
|
1271
1284
|
expect(ctxUsage.outputTokens).toBe(200_000);
|
|
1285
|
+
expect(ctxUsage.contextFormula).toBe("input-cache-output");
|
|
1272
1286
|
}
|
|
1273
1287
|
|
|
1274
1288
|
// The terminal `result` ProviderEvent — the contract Phase 4 hardened —
|
|
@@ -88,13 +88,22 @@ describe("runClaudeManagedSetupFlow — happy path", () => {
|
|
|
88
88
|
const agentCallArgs = agentsCreate.mock.calls[0]?.[0] as {
|
|
89
89
|
name: string;
|
|
90
90
|
model: string;
|
|
91
|
-
tools: Array<{
|
|
91
|
+
tools: Array<{
|
|
92
|
+
type: string;
|
|
93
|
+
default_config?: { permission_policy?: { type: string } };
|
|
94
|
+
}>;
|
|
92
95
|
skills: Array<{ type: string; skill_id: string }>;
|
|
93
96
|
mcp_servers: Array<{ name: string; type: string; url: string }>;
|
|
94
97
|
};
|
|
95
98
|
expect(agentCallArgs.name).toBe("swarm-worker");
|
|
96
99
|
expect(agentCallArgs.model).toBe("claude-sonnet-4-6");
|
|
97
100
|
expect(agentCallArgs.tools[0]?.type).toBe("agent_toolset_20260401");
|
|
101
|
+
// Headless workers can't approve interactively — both toolsets must be
|
|
102
|
+
// configured with `always_allow` so the sandbox executes tool calls
|
|
103
|
+
// without parking them in `awaiting approval`.
|
|
104
|
+
for (const tool of agentCallArgs.tools) {
|
|
105
|
+
expect(tool.default_config?.permission_policy?.type).toBe("always_allow");
|
|
106
|
+
}
|
|
98
107
|
expect(agentCallArgs.skills.map((s) => s.skill_id)).toEqual([
|
|
99
108
|
"skill_work-on-task",
|
|
100
109
|
"skill_create-pr",
|
|
@@ -191,16 +191,16 @@ describe("CodexSession event mapping", () => {
|
|
|
191
191
|
expect(messages[0].content).toBe("Hello from codex");
|
|
192
192
|
}
|
|
193
193
|
|
|
194
|
-
//
|
|
195
|
-
//
|
|
196
|
-
//
|
|
194
|
+
// Phase 9: unified `input + output` formula (Codex `input_tokens` already
|
|
195
|
+
// includes cached input, so we don't add cache_read separately).
|
|
196
|
+
// input=100 + output=50 → contextUsed=150.
|
|
197
197
|
const contextUsage = emitted.find((e) => e.type === "context_usage");
|
|
198
198
|
expect(contextUsage).toBeDefined();
|
|
199
199
|
if (contextUsage && contextUsage.type === "context_usage") {
|
|
200
|
-
expect(contextUsage.contextUsedTokens).toBe(
|
|
200
|
+
expect(contextUsage.contextUsedTokens).toBe(150);
|
|
201
201
|
expect(contextUsage.contextTotalTokens).toBe(200_000);
|
|
202
|
-
|
|
203
|
-
expect(contextUsage.
|
|
202
|
+
expect(contextUsage.contextPercent).toBeCloseTo((150 / 200_000) * 100, 6);
|
|
203
|
+
expect(contextUsage.contextFormula).toBe("input-cache-output");
|
|
204
204
|
}
|
|
205
205
|
|
|
206
206
|
// result event is final and non-error, with cost computed from token counts
|
|
@@ -225,14 +225,15 @@ describe("CodexSession event mapping", () => {
|
|
|
225
225
|
expect(result.sessionId).toBe("thread-abc");
|
|
226
226
|
});
|
|
227
227
|
|
|
228
|
-
test("chatty turn
|
|
229
|
-
//
|
|
230
|
-
//
|
|
231
|
-
//
|
|
232
|
-
//
|
|
233
|
-
//
|
|
234
|
-
//
|
|
235
|
-
//
|
|
228
|
+
test("Phase 9: chatty turn clamps contextPercent to 100% under the unified formula", async () => {
|
|
229
|
+
// Phase 9 deliberately swapped Codex's per-adapter peak-proxy formula
|
|
230
|
+
// (`(input - cached) + output`) for the unified `input + output` formula
|
|
231
|
+
// shared with every other provider. The trade-off: a chatty Codex turn
|
|
232
|
+
// — where `input_tokens` is the SUM across every model call in the turn
|
|
233
|
+
// — over-reports compared to the peak-proxy variant. The clamp at 100%
|
|
234
|
+
// keeps the gauge sensible; downstream consumers reading the new
|
|
235
|
+
// `contextFormula='input-cache-output'` tag know it's apples-to-apples
|
|
236
|
+
// across providers. Numbers below are from the verify-plan transcript.
|
|
236
237
|
const agentMsg: AgentMessageItem = {
|
|
237
238
|
id: "msg-1",
|
|
238
239
|
type: "agent_message",
|
|
@@ -262,12 +263,12 @@ describe("CodexSession event mapping", () => {
|
|
|
262
263
|
const contextUsage = emitted.find((e) => e.type === "context_usage");
|
|
263
264
|
expect(contextUsage).toBeDefined();
|
|
264
265
|
if (contextUsage && contextUsage.type === "context_usage") {
|
|
265
|
-
//
|
|
266
|
-
expect(contextUsage.contextUsedTokens).toBe(
|
|
266
|
+
// Phase 9 unified: input + output = 357142 + 2156 = 359298 (above 200k).
|
|
267
|
+
expect(contextUsage.contextUsedTokens).toBe(359298);
|
|
267
268
|
expect(contextUsage.contextTotalTokens).toBe(200_000);
|
|
268
|
-
//
|
|
269
|
-
expect(contextUsage.contextPercent).
|
|
270
|
-
expect(contextUsage.
|
|
269
|
+
// Above 100% raw → clamped to exactly 100.
|
|
270
|
+
expect(contextUsage.contextPercent).toBe(100);
|
|
271
|
+
expect(contextUsage.contextFormula).toBe("input-cache-output");
|
|
271
272
|
}
|
|
272
273
|
|
|
273
274
|
// Cost still uses the full input_tokens — billing semantics are
|
|
@@ -82,7 +82,7 @@ describe("Context Snapshots", () => {
|
|
|
82
82
|
|
|
83
83
|
// The summary should preserve the last known context usage, not null/0
|
|
84
84
|
const summary = getContextSummaryByTaskId(taskId);
|
|
85
|
-
expect(summary.
|
|
85
|
+
expect(summary.peakContextTokens).toBe(80000);
|
|
86
86
|
expect(summary.contextWindowSize).toBe(200000);
|
|
87
87
|
expect(summary.peakContextPercent).toBe(40);
|
|
88
88
|
});
|
|
@@ -113,7 +113,7 @@ describe("Context Snapshots", () => {
|
|
|
113
113
|
});
|
|
114
114
|
|
|
115
115
|
const summary = getContextSummaryByTaskId(task2.id);
|
|
116
|
-
expect(summary.
|
|
116
|
+
expect(summary.peakContextTokens).toBe(60000);
|
|
117
117
|
expect(summary.contextWindowSize).toBe(200000);
|
|
118
118
|
});
|
|
119
119
|
|
|
@@ -1,8 +1,15 @@
|
|
|
1
1
|
import { describe, expect, test } from "bun:test";
|
|
2
|
-
import {
|
|
2
|
+
import {
|
|
3
|
+
CONTEXT_FORMULA,
|
|
4
|
+
clampContextPercent,
|
|
5
|
+
computeContextUsed,
|
|
6
|
+
computeContextUsedUnified,
|
|
7
|
+
getContextWindowSize,
|
|
8
|
+
} from "../utils/context-window";
|
|
3
9
|
|
|
4
10
|
describe("getContextWindowSize", () => {
|
|
5
11
|
test("returns 1M for opus models", () => {
|
|
12
|
+
expect(getContextWindowSize("claude-opus-4-7")).toBe(1_000_000);
|
|
6
13
|
expect(getContextWindowSize("claude-opus-4-6")).toBe(1_000_000);
|
|
7
14
|
expect(getContextWindowSize("opus")).toBe(1_000_000);
|
|
8
15
|
});
|
|
@@ -26,6 +33,20 @@ describe("getContextWindowSize", () => {
|
|
|
26
33
|
test("returns default entry value", () => {
|
|
27
34
|
expect(getContextWindowSize("default")).toBe(200_000);
|
|
28
35
|
});
|
|
36
|
+
|
|
37
|
+
test("Phase 4: dated full ids resolve via date-suffix stripping", () => {
|
|
38
|
+
// The regression this fixes: pre-Phase 4 these all fell to the 200k
|
|
39
|
+
// default, wildly understating opus/sonnet 4.x context.
|
|
40
|
+
expect(getContextWindowSize("claude-sonnet-4-6-20251004")).toBe(1_000_000);
|
|
41
|
+
expect(getContextWindowSize("claude-opus-4-7-20251201")).toBe(1_000_000);
|
|
42
|
+
expect(getContextWindowSize("claude-haiku-4-5-20251001")).toBe(200_000);
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
test("Phase 4: legacy 3.x family ids resolve", () => {
|
|
46
|
+
expect(getContextWindowSize("claude-3-5-sonnet")).toBe(200_000);
|
|
47
|
+
expect(getContextWindowSize("claude-3-5-sonnet-20241022")).toBe(200_000);
|
|
48
|
+
expect(getContextWindowSize("claude-3-opus")).toBe(200_000);
|
|
49
|
+
});
|
|
29
50
|
});
|
|
30
51
|
|
|
31
52
|
describe("computeContextUsed", () => {
|
|
@@ -64,3 +85,46 @@ describe("computeContextUsed", () => {
|
|
|
64
85
|
).toBe(5000);
|
|
65
86
|
});
|
|
66
87
|
});
|
|
88
|
+
|
|
89
|
+
describe("computeContextUsedUnified (Phase 9 unified formula)", () => {
|
|
90
|
+
test("sums input + cache_read + cache_create + output", () => {
|
|
91
|
+
expect(
|
|
92
|
+
computeContextUsedUnified({
|
|
93
|
+
inputTokens: 1000,
|
|
94
|
+
cacheReadTokens: 200,
|
|
95
|
+
cacheCreateTokens: 300,
|
|
96
|
+
outputTokens: 500,
|
|
97
|
+
}),
|
|
98
|
+
).toBe(2000);
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
test("treats missing/null fields as zero", () => {
|
|
102
|
+
expect(computeContextUsedUnified({})).toBe(0);
|
|
103
|
+
expect(computeContextUsedUnified({ inputTokens: 100, outputTokens: null })).toBe(100);
|
|
104
|
+
});
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
describe("clampContextPercent (Phase 9)", () => {
|
|
108
|
+
test("returns the clamped percent for valid inputs", () => {
|
|
109
|
+
expect(clampContextPercent(50_000, 200_000)).toBe(25);
|
|
110
|
+
expect(clampContextPercent(0, 200_000)).toBe(0);
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
test("clamps to [0, 100]", () => {
|
|
114
|
+
expect(clampContextPercent(500_000, 200_000)).toBe(100);
|
|
115
|
+
expect(clampContextPercent(-10, 200_000)).toBe(0);
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
test("returns null for missing/zero/negative total (no divide-by-zero NaN)", () => {
|
|
119
|
+
expect(clampContextPercent(100, 0)).toBeNull();
|
|
120
|
+
expect(clampContextPercent(100, null)).toBeNull();
|
|
121
|
+
expect(clampContextPercent(100, undefined)).toBeNull();
|
|
122
|
+
expect(clampContextPercent(100, -1)).toBeNull();
|
|
123
|
+
});
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
describe("CONTEXT_FORMULA constant", () => {
|
|
127
|
+
test("is 'input-cache-output' so adapters stamp the same value on snapshots", () => {
|
|
128
|
+
expect(CONTEXT_FORMULA).toBe("input-cache-output");
|
|
129
|
+
});
|
|
130
|
+
});
|
|
@@ -598,6 +598,8 @@ describe("CostData mapping", () => {
|
|
|
598
598
|
expect(resultEvent.cost.model).toBe("devin");
|
|
599
599
|
expect(resultEvent.cost.inputTokens).toBe(0);
|
|
600
600
|
expect(resultEvent.cost.outputTokens).toBe(0);
|
|
601
|
+
// Phase 3 — provider tag is required so the API recompute path engages.
|
|
602
|
+
expect(resultEvent.cost.provider).toBe("devin");
|
|
601
603
|
}
|
|
602
604
|
});
|
|
603
605
|
|