@zhixuan92/multi-model-agent-core 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +0 -6
  2. package/dist/config/schema.d.ts +73 -45
  3. package/dist/config/schema.d.ts.map +1 -1
  4. package/dist/config/schema.js +14 -0
  5. package/dist/config/schema.js.map +1 -1
  6. package/dist/context/context-block-store.d.ts +75 -0
  7. package/dist/context/context-block-store.d.ts.map +1 -0
  8. package/dist/context/context-block-store.js +82 -0
  9. package/dist/context/context-block-store.js.map +1 -0
  10. package/dist/context/expand-context-blocks.d.ts +20 -0
  11. package/dist/context/expand-context-blocks.d.ts.map +1 -0
  12. package/dist/context/expand-context-blocks.js +46 -0
  13. package/dist/context/expand-context-blocks.js.map +1 -0
  14. package/dist/delegate-with-escalation.d.ts +34 -0
  15. package/dist/delegate-with-escalation.d.ts.map +1 -0
  16. package/dist/delegate-with-escalation.js +172 -0
  17. package/dist/delegate-with-escalation.js.map +1 -0
  18. package/dist/index.d.ts +4 -1
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +3 -0
  21. package/dist/index.js.map +1 -1
  22. package/dist/model-profiles.json +42 -4
  23. package/dist/provider.d.ts.map +1 -1
  24. package/dist/provider.js +7 -1
  25. package/dist/provider.js.map +1 -1
  26. package/dist/routing/model-profiles.d.ts +9 -4
  27. package/dist/routing/model-profiles.d.ts.map +1 -1
  28. package/dist/routing/model-profiles.js +8 -0
  29. package/dist/routing/model-profiles.js.map +1 -1
  30. package/dist/run-tasks.d.ts +26 -2
  31. package/dist/run-tasks.d.ts.map +1 -1
  32. package/dist/run-tasks.js +61 -19
  33. package/dist/run-tasks.js.map +1 -1
  34. package/dist/runners/claude-runner.d.ts.map +1 -1
  35. package/dist/runners/claude-runner.js +721 -32
  36. package/dist/runners/claude-runner.js.map +1 -1
  37. package/dist/runners/codex-runner.d.ts.map +1 -1
  38. package/dist/runners/codex-runner.js +541 -48
  39. package/dist/runners/codex-runner.js.map +1 -1
  40. package/dist/runners/error-classification.d.ts +30 -0
  41. package/dist/runners/error-classification.d.ts.map +1 -0
  42. package/dist/runners/error-classification.js +72 -0
  43. package/dist/runners/error-classification.js.map +1 -0
  44. package/dist/runners/injection-type.d.ts +22 -0
  45. package/dist/runners/injection-type.d.ts.map +1 -0
  46. package/dist/runners/injection-type.js +34 -0
  47. package/dist/runners/injection-type.js.map +1 -0
  48. package/dist/runners/openai-runner.d.ts +5 -0
  49. package/dist/runners/openai-runner.d.ts.map +1 -1
  50. package/dist/runners/openai-runner.js +608 -36
  51. package/dist/runners/openai-runner.js.map +1 -1
  52. package/dist/runners/prevention.d.ts +41 -0
  53. package/dist/runners/prevention.d.ts.map +1 -0
  54. package/dist/runners/prevention.js +68 -0
  55. package/dist/runners/prevention.js.map +1 -0
  56. package/dist/runners/supervision.d.ts +137 -0
  57. package/dist/runners/supervision.d.ts.map +1 -0
  58. package/dist/runners/supervision.js +345 -0
  59. package/dist/runners/supervision.js.map +1 -0
  60. package/dist/tools/claude-adapter.d.ts.map +1 -1
  61. package/dist/tools/claude-adapter.js +6 -3
  62. package/dist/tools/claude-adapter.js.map +1 -1
  63. package/dist/tools/definitions.d.ts +3 -1
  64. package/dist/tools/definitions.d.ts.map +1 -1
  65. package/dist/tools/definitions.js +57 -5
  66. package/dist/tools/definitions.js.map +1 -1
  67. package/dist/tools/openai-adapter.d.ts.map +1 -1
  68. package/dist/tools/openai-adapter.js +6 -3
  69. package/dist/tools/openai-adapter.js.map +1 -1
  70. package/dist/tools/scratchpad.d.ts +28 -0
  71. package/dist/tools/scratchpad.d.ts.map +1 -0
  72. package/dist/tools/scratchpad.js +49 -0
  73. package/dist/tools/scratchpad.js.map +1 -0
  74. package/dist/tools/tracker.d.ts +42 -2
  75. package/dist/tools/tracker.d.ts.map +1 -1
  76. package/dist/tools/tracker.js +63 -5
  77. package/dist/tools/tracker.js.map +1 -1
  78. package/dist/types.d.ts +261 -2
  79. package/dist/types.d.ts.map +1 -1
  80. package/dist/types.js +43 -1
  81. package/dist/types.js.map +1 -1
  82. package/package.json +7 -3
@@ -1,8 +1,89 @@
1
1
  import { query } from '@anthropic-ai/claude-agent-sdk';
2
- import { withTimeout } from '../types.js';
2
+ import { createHash } from 'node:crypto';
3
+ import { withTimeout, computeCostUSD, computeSavedCostUSD, } from '../types.js';
3
4
  import { FileTracker } from '../tools/tracker.js';
4
5
  import { createToolImplementations } from '../tools/definitions.js';
5
6
  import { createClaudeToolServer } from '../tools/claude-adapter.js';
7
+ import { TextScratchpad } from '../tools/scratchpad.js';
8
+ import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
9
+ import { validateCompletion, validateCoverage, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, trimProgressTrace, } from './supervision.js';
10
+ import { injectionTypeFor } from './injection-type.js';
11
+ import { classifyError } from './error-classification.js';
12
+ import { findModelProfile } from '../routing/model-profiles.js';
13
+ /**
14
+ * Hard cap on supervision re-prompts before we give up and salvage. Same as
15
+ * openai-runner; see spec A.2.2.
16
+ */
17
+ const MAX_SUPERVISION_RETRIES = 3;
18
+ /**
19
+ * Minimal pushable async-iterable queue for feeding user messages to the
20
+ * claude-agent-sdk `query()` in streaming-input mode.
21
+ *
22
+ * The SDK's `query({ prompt: string | AsyncIterable<SDKUserMessage>, ... })`
23
+ * signature (see node_modules/@anthropic-ai/claude-agent-sdk/sdk.d.ts L1879-1882)
24
+ * accepts an async iterable when we want multi-turn input — the intended
25
+ * pathway for "push a follow-up user message into the current query without
26
+ * restarting the CLI subprocess." The built-in `streamInput(...)` method on
27
+ * the returned `Query` object (sdk.d.ts L1862) is documented as "used
28
+ * internally for multi-turn conversations", and the only public way to
29
+ * drive multi-turn input is via this iterable.
30
+ *
31
+ * This class is deliberately small: `push(msg)` delivers a message to a
32
+ * waiting iterator (or buffers it if the iterator isn't waiting yet),
33
+ * `close()` signals end-of-stream, and `[Symbol.asyncIterator]()` returns
34
+ * an async iterator object that yields buffered messages then awaits the next push.
35
+ */
36
class PushableUserMessageQueue {
    /** Messages pushed while no `next()` call was waiting, in FIFO order. */
    buffer = [];
    /** `resolve` callbacks of `next()` calls currently waiting for a message. */
    resolvers = [];
    /** Set once by `close()`; afterwards `push()` is a no-op. */
    closed = false;
    /**
     * Deliver a message to the oldest waiting `next()` call, or buffer it
     * when nobody is waiting. Ignored once the queue has been closed.
     *
     * @param {unknown} msg - The message to hand to the consumer.
     * @returns {void}
     */
    push(msg) {
        if (this.closed) {
            return;
        }
        const resolver = this.resolvers.shift();
        if (resolver) {
            resolver({ value: msg, done: false });
            return;
        }
        this.buffer.push(msg);
    }
    /**
     * Signal end-of-stream. Every waiting `next()` call resolves with
     * `done: true`. Messages already buffered are still delivered to later
     * `next()` calls before the stream reports completion. Idempotent.
     *
     * @returns {void}
     */
    close() {
        if (this.closed) {
            return;
        }
        this.closed = true;
        // Detach the waiters first so the list is empty before any of them runs.
        const waiting = this.resolvers.splice(0);
        for (const resolver of waiting) {
            resolver({ value: undefined, done: true });
        }
    }
    /**
     * Async-iteration entry point. The returned iterator object shares this
     * queue's state (it is not an independent cursor).
     *
     * @returns {{ next: () => Promise<{ value: unknown, done: boolean }>,
     *             return: () => Promise<{ value: undefined, done: true }> }}
     */
    [Symbol.asyncIterator]() {
        return {
            next: () => {
                // Buffered messages are drained first, even after close().
                if (this.buffer.length > 0) {
                    return Promise.resolve({ value: this.buffer.shift(), done: false });
                }
                if (this.closed) {
                    return Promise.resolve({ value: undefined, done: true });
                }
                // Nothing available yet: park until push() or close() resolves us.
                return new Promise((resolve) => {
                    this.resolvers.push(resolve);
                });
            },
            // Invoked by `for await` when the consumer exits early (break /
            // throw / return). Without it the queue stayed open and later
            // push() calls accumulated messages nobody would ever read.
            return: () => {
                this.buffer.length = 0;
                this.close();
                return Promise.resolve({ value: undefined, done: true });
            },
        };
    }
}
76
+ /**
77
+ * Wrap a plain string in the SDKUserMessage envelope the SDK expects when
78
+ * using streaming input mode. Keeps the per-call sites tidy.
79
+ */
80
/**
 * Build the user-message envelope used when feeding the SDK through the
 * streaming-input queue.
 *
 * @param {string} text - Plain message text to send as the user turn.
 * @returns {{ type: 'user', message: { role: 'user', content: string }, parent_tool_use_id: null }}
 */
function userMessage(text) {
    const message = { role: 'user', content: text };
    const envelope = { type: 'user', message, parent_tool_use_id: null };
    return envelope;
}
6
87
  export async function runClaude(prompt, options, providerConfig, defaults) {
7
88
  const maxTurns = options.maxTurns ?? providerConfig.maxTurns ?? defaults.maxTurns;
8
89
  const timeoutMs = options.timeoutMs ?? providerConfig.timeoutMs ?? defaults.timeoutMs;
@@ -11,8 +92,80 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
11
92
  const effort = options.effort ?? providerConfig.effort;
12
93
  const sandboxPolicy = options.sandboxPolicy ?? providerConfig.sandboxPolicy ?? 'cwd-only';
13
94
  const abortController = new AbortController();
14
- const tracker = new FileTracker();
95
+ // --- Progress event emission (Task 10) ----------------------------------
96
+ //
97
+ // `onProgress` is already wrapped in `safeSink` by the orchestrator
98
+ // (Task 8), so any throw from the consumer callback is swallowed
99
+ // upstream and cannot corrupt this loop. We do not need to wrap it
100
+ // again here.
101
+ const onProgress = options.onProgress;
102
+ const shouldCaptureTrace = options.includeProgressTrace ?? false;
103
+ const traceBuffer = [];
104
+ const emit = (event) => {
105
+ if (shouldCaptureTrace)
106
+ traceBuffer.push(event);
107
+ if (onProgress)
108
+ onProgress(event);
109
+ };
110
+ // Hoisted so the FileTracker callback (closed over below) can read the
111
+ // running turn count at callback firing time. Unlike openai-runner — where
112
+ // the turn counter comes from `currentResult?.state.usage.requests + 1`
113
+ // because the SDK only bumps the counter after the call completes — the
114
+ // claude-runner increments `turns` at the top of every `msg.type ===
115
+ // 'assistant'` branch, which is PROCESSED BEFORE the SDK fires any tool
116
+ // calls for that turn. That means `turns` already holds the current
117
+ // turn number when the tracker callback fires mid-tool-loop, so we
118
+ // attribute tool calls to `turns` directly (no +1 offset).
119
+ let inputTokens = 0;
120
+ let outputTokens = 0;
121
+ let costUSD = null;
122
+ let turns = 0;
123
+ const tracker = new FileTracker((summary) => {
124
+ emit({ kind: 'tool_call', turn: turns, toolSummary: summary });
125
+ });
15
126
  const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
127
+ // --- Prevention layer: system prompt + budget hint ---
128
+ //
129
+ // buildSystemPrompt() is deliberately static and parameter-free (same
130
+ // decision as openai-runner: Task 1 review rejected provider/maxTurns
131
+ // options). We append our discipline rules onto the `claude_code` preset
132
+ // rather than REPLACING the default system prompt, because replacing it
133
+ // strips the SDK's tool-usage guidance. See
134
+ // node_modules/@anthropic-ai/claude-agent-sdk/sdk.d.ts L1460-1465 for the
135
+ // systemPrompt union type — `{ type: 'preset', preset: 'claude_code',
136
+ // append: string }` is the intended "add to defaults" shape.
137
+ const systemPrompt = buildSystemPrompt();
138
+ const budgetHint = buildBudgetHint({ maxTurns });
139
+ const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
140
+ // --- onInitialRequest (Task 12) ----------------------------------------
141
+ //
142
+ // Fire once per attempt with the canonical orchestrator-side initial
143
+ // brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
144
+ // literal bytes the Anthropic SDK will send — the SDK wraps our
145
+ // systemPrompt in `{ type: 'preset', preset: 'claude_code', append: ... }`
146
+ // (see queryOptions.systemPrompt below), so the wire-level system prompt
147
+ // includes the claude_code preset bytes that precede ours. We hash the
148
+ // canonical form anyway for two reasons:
149
+ // 1. It matches openai-runner and codex-runner, which also don't hash
150
+ // literal wire bytes (they hash the same canonical form before the
151
+ // SDK wraps it in its own `messages` / Responses API structures).
152
+ // Cross-runner stability is the Task 12 design requirement.
153
+ // 2. It answers the "did the orchestrator send the same brief across
154
+ // retries?" question, which is the actual debugging use case — NOT
155
+ // "were the literal wire bytes identical?".
156
+ // See `AttemptRecord.initialPromptHash` in types.ts for the full caveat.
157
+ if (options.onInitialRequest) {
158
+ const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
159
+ try {
160
+ options.onInitialRequest({
161
+ lengthChars: canonicalInitialBrief.length,
162
+ sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
163
+ });
164
+ }
165
+ catch {
166
+ // Swallow — a broken callback must not affect dispatch.
167
+ }
168
+ }
16
169
  // Permission bypass is intentional for sub-agent use. File-system confinement
17
170
  // is enforced by assertWithinCwd in tool definitions when sandboxPolicy is 'cwd-only'.
18
171
  const queryOptions = {
@@ -23,6 +176,11 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
23
176
  allowDangerouslySkipPermissions: true,
24
177
  persistSession: false,
25
178
  abortController,
179
+ systemPrompt: {
180
+ type: 'preset',
181
+ preset: 'claude_code',
182
+ append: systemPrompt,
183
+ },
26
184
  };
27
185
  if (toolMode === 'full') {
28
186
  const toolServer = createClaudeToolServer(toolImpls, sandboxPolicy);
@@ -46,73 +204,604 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
46
204
  // effort is typed as EffortLevel in Options; cast from string
47
205
  queryOptions.effort = effort;
48
206
  }
49
- // Hoisted so the timeout callback can read partial progress
50
- let inputTokens = 0;
51
- let outputTokens = 0;
52
- let costUSD = null;
53
- let turns = 0;
207
+ // --- Scratchpad: buffers every assistant text block we see streaming
208
+ // through the iterator. On any termination path (ok/incomplete/max_turns/
209
+ // error/timeout/force_salvage) we salvage `scratchpad.latest()` when the
210
+ // final `result.result` is empty or degenerate. ---
211
+ const scratchpad = new TextScratchpad();
212
+ // --- Watchdog: resolve the input-token soft limit once per run ---
213
+ const profile = findModelProfile(providerConfig.model);
214
+ const softLimit = resolveInputTokenSoftLimit(providerConfig, profile);
215
+ // --- Task timing + parent model (Task 9) --------------------------------
216
+ const taskStartMs = Date.now();
217
+ const parentModel = options.parentModel;
54
218
  const run = async () => {
55
219
  let output = '';
56
- let hitMaxTurns = false;
220
+ // --- Supervision / watchdog bookkeeping ---
221
+ let supervisionRetries = 0;
222
+ // Initialised to `null` (NOT ''): on the first turn there is no
223
+ // previous degenerate output to compare against, so the same-output
224
+ // early-out must be skipped. See openai-runner regression #5.
225
+ let lastDegenerateOutput = null;
226
+ // High-watermark guard for the watchdog warning nudge — fire at most
227
+ // once per distinct input-token level. Mirrors openai-runner.
228
+ let lastWarnedInputTokens = -1;
229
+ // --- Completed-result sentinel. Every exit from the supervision
230
+ // state machine inside the `for await` iterator sets this to a fully-
231
+ // built RunResult and then `break`s. After the loop, the one explicit
232
+ // return on the happy path is `completedResult`. This gives every
233
+ // exit (ok / incomplete / force_salvage / max_turns) a single
234
+ // explicit owner, mirroring openai-runner's `while (true) + return`
235
+ // shape but compatible with the for-await iterator contract. ---
236
+ let completedResult = null;
237
+ // --- Streaming input queue. See PushableUserMessageQueue docstring:
238
+ // using an async iterable as the `prompt` enables mid-run user-message
239
+ // injection (supervision re-prompts, re-grounding, budget-pressure
240
+ // nudges) without restarting the CLI subprocess. ---
241
+ const messageQueue = new PushableUserMessageQueue();
242
+ messageQueue.push(userMessage(promptWithBudgetHint));
57
243
  try {
58
- for await (const msg of query({ prompt, options: queryOptions })) {
244
+ for await (const msg of query({ prompt: messageQueue, options: queryOptions })) {
59
245
  if (msg.type === 'assistant') {
60
246
  turns++;
247
+ emit({ kind: 'turn_start', turn: turns, provider: 'claude' });
248
+ // Capture every assistant text block as scratchpad fodder. The
249
+ // claude-agent-sdk's BetaMessage.content is an array of blocks:
250
+ // `{ type: 'text', text } | { type: 'tool_use', ... } |
251
+ // { type: 'thinking', ... } | ...`. We only want plain text;
252
+ // tool_use blocks have no salvage value (they're side-effects)
253
+ // and thinking blocks are stripped before the caller sees them.
254
+ if ('message' in msg && msg.message && 'content' in msg.message) {
255
+ // The claude-agent-sdk's BetaMessage.content is typed as an
256
+ // array of content blocks — but historically the API sometimes
257
+ // delivers a bare string, so we defensively handle both. The
258
+ // string branch is narrow-typed to `never` by the SDK, so we
259
+ // cast through `unknown` to keep runtime safety without fighting
260
+ // the compiler.
261
+ const content = msg.message.content;
262
+ if (typeof content === 'string') {
263
+ scratchpad.append(turns, content);
264
+ if (content.length > 0) {
265
+ emit({
266
+ kind: 'text_emission',
267
+ turn: turns,
268
+ chars: content.length,
269
+ preview: content.slice(0, 200),
270
+ });
271
+ }
272
+ }
273
+ else if (Array.isArray(content)) {
274
+ const texts = content
275
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
276
+ .filter((c) => c && c.type === 'text' && typeof c.text === 'string')
277
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
278
+ .map((c) => c.text);
279
+ if (texts.length > 0) {
280
+ const joined = texts.join('\n');
281
+ scratchpad.append(turns, joined);
282
+ if (joined.length > 0) {
283
+ emit({
284
+ kind: 'text_emission',
285
+ turn: turns,
286
+ chars: joined.length,
287
+ preview: joined.slice(0, 200),
288
+ });
289
+ }
290
+ }
291
+ }
292
+ }
293
+ // --- Watchdog check (assistant-message cadence). We check
294
+ // `inputTokens` as accumulated from prior `result` messages.
295
+ // On the very first assistant message inputTokens is 0 and no
296
+ // threshold can fire; that's correct. This is also the ONLY
297
+ // site that handles `warning` — it logs AND pushes the nudge
298
+ // as one action. The post-result site only handles
299
+ // force_salvage. ---
300
+ const watchdogStatus = checkWatchdogThreshold(inputTokens, softLimit);
301
+ if (watchdogStatus !== 'ok') {
302
+ logWatchdogEvent(watchdogStatus, {
303
+ provider: 'claude',
304
+ model: providerConfig.model,
305
+ turn: turns,
306
+ inputTokens,
307
+ softLimit,
308
+ scratchpadChars: scratchpad.toString().length,
309
+ });
310
+ }
311
+ if (watchdogStatus === 'force_salvage') {
312
+ // `watchdog_force_salvage` is not an injected message — no
313
+ // re-prompt is sent — but observers still want to see why the
314
+ // run is being killed. We emit the event with
315
+ // `contentLengthChars: 0` to reflect the "nothing was injected,
316
+ // we just terminated" semantics (mirrors openai-runner).
317
+ emit({
318
+ kind: 'injection',
319
+ injectionType: 'watchdog_force_salvage',
320
+ turn: turns,
321
+ contentLengthChars: 0,
322
+ });
323
+ completedResult = buildClaudeForceSalvageResult({
324
+ tracker,
325
+ scratchpad,
326
+ providerConfig,
327
+ sdkCostUSD: costUSD,
328
+ inputTokens,
329
+ outputTokens,
330
+ turns,
331
+ softLimit,
332
+ durationMs: Date.now() - taskStartMs,
333
+ parentModel,
334
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
335
+ });
336
+ messageQueue.close();
337
+ abortController.abort();
338
+ break;
339
+ }
340
+ // Fire the warning nudge at most once per distinct input-token
341
+ // high-watermark. We push a user message into the queue so the
342
+ // next turn of the conversation will address the budget-pressure
343
+ // prompt. If the nudge response is itself a valid final answer,
344
+ // the supervision loop on the NEXT `result` message will return
345
+ // `ok`. High-watermark guard prevents re-nudging if inputTokens
346
+ // stays the same across two assistant messages.
347
+ if (watchdogStatus === 'warning' && inputTokens > lastWarnedInputTokens) {
348
+ lastWarnedInputTokens = inputTokens;
349
+ const warning = buildBudgetPressureNudge({ inputTokens, softLimit });
350
+ emit({
351
+ kind: 'injection',
352
+ injectionType: 'watchdog_warning',
353
+ turn: turns,
354
+ contentLengthChars: warning.length,
355
+ });
356
+ messageQueue.push(userMessage(warning));
357
+ }
358
+ // --- Periodic re-grounding (best-effort in streaming-input
359
+ // mode): inject a reminder every RE_GROUNDING_INTERVAL_TURNS
360
+ // turns via the same queue. The iterator keeps reading until
361
+ // the CLI subprocess decides to emit a final result after it
362
+ // processes the new user message. ---
363
+ if (turns > 0 && turns % RE_GROUNDING_INTERVAL_TURNS === 0) {
364
+ const reground = buildReGroundingMessage({
365
+ originalPromptExcerpt: prompt,
366
+ currentTurn: turns,
367
+ maxTurns,
368
+ toolCallsSoFar: tracker.getToolCalls().length,
369
+ filesReadSoFar: tracker.getReads().length,
370
+ });
371
+ emit({
372
+ kind: 'injection',
373
+ injectionType: 'reground',
374
+ turn: turns,
375
+ contentLengthChars: reground.length,
376
+ });
377
+ messageQueue.push(userMessage(reground));
378
+ }
61
379
  }
62
380
  if (msg.type === 'result') {
63
381
  if ('result' in msg) {
64
382
  output = msg.result;
65
383
  }
66
- if ('subtype' in msg && msg.subtype === 'error_max_turns') {
67
- hitMaxTurns = true;
68
- }
69
- // Extract usage from modelUsage or usage
384
+ const hitMaxTurns = 'subtype' in msg && msg.subtype === 'error_max_turns';
385
+ // Extract usage from modelUsage or usage, then ACCUMULATE into
386
+ // the running inputTokens/outputTokens. Supervision retries in
387
+ // streaming-input mode push a new user message into the queue
388
+ // and the SDK emits a fresh `result` message per top-level user
389
+ // turn — we want the cumulative usage across every result we
390
+ // see, not just the last one. Accumulation keeps the watchdog
391
+ // soft-limit check honest across retries and produces correct
392
+ // totals on any termination path.
393
+ let turnInputTokens = 0;
394
+ let turnOutputTokens = 0;
70
395
  if ('modelUsage' in msg && msg.modelUsage) {
71
396
  for (const model of Object.values(msg.modelUsage)) {
72
- inputTokens += model.inputTokens ?? 0;
73
- outputTokens += model.outputTokens ?? 0;
397
+ turnInputTokens += model.inputTokens ?? 0;
398
+ turnOutputTokens += model.outputTokens ?? 0;
74
399
  }
75
400
  }
76
401
  else if ('usage' in msg && msg.usage) {
77
402
  const u = msg.usage;
78
- inputTokens = u['input_tokens'] ?? 0;
79
- outputTokens = u['output_tokens'] ?? 0;
403
+ turnInputTokens = u['input_tokens'] ?? 0;
404
+ turnOutputTokens = u['output_tokens'] ?? 0;
80
405
  }
406
+ inputTokens += turnInputTokens;
407
+ outputTokens += turnOutputTokens;
81
408
  if ('total_cost_usd' in msg && typeof msg.total_cost_usd === 'number') {
82
409
  costUSD = msg.total_cost_usd;
83
410
  }
411
+ // --- turn_complete: one event per result message (which
412
+ // corresponds to one top-level assistant turn from the SDK's
413
+ // perspective). Fires after usage aggregation so the cumulative
414
+ // counters are up-to-date.
415
+ emit({
416
+ kind: 'turn_complete',
417
+ turn: turns,
418
+ cumulativeInputTokens: inputTokens,
419
+ cumulativeOutputTokens: outputTokens,
420
+ });
421
+ // --- Watchdog check on the result message as well: input tokens
422
+ // have just jumped and we may now be in force_salvage territory.
423
+ // The post-result site ONLY handles force_salvage. `warning` is
424
+ // intentionally ignored here — the assistant-message-cadence site
425
+ // above is the single place that logs warnings AND pushes the
426
+ // nudge into the queue. Logging `warning` here without pushing a
427
+ // nudge would be misleading (suggests action that didn't happen).
428
+ const postResultWatchdog = checkWatchdogThreshold(inputTokens, softLimit);
429
+ if (postResultWatchdog === 'force_salvage') {
430
+ logWatchdogEvent(postResultWatchdog, {
431
+ provider: 'claude',
432
+ model: providerConfig.model,
433
+ turn: turns,
434
+ inputTokens,
435
+ softLimit,
436
+ scratchpadChars: scratchpad.toString().length,
437
+ });
438
+ emit({
439
+ kind: 'injection',
440
+ injectionType: 'watchdog_force_salvage',
441
+ turn: turns,
442
+ contentLengthChars: 0,
443
+ });
444
+ completedResult = buildClaudeForceSalvageResult({
445
+ tracker,
446
+ scratchpad,
447
+ providerConfig,
448
+ sdkCostUSD: costUSD,
449
+ inputTokens,
450
+ outputTokens,
451
+ turns,
452
+ softLimit,
453
+ durationMs: Date.now() - taskStartMs,
454
+ parentModel,
455
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
456
+ });
457
+ messageQueue.close();
458
+ abortController.abort();
459
+ break;
460
+ }
461
+ // --- Max-turns: don't supervise a max-turns termination,
462
+ // build the max_turns result directly and exit. ---
463
+ if (hitMaxTurns) {
464
+ completedResult = buildClaudeMaxTurnsResult({
465
+ tracker,
466
+ scratchpad,
467
+ providerConfig,
468
+ sdkCostUSD: costUSD,
469
+ inputTokens,
470
+ outputTokens,
471
+ turns,
472
+ maxTurns,
473
+ lastOutput: output,
474
+ reason: `claude-agent-sdk signaled error_max_turns after ${turns} turns (user-declared maxTurns: ${maxTurns})`,
475
+ durationMs: Date.now() - taskStartMs,
476
+ parentModel,
477
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
478
+ });
479
+ messageQueue.close();
480
+ break;
481
+ }
482
+ // --- Supervision: validate the captured output. Valid output
483
+ // is an immediate ok-exit. Degenerate output either re-prompts
484
+ // (and keeps reading the iterator) or — if the retry budget is
485
+ // spent / same-output early-out fires — exits as incomplete. ---
486
+ const validation = validateCompletion(output);
487
+ // NEW: coverage check — only when syntactic validation passes
488
+ if (validation.valid && options.expectedCoverage) {
489
+ const coverageValidation = validateCoverage(output, options.expectedCoverage);
490
+ if (!coverageValidation.valid) {
491
+ validation.valid = false;
492
+ validation.kind = coverageValidation.kind;
493
+ validation.reason = coverageValidation.reason;
494
+ }
495
+ }
496
+ if (validation.valid) {
497
+ completedResult = buildClaudeOkResult({
498
+ tracker,
499
+ scratchpad,
500
+ providerConfig,
501
+ sdkCostUSD: costUSD,
502
+ inputTokens,
503
+ outputTokens,
504
+ turns,
505
+ output,
506
+ durationMs: Date.now() - taskStartMs,
507
+ parentModel,
508
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
509
+ });
510
+ messageQueue.close();
511
+ break;
512
+ }
513
+ // Same-output early-out: don't burn another retry on identical
514
+ // garbage. Compare only when we have a previous degenerate.
515
+ if (lastDegenerateOutput !== null &&
516
+ sameDegenerateOutput(output, lastDegenerateOutput)) {
517
+ completedResult = buildClaudeIncompleteResult({
518
+ tracker,
519
+ scratchpad,
520
+ providerConfig,
521
+ sdkCostUSD: costUSD,
522
+ inputTokens,
523
+ outputTokens,
524
+ turns,
525
+ reason: `supervision loop exhausted after ${supervisionRetries} re-prompts (last kind: ${validation.kind ?? 'unknown'})`,
526
+ durationMs: Date.now() - taskStartMs,
527
+ parentModel,
528
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
529
+ });
530
+ messageQueue.close();
531
+ break;
532
+ }
533
+ lastDegenerateOutput = output;
534
+ supervisionRetries++;
535
+ if (supervisionRetries >= MAX_SUPERVISION_RETRIES) {
536
+ completedResult = buildClaudeIncompleteResult({
537
+ tracker,
538
+ scratchpad,
539
+ providerConfig,
540
+ sdkCostUSD: costUSD,
541
+ inputTokens,
542
+ outputTokens,
543
+ turns,
544
+ reason: `supervision loop exhausted after ${supervisionRetries} re-prompts (last kind: ${validation.kind ?? 'unknown'})`,
545
+ durationMs: Date.now() - taskStartMs,
546
+ parentModel,
547
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
548
+ });
549
+ messageQueue.close();
550
+ break;
551
+ }
552
+ // Push the re-prompt and continue reading the iterator.
553
+ const rePrompt = buildRePrompt(validation);
554
+ emit({
555
+ kind: 'injection',
556
+ injectionType: injectionTypeFor(validation.kind),
557
+ turn: turns,
558
+ contentLengthChars: rePrompt.length,
559
+ });
560
+ messageQueue.push(userMessage(rePrompt));
84
561
  }
85
562
  }
86
563
  }
87
564
  catch (err) {
565
+ // Preserve partial usage — the scratchpad may have buffered text
566
+ // from turns that ran before the throw. Route the thrown error
567
+ // through the shared classifier so the escalation orchestrator can
568
+ // distinguish abort / network / HTTP-error / generic failure modes.
569
+ const { status, reason } = classifyError(err);
570
+ const msg = err instanceof Error ? err.message : String(err);
571
+ emit({ kind: 'done', status });
572
+ const hasSalvage = !scratchpad.isEmpty();
573
+ const finalCostUSD = effectiveClaudeCost(providerConfig, inputTokens, outputTokens, costUSD);
574
+ const savedCostUSD = computeSavedCostUSD(finalCostUSD, inputTokens, outputTokens, parentModel);
88
575
  return {
89
- output: `Sub-agent error: ${err instanceof Error ? err.message : String(err)}`,
90
- status: 'error',
91
- usage: { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens, costUSD },
576
+ output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${msg}`,
577
+ status,
578
+ usage: {
579
+ inputTokens,
580
+ outputTokens,
581
+ totalTokens: inputTokens + outputTokens,
582
+ costUSD: finalCostUSD,
583
+ savedCostUSD,
584
+ },
92
585
  turns,
93
- files: tracker.getFiles(),
94
- error: err instanceof Error ? err.message : String(err),
586
+ filesRead: tracker.getReads(),
587
+ directoriesListed: tracker.getDirectoriesListed(),
588
+ filesWritten: tracker.getWrites(),
589
+ toolCalls: tracker.getToolCalls(),
590
+ outputIsDiagnostic: !hasSalvage,
591
+ escalationLog: [],
592
+ error: msg || reason,
593
+ durationMs: Date.now() - taskStartMs,
594
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
95
595
  };
96
596
  }
597
+ // Every `break` inside the iterator above assigned `completedResult`
598
+ // before exiting. If the iterator drained without any break (e.g. the
599
+ // SDK closed the stream cleanly without ever emitting a final
600
+ // `result`), synthesize an incomplete result so the caller always
601
+ // gets a meaningful diagnostic instead of undefined.
602
+ if (completedResult) {
603
+ emit({ kind: 'done', status: completedResult.status });
604
+ return completedResult;
605
+ }
606
+ const drained = buildClaudeIncompleteResult({
607
+ tracker,
608
+ scratchpad,
609
+ providerConfig,
610
+ sdkCostUSD: costUSD,
611
+ inputTokens,
612
+ outputTokens,
613
+ turns,
614
+ durationMs: Date.now() - taskStartMs,
615
+ parentModel,
616
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
617
+ });
618
+ emit({ kind: 'done', status: drained.status });
619
+ return drained;
620
+ };
621
+ return withTimeout(run(), timeoutMs, () => {
622
+ emit({ kind: 'done', status: 'timeout' });
623
+ const hasSalvage = !scratchpad.isEmpty();
624
+ const finalCostUSD = effectiveClaudeCost(providerConfig, inputTokens, outputTokens, costUSD);
625
+ const savedCostUSD = computeSavedCostUSD(finalCostUSD, inputTokens, outputTokens, parentModel);
97
626
  return {
98
- output: hitMaxTurns ? (output || `Agent exceeded max turns (${maxTurns}).`) : output,
99
- status: hitMaxTurns ? 'max_turns' : 'ok',
627
+ output: hasSalvage ? scratchpad.latest() : `Agent timed out after ${timeoutMs}ms.`,
628
+ status: 'timeout',
629
+ filesRead: tracker.getReads(),
630
+ directoriesListed: tracker.getDirectoriesListed(),
631
+ filesWritten: tracker.getWrites(),
632
+ toolCalls: tracker.getToolCalls(),
100
633
  usage: {
101
634
  inputTokens,
102
635
  outputTokens,
103
636
  totalTokens: inputTokens + outputTokens,
104
- costUSD,
637
+ costUSD: finalCostUSD,
638
+ savedCostUSD,
105
639
  },
106
640
  turns,
107
- files: tracker.getFiles(),
641
+ outputIsDiagnostic: !hasSalvage,
642
+ escalationLog: [],
643
+ durationMs: Date.now() - taskStartMs,
644
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
108
645
  };
646
+ }, abortController);
647
+ }
648
/**
 * Pick the cost figure to report for a Claude run.
 *
 * The value returned by `computeCostUSD` for the given token counts and
 * provider config wins whenever it is neither `null` nor `undefined`;
 * otherwise the SDK-reported cost is passed through unchanged.
 *
 * @param providerConfig Provider configuration forwarded to `computeCostUSD`.
 * @param inputTokens Input token count for the run.
 * @param outputTokens Output token count for the run.
 * @param sdkCost Cost reported by the SDK, used as the fallback.
 * @returns The computed cost when available, else `sdkCost`.
 */
function effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCost) {
    const configuredCost = computeCostUSD(inputTokens, outputTokens, providerConfig);
    if (configuredCost === null || configuredCost === undefined) {
        return sdkCost;
    }
    return configuredCost;
}
652
/**
 * Assemble the result object for a run that ended with status `ok`.
 *
 * @param args Bag of run state: `tracker`, `providerConfig`, `sdkCostUSD`,
 *   `inputTokens`, `outputTokens`, `turns`, `output`, `durationMs`,
 *   `parentModel` and an optional `traceBuffer`.
 * @returns The result object; `progressTrace` is attached only when a
 *   truthy `traceBuffer` was supplied.
 */
function buildClaudeOkResult(args) {
    const { tracker, providerConfig, sdkCostUSD, inputTokens, outputTokens, parentModel, traceBuffer } = args;
    const costUSD = effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCostUSD);
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD,
        savedCostUSD: computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel),
    };
    const result = {
        output: args.output,
        status: 'ok',
        usage,
        turns: args.turns,
        filesRead: tracker.getReads(),
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten: tracker.getWrites(),
        toolCalls: tracker.getToolCalls(),
        // An `ok` result always holds the model's own answer, so it is
        // never flagged as a diagnostic.
        outputIsDiagnostic: false,
        escalationLog: [],
        durationMs: args.durationMs,
    };
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
678
/**
 * Build an `incomplete` result. Used on the supervision-exhausted path
 * (retry cap hit or same-output early-out) and when the SDK stream drains
 * without a final result.
 *
 * The latest scratchpad entry is returned as the output when the
 * scratchpad is non-empty; otherwise the output is the text produced by
 * `buildClaudeIncompleteDiagnostic` and `outputIsDiagnostic` is true.
 *
 * @param args Bag of run state: `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens`, `turns`, `reason`
 *   (reported as `error`; may be undefined), `durationMs`, `parentModel`
 *   and an optional `traceBuffer`.
 * @returns The result object; `progressTrace` is attached only when a
 *   truthy `traceBuffer` was supplied.
 */
function buildClaudeIncompleteResult(args) {
    const { tracker, scratchpad, inputTokens, outputTokens, turns, traceBuffer } = args;
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const costUSD = effectiveClaudeCost(args.providerConfig, inputTokens, outputTokens, args.sdkCostUSD);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, args.parentModel);
    const nothingSalvaged = scratchpad.isEmpty();
    let output;
    if (nothingSalvaged) {
        output = buildClaudeIncompleteDiagnostic({
            turns,
            inputTokens,
            outputTokens,
            filesRead,
            filesWritten,
        });
    }
    else {
        output = scratchpad.latest();
    }
    const result = {
        output,
        status: 'incomplete',
        usage: {
            inputTokens,
            outputTokens,
            totalTokens: inputTokens + outputTokens,
            costUSD,
            savedCostUSD,
        },
        turns,
        filesRead,
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten,
        toolCalls: tracker.getToolCalls(),
        outputIsDiagnostic: nothingSalvaged,
        escalationLog: [],
        error: args.reason,
        durationMs: args.durationMs,
    };
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
719
/**
 * Build the `incomplete` result for a sub-agent that was forcibly
 * terminated at the input-token soft limit.
 *
 * The latest scratchpad entry is used as the output when the scratchpad is
 * non-empty; otherwise a bracketed notice naming the input-token count and
 * the soft limit is returned and `outputIsDiagnostic` is true.
 *
 * @param args Bag of run state: `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens`, `turns`, `softLimit`,
 *   `durationMs`, `parentModel` and an optional `traceBuffer`.
 * @returns The result object; `progressTrace` is attached only when a
 *   truthy `traceBuffer` was supplied.
 */
function buildClaudeForceSalvageResult(args) {
    const { tracker, scratchpad, inputTokens, outputTokens, softLimit, traceBuffer } = args;
    const costUSD = effectiveClaudeCost(args.providerConfig, inputTokens, outputTokens, args.sdkCostUSD);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, args.parentModel);
    const nothingSalvaged = scratchpad.isEmpty();
    let output;
    if (nothingSalvaged) {
        output = `[claude sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`;
    }
    else {
        output = scratchpad.latest();
    }
    const result = {
        output,
        status: 'incomplete',
        usage: {
            inputTokens,
            outputTokens,
            totalTokens: inputTokens + outputTokens,
            costUSD,
            savedCostUSD,
        },
        turns: args.turns,
        filesRead: tracker.getReads(),
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten: tracker.getWrites(),
        toolCalls: tracker.getToolCalls(),
        outputIsDiagnostic: nothingSalvaged,
        escalationLog: [],
        durationMs: args.durationMs,
    };
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
747
/**
 * Build the `max_turns` result.
 *
 * Output preference, in order: the latest scratchpad entry, then
 * `lastOutput`, then the fixed "Agent exceeded max turns" notice.
 *
 * @param args Bag of run state: `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens`, `turns`, `maxTurns`,
 *   `lastOutput`, `reason` (reported as `error`), `durationMs`,
 *   `parentModel` and an optional `traceBuffer`.
 * @returns The result object; `progressTrace` is attached only when a
 *   truthy `traceBuffer` was supplied.
 */
function buildClaudeMaxTurnsResult(args) {
    const { tracker, scratchpad, inputTokens, outputTokens, maxTurns, lastOutput, traceBuffer } = args;
    const hasSalvage = !scratchpad.isEmpty();
    // `lastOutput` is text the model streamed before the turn limit, not a
    // diagnostic template. It therefore counts as real content even when
    // the scratchpad is empty; only the fixed fallback notice below is a
    // diagnostic.
    let output;
    if (hasSalvage) {
        output = scratchpad.latest();
    }
    else if (lastOutput) {
        output = lastOutput;
    }
    else {
        output = `Agent exceeded max turns (${maxTurns}).`;
    }
    const outputIsDiagnostic = !(hasSalvage || lastOutput);
    const costUSD = effectiveClaudeCost(args.providerConfig, inputTokens, outputTokens, args.sdkCostUSD);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, args.parentModel);
    const result = {
        output,
        status: 'max_turns',
        usage: {
            inputTokens,
            outputTokens,
            totalTokens: inputTokens + outputTokens,
            costUSD,
            savedCostUSD,
        },
        turns: args.turns,
        filesRead: tracker.getReads(),
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten: tracker.getWrites(),
        toolCalls: tracker.getToolCalls(),
        outputIsDiagnostic,
        escalationLog: [],
        error: args.reason,
        durationMs: args.durationMs,
    };
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
783
/**
 * Render the plain-text diagnostic used as the output of an incomplete run
 * that has no salvaged text.
 *
 * The report lists turns, token counts and the files read and written.
 * Each file line shows the count, followed by up to ten names in
 * parentheses; beyond ten, the remainder is summarised as "… N more".
 *
 * @param opts Diagnostic inputs.
 * @param opts.turns Number of turns used.
 * @param opts.inputTokens Input token count.
 * @param opts.outputTokens Output token count.
 * @param opts.filesRead Names of files read; a missing or non-array value
 *   is treated as an empty list.
 * @param opts.filesWritten Names of files written; a missing or non-array
 *   value is treated as an empty list.
 * @returns The diagnostic text, lines joined with `\n`.
 */
function buildClaudeIncompleteDiagnostic(opts) {
    const MAX_SHOWN = 10;
    // This builder runs on a failure path: throwing on a missing file list
    // would replace the diagnostic with an unrelated TypeError, so absent
    // lists are reported as empty instead.
    const asList = (files) => (Array.isArray(files) ? files : []);
    const formatList = (files) => {
        if (files.length === 0) {
            return '';
        }
        if (files.length <= MAX_SHOWN) {
            return ` (${files.join(', ')})`;
        }
        return ` (${files.slice(0, MAX_SHOWN).join(', ')}, … ${files.length - MAX_SHOWN} more)`;
    };
    const filesRead = asList(opts.filesRead);
    const filesWritten = asList(opts.filesWritten);
    return [
        '[claude sub-agent terminated without producing a final answer]',
        '',
        'The query stream ended without ever emitting a result message. This usually means ' +
            'the agent loop exited prematurely or the SDK lost the final message.',
        '',
        `Turns used: ${opts.turns}`,
        `Input tokens: ${opts.inputTokens}`,
        `Output tokens: ${opts.outputTokens}`,
        `Files read: ${filesRead.length}${formatList(filesRead)}`,
        `Files written: ${filesWritten.length}${formatList(filesWritten)}`,
        '',
        'Recommended action: re-dispatch with a tighter brief, or check Claude Agent SDK logs.',
    ].join('\n');
}
118
807
  //# sourceMappingURL=claude-runner.js.map