@zhixuan92/multi-model-agent-core 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -6
- package/dist/config/schema.d.ts +27 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +13 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/context/context-block-store.d.ts +75 -0
- package/dist/context/context-block-store.d.ts.map +1 -0
- package/dist/context/context-block-store.js +82 -0
- package/dist/context/context-block-store.js.map +1 -0
- package/dist/context/expand-context-blocks.d.ts +20 -0
- package/dist/context/expand-context-blocks.d.ts.map +1 -0
- package/dist/context/expand-context-blocks.js +46 -0
- package/dist/context/expand-context-blocks.js.map +1 -0
- package/dist/delegate-with-escalation.d.ts +34 -0
- package/dist/delegate-with-escalation.d.ts.map +1 -0
- package/dist/delegate-with-escalation.js +168 -0
- package/dist/delegate-with-escalation.js.map +1 -0
- package/dist/index.d.ts +4 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/model-profiles.json +8 -4
- package/dist/provider.d.ts.map +1 -1
- package/dist/provider.js +7 -1
- package/dist/provider.js.map +1 -1
- package/dist/routing/model-profiles.d.ts +1 -0
- package/dist/routing/model-profiles.d.ts.map +1 -1
- package/dist/routing/model-profiles.js +4 -0
- package/dist/routing/model-profiles.js.map +1 -1
- package/dist/run-tasks.d.ts +26 -2
- package/dist/run-tasks.d.ts.map +1 -1
- package/dist/run-tasks.js +61 -19
- package/dist/run-tasks.js.map +1 -1
- package/dist/runners/claude-runner.d.ts.map +1 -1
- package/dist/runners/claude-runner.js +643 -32
- package/dist/runners/claude-runner.js.map +1 -1
- package/dist/runners/codex-runner.d.ts.map +1 -1
- package/dist/runners/codex-runner.js +473 -48
- package/dist/runners/codex-runner.js.map +1 -1
- package/dist/runners/error-classification.d.ts +30 -0
- package/dist/runners/error-classification.d.ts.map +1 -0
- package/dist/runners/error-classification.js +72 -0
- package/dist/runners/error-classification.js.map +1 -0
- package/dist/runners/injection-type.d.ts +17 -0
- package/dist/runners/injection-type.d.ts.map +1 -0
- package/dist/runners/injection-type.js +27 -0
- package/dist/runners/injection-type.js.map +1 -0
- package/dist/runners/openai-runner.d.ts +5 -0
- package/dist/runners/openai-runner.d.ts.map +1 -1
- package/dist/runners/openai-runner.js +508 -36
- package/dist/runners/openai-runner.js.map +1 -1
- package/dist/runners/prevention.d.ts +41 -0
- package/dist/runners/prevention.d.ts.map +1 -0
- package/dist/runners/prevention.js +68 -0
- package/dist/runners/prevention.js.map +1 -0
- package/dist/runners/supervision.d.ts +130 -0
- package/dist/runners/supervision.d.ts.map +1 -0
- package/dist/runners/supervision.js +238 -0
- package/dist/runners/supervision.js.map +1 -0
- package/dist/tools/claude-adapter.d.ts.map +1 -1
- package/dist/tools/claude-adapter.js +6 -3
- package/dist/tools/claude-adapter.js.map +1 -1
- package/dist/tools/definitions.d.ts +3 -1
- package/dist/tools/definitions.d.ts.map +1 -1
- package/dist/tools/definitions.js +56 -5
- package/dist/tools/definitions.js.map +1 -1
- package/dist/tools/openai-adapter.d.ts.map +1 -1
- package/dist/tools/openai-adapter.js +6 -3
- package/dist/tools/openai-adapter.js.map +1 -1
- package/dist/tools/scratchpad.d.ts +28 -0
- package/dist/tools/scratchpad.d.ts.map +1 -0
- package/dist/tools/scratchpad.js +49 -0
- package/dist/tools/scratchpad.js.map +1 -0
- package/dist/tools/tracker.d.ts +38 -2
- package/dist/tools/tracker.d.ts.map +1 -1
- package/dist/tools/tracker.js +54 -5
- package/dist/tools/tracker.js.map +1 -1
- package/dist/types.d.ts +184 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +17 -1
- package/dist/types.js.map +1 -1
- package/package.json +7 -3
|
@@ -1,8 +1,89 @@
|
|
|
1
1
|
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
2
|
-
import {
|
|
2
|
+
import { createHash } from 'node:crypto';
|
|
3
|
+
import { withTimeout, computeCostUSD, } from '../types.js';
|
|
3
4
|
import { FileTracker } from '../tools/tracker.js';
|
|
4
5
|
import { createToolImplementations } from '../tools/definitions.js';
|
|
5
6
|
import { createClaudeToolServer } from '../tools/claude-adapter.js';
|
|
7
|
+
import { TextScratchpad } from '../tools/scratchpad.js';
|
|
8
|
+
import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
|
|
9
|
+
import { validateCompletion, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, } from './supervision.js';
|
|
10
|
+
import { injectionTypeFor } from './injection-type.js';
|
|
11
|
+
import { classifyError } from './error-classification.js';
|
|
12
|
+
import { findModelProfile } from '../routing/model-profiles.js';
|
|
13
|
+
/**
 * Hard cap on supervision re-prompts before we give up and salvage. Same as
 * openai-runner; see spec A.2.2.
 *
 * The supervision loop increments its retry counter on each degenerate
 * result and exits as `incomplete` once the counter reaches this value.
 */
const MAX_SUPERVISION_RETRIES = 3;
|
|
18
|
+
/**
 * Minimal pushable async-iterable queue for feeding user messages to the
 * claude-agent-sdk `query()` in streaming-input mode.
 *
 * The SDK's `query({ prompt: string | AsyncIterable<SDKUserMessage>, ... })`
 * signature (see node_modules/@anthropic-ai/claude-agent-sdk/sdk.d.ts L1879-1882)
 * accepts an async iterable when we want multi-turn input — the intended
 * pathway for "push a follow-up user message into the current query without
 * restarting the CLI subprocess." The built-in `streamInput(...)` method on
 * the returned `Query` object (sdk.d.ts L1862) is documented as "used
 * internally for multi-turn conversations", and the only public way to
 * drive multi-turn input is via this iterable.
 *
 * This class is deliberately small:
 *   - `push(msg)` delivers a message to a waiting `next()` call, or buffers
 *     it if nobody is waiting yet. Pushes after `close()` are dropped.
 *   - `close()` signals end-of-stream. Messages already buffered are still
 *     delivered before the iterator reports `done`.
 *   - `[Symbol.asyncIterator]()` returns an iterator object (not a
 *     generator) whose `next()` yields buffered messages first and otherwise
 *     waits for the next push, and whose `return()` lets a consumer that
 *     stops early shut the queue down.
 *
 * All iterators obtained from one queue share the same buffer and waiters.
 */
class PushableUserMessageQueue {
    /** Messages pushed while no `next()` call was waiting, oldest first. */
    buffer = [];
    /** `resolve` callbacks of `next()` promises still waiting, oldest first. */
    resolvers = [];
    /** Becomes `true` on the first `close()`; never reset. */
    closed = false;
    /**
     * Hand `msg` to the oldest waiting `next()` call, or buffer it.
     * No-op once the queue is closed.
     */
    push(msg) {
        if (this.closed) {
            return;
        }
        const resolver = this.resolvers.shift();
        if (resolver) {
            resolver({ value: msg, done: false });
        }
        else {
            this.buffer.push(msg);
        }
    }
    /**
     * Mark the queue as finished and settle every waiting `next()` call with
     * `{ done: true }`. Idempotent.
     */
    close() {
        if (this.closed) {
            return;
        }
        this.closed = true;
        while (this.resolvers.length > 0) {
            const resolver = this.resolvers.shift();
            resolver({ value: undefined, done: true });
        }
    }
    [Symbol.asyncIterator]() {
        return {
            next: () => {
                // Drain buffered messages before honouring `closed`, so that
                // nothing pushed before close() is lost.
                if (this.buffer.length > 0) {
                    return Promise.resolve({ value: this.buffer.shift(), done: false });
                }
                if (this.closed) {
                    return Promise.resolve({ value: undefined, done: true });
                }
                return new Promise((resolve) => {
                    this.resolvers.push(resolve);
                });
            },
            // Called by `for await` (and other well-behaved consumers) when
            // iteration is abandoned early. The consumer will not read again,
            // so discard unread messages, close the queue, and settle any
            // outstanding next() promises instead of leaving them pending.
            return: () => {
                this.buffer.length = 0;
                this.close();
                return Promise.resolve({ value: undefined, done: true });
            },
        };
    }
}
|
|
76
|
+
/**
 * Wrap a plain string in the SDKUserMessage envelope the SDK expects when
 * using streaming input mode. Keeps the per-call sites tidy.
 *
 * @param {string} text - Message body; stored unmodified as `message.content`.
 * @returns {{ type: 'user', message: { role: 'user', content: string }, parent_tool_use_id: null }}
 *   A fresh envelope object on every call. `parent_tool_use_id` is always
 *   `null`.
 */
function userMessage(text) {
    return {
        type: 'user',
        message: { role: 'user', content: text },
        parent_tool_use_id: null,
    };
}
|
|
6
87
|
export async function runClaude(prompt, options, providerConfig, defaults) {
|
|
7
88
|
const maxTurns = options.maxTurns ?? providerConfig.maxTurns ?? defaults.maxTurns;
|
|
8
89
|
const timeoutMs = options.timeoutMs ?? providerConfig.timeoutMs ?? defaults.timeoutMs;
|
|
@@ -11,8 +92,76 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
|
|
|
11
92
|
const effort = options.effort ?? providerConfig.effort;
|
|
12
93
|
const sandboxPolicy = options.sandboxPolicy ?? providerConfig.sandboxPolicy ?? 'cwd-only';
|
|
13
94
|
const abortController = new AbortController();
|
|
14
|
-
|
|
95
|
+
// --- Progress event emission (Task 10) ----------------------------------
|
|
96
|
+
//
|
|
97
|
+
// `onProgress` is already wrapped in `safeSink` by the orchestrator
|
|
98
|
+
// (Task 8), so any throw from the consumer callback is swallowed
|
|
99
|
+
// upstream and cannot corrupt this loop. We do not need to wrap it
|
|
100
|
+
// again here.
|
|
101
|
+
const onProgress = options.onProgress;
|
|
102
|
+
const emit = (event) => {
|
|
103
|
+
if (onProgress)
|
|
104
|
+
onProgress(event);
|
|
105
|
+
};
|
|
106
|
+
// Hoisted so the FileTracker callback (closed over below) can read the
|
|
107
|
+
// running turn count at callback firing time. Unlike openai-runner — where
|
|
108
|
+
// the turn counter comes from `currentResult?.state.usage.requests + 1`
|
|
109
|
+
// because the SDK only bumps the counter after the call completes — the
|
|
110
|
+
// claude-runner increments `turns` at the top of every `msg.type ===
|
|
111
|
+
// 'assistant'` branch, which is PROCESSED BEFORE the SDK fires any tool
|
|
112
|
+
// calls for that turn. That means `turns` already holds the current
|
|
113
|
+
// turn number when the tracker callback fires mid-tool-loop, so we
|
|
114
|
+
// attribute tool calls to `turns` directly (no +1 offset).
|
|
115
|
+
let inputTokens = 0;
|
|
116
|
+
let outputTokens = 0;
|
|
117
|
+
let costUSD = null;
|
|
118
|
+
let turns = 0;
|
|
119
|
+
const tracker = new FileTracker((summary) => {
|
|
120
|
+
emit({ kind: 'tool_call', turn: turns, toolSummary: summary });
|
|
121
|
+
});
|
|
15
122
|
const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
|
|
123
|
+
// --- Prevention layer: system prompt + budget hint ---
|
|
124
|
+
//
|
|
125
|
+
// buildSystemPrompt() is deliberately static and parameter-free (same
|
|
126
|
+
// decision as openai-runner: Task 1 review rejected provider/maxTurns
|
|
127
|
+
// options). We append our discipline rules onto the `claude_code` preset
|
|
128
|
+
// rather than REPLACING the default system prompt, because replacing it
|
|
129
|
+
// strips the SDK's tool-usage guidance. See
|
|
130
|
+
// node_modules/@anthropic-ai/claude-agent-sdk/sdk.d.ts L1460-1465 for the
|
|
131
|
+
// systemPrompt union type — `{ type: 'preset', preset: 'claude_code',
|
|
132
|
+
// append: string }` is the intended "add to defaults" shape.
|
|
133
|
+
const systemPrompt = buildSystemPrompt();
|
|
134
|
+
const budgetHint = buildBudgetHint({ maxTurns });
|
|
135
|
+
const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
|
|
136
|
+
// --- onInitialRequest (Task 12) ----------------------------------------
|
|
137
|
+
//
|
|
138
|
+
// Fire once per attempt with the canonical orchestrator-side initial
|
|
139
|
+
// brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
|
|
140
|
+
// literal bytes the Anthropic SDK will send — the SDK wraps our
|
|
141
|
+
// systemPrompt in `{ type: 'preset', preset: 'claude_code', append: ... }`
|
|
142
|
+
// (see queryOptions.systemPrompt below), so the wire-level system prompt
|
|
143
|
+
// includes the claude_code preset bytes that precede ours. We hash the
|
|
144
|
+
// canonical form anyway for two reasons:
|
|
145
|
+
// 1. It matches openai-runner and codex-runner, which also don't hash
|
|
146
|
+
// literal wire bytes (they hash the same canonical form before the
|
|
147
|
+
// SDK wraps it in its own `messages` / Responses API structures).
|
|
148
|
+
// Cross-runner stability is the Task 12 design requirement.
|
|
149
|
+
// 2. It answers the "did the orchestrator send the same brief across
|
|
150
|
+
// retries?" question, which is the actual debugging use case — NOT
|
|
151
|
+
// "were the literal wire bytes identical?".
|
|
152
|
+
// See `AttemptRecord.initialPromptHash` in types.ts for the full caveat.
|
|
153
|
+
if (options.onInitialRequest) {
|
|
154
|
+
const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
|
|
155
|
+
try {
|
|
156
|
+
options.onInitialRequest({
|
|
157
|
+
lengthChars: canonicalInitialBrief.length,
|
|
158
|
+
sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
|
|
159
|
+
});
|
|
160
|
+
}
|
|
161
|
+
catch {
|
|
162
|
+
// Swallow — a broken callback must not affect dispatch.
|
|
163
|
+
}
|
|
164
|
+
}
|
|
16
165
|
// Permission bypass is intentional for sub-agent use. File-system confinement
|
|
17
166
|
// is enforced by assertWithinCwd in tool definitions when sandboxPolicy is 'cwd-only'.
|
|
18
167
|
const queryOptions = {
|
|
@@ -23,6 +172,11 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
|
|
|
23
172
|
allowDangerouslySkipPermissions: true,
|
|
24
173
|
persistSession: false,
|
|
25
174
|
abortController,
|
|
175
|
+
systemPrompt: {
|
|
176
|
+
type: 'preset',
|
|
177
|
+
preset: 'claude_code',
|
|
178
|
+
append: systemPrompt,
|
|
179
|
+
},
|
|
26
180
|
};
|
|
27
181
|
if (toolMode === 'full') {
|
|
28
182
|
const toolServer = createClaudeToolServer(toolImpls, sandboxPolicy);
|
|
@@ -46,73 +200,530 @@ export async function runClaude(prompt, options, providerConfig, defaults) {
|
|
|
46
200
|
// effort is typed as EffortLevel in Options; cast from string
|
|
47
201
|
queryOptions.effort = effort;
|
|
48
202
|
}
|
|
49
|
-
//
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
203
|
+
// --- Scratchpad: buffers every assistant text block we see streaming
|
|
204
|
+
// through the iterator. On any termination path (ok/incomplete/max_turns/
|
|
205
|
+
// error/timeout/force_salvage) we salvage `scratchpad.latest()` when the
|
|
206
|
+
// final `result.result` is empty or degenerate. ---
|
|
207
|
+
const scratchpad = new TextScratchpad();
|
|
208
|
+
// --- Watchdog: resolve the input-token soft limit once per run ---
|
|
209
|
+
const profile = findModelProfile(providerConfig.model);
|
|
210
|
+
const softLimit = resolveInputTokenSoftLimit(providerConfig, profile);
|
|
54
211
|
const run = async () => {
|
|
55
212
|
let output = '';
|
|
56
|
-
|
|
213
|
+
// --- Supervision / watchdog bookkeeping ---
|
|
214
|
+
let supervisionRetries = 0;
|
|
215
|
+
// Initialised to `null` (NOT ''): on the first turn there is no
|
|
216
|
+
// previous degenerate output to compare against, so the same-output
|
|
217
|
+
// early-out must be skipped. See openai-runner regression #5.
|
|
218
|
+
let lastDegenerateOutput = null;
|
|
219
|
+
// High-watermark guard for the watchdog warning nudge — fire at most
|
|
220
|
+
// once per distinct input-token level. Mirrors openai-runner.
|
|
221
|
+
let lastWarnedInputTokens = -1;
|
|
222
|
+
// --- Completed-result sentinel. Every exit from the supervision
|
|
223
|
+
// state machine inside the `for await` iterator sets this to a fully-
|
|
224
|
+
// built RunResult and then `break`s. After the loop, the one explicit
|
|
225
|
+
// return on the happy path is `completedResult`. This gives every
|
|
226
|
+
// exit (ok / incomplete / force_salvage / max_turns) a single
|
|
227
|
+
// explicit owner, mirroring openai-runner's `while (true) + return`
|
|
228
|
+
// shape but compatible with the for-await iterator contract. ---
|
|
229
|
+
let completedResult = null;
|
|
230
|
+
// --- Streaming input queue. See PushableUserMessageQueue docstring:
|
|
231
|
+
// using an async iterable as the `prompt` enables mid-run user-message
|
|
232
|
+
// injection (supervision re-prompts, re-grounding, budget-pressure
|
|
233
|
+
// nudges) without restarting the CLI subprocess. ---
|
|
234
|
+
const messageQueue = new PushableUserMessageQueue();
|
|
235
|
+
messageQueue.push(userMessage(promptWithBudgetHint));
|
|
57
236
|
try {
|
|
58
|
-
for await (const msg of query({ prompt, options: queryOptions })) {
|
|
237
|
+
for await (const msg of query({ prompt: messageQueue, options: queryOptions })) {
|
|
59
238
|
if (msg.type === 'assistant') {
|
|
60
239
|
turns++;
|
|
240
|
+
emit({ kind: 'turn_start', turn: turns, provider: 'claude' });
|
|
241
|
+
// Capture every assistant text block as scratchpad fodder. The
|
|
242
|
+
// claude-agent-sdk's BetaMessage.content is an array of blocks:
|
|
243
|
+
// `{ type: 'text', text } | { type: 'tool_use', ... } |
|
|
244
|
+
// { type: 'thinking', ... } | ...`. We only want plain text;
|
|
245
|
+
// tool_use blocks have no salvage value (they're side-effects)
|
|
246
|
+
// and thinking blocks are stripped before the caller sees them.
|
|
247
|
+
if ('message' in msg && msg.message && 'content' in msg.message) {
|
|
248
|
+
// The claude-agent-sdk's BetaMessage.content is typed as an
|
|
249
|
+
// array of content blocks — but historically the API sometimes
|
|
250
|
+
// delivers a bare string, so we defensively handle both. The
|
|
251
|
+
// string branch is narrow-typed to `never` by the SDK, so we
|
|
252
|
+
// cast through `unknown` to keep runtime safety without fighting
|
|
253
|
+
// the compiler.
|
|
254
|
+
const content = msg.message.content;
|
|
255
|
+
if (typeof content === 'string') {
|
|
256
|
+
scratchpad.append(turns, content);
|
|
257
|
+
if (content.length > 0) {
|
|
258
|
+
emit({
|
|
259
|
+
kind: 'text_emission',
|
|
260
|
+
turn: turns,
|
|
261
|
+
chars: content.length,
|
|
262
|
+
preview: content.slice(0, 200),
|
|
263
|
+
});
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
else if (Array.isArray(content)) {
|
|
267
|
+
const texts = content
|
|
268
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
269
|
+
.filter((c) => c && c.type === 'text' && typeof c.text === 'string')
|
|
270
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
271
|
+
.map((c) => c.text);
|
|
272
|
+
if (texts.length > 0) {
|
|
273
|
+
const joined = texts.join('\n');
|
|
274
|
+
scratchpad.append(turns, joined);
|
|
275
|
+
if (joined.length > 0) {
|
|
276
|
+
emit({
|
|
277
|
+
kind: 'text_emission',
|
|
278
|
+
turn: turns,
|
|
279
|
+
chars: joined.length,
|
|
280
|
+
preview: joined.slice(0, 200),
|
|
281
|
+
});
|
|
282
|
+
}
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
// --- Watchdog check (assistant-message cadence). We check
|
|
287
|
+
// `inputTokens` as accumulated from prior `result` messages.
|
|
288
|
+
// On the very first assistant message inputTokens is 0 and no
|
|
289
|
+
// threshold can fire; that's correct. This is also the ONLY
|
|
290
|
+
// site that handles `warning` — it logs AND pushes the nudge
|
|
291
|
+
// as one action. The post-result site only handles
|
|
292
|
+
// force_salvage. ---
|
|
293
|
+
const watchdogStatus = checkWatchdogThreshold(inputTokens, softLimit);
|
|
294
|
+
if (watchdogStatus !== 'ok') {
|
|
295
|
+
logWatchdogEvent(watchdogStatus, {
|
|
296
|
+
provider: 'claude',
|
|
297
|
+
model: providerConfig.model,
|
|
298
|
+
turn: turns,
|
|
299
|
+
inputTokens,
|
|
300
|
+
softLimit,
|
|
301
|
+
scratchpadChars: scratchpad.toString().length,
|
|
302
|
+
});
|
|
303
|
+
}
|
|
304
|
+
if (watchdogStatus === 'force_salvage') {
|
|
305
|
+
// `watchdog_force_salvage` is not an injected message — no
|
|
306
|
+
// re-prompt is sent — but observers still want to see why the
|
|
307
|
+
// run is being killed. We emit the event with
|
|
308
|
+
// `contentLengthChars: 0` to reflect the "nothing was injected,
|
|
309
|
+
// we just terminated" semantics (mirrors openai-runner).
|
|
310
|
+
emit({
|
|
311
|
+
kind: 'injection',
|
|
312
|
+
injectionType: 'watchdog_force_salvage',
|
|
313
|
+
turn: turns,
|
|
314
|
+
contentLengthChars: 0,
|
|
315
|
+
});
|
|
316
|
+
completedResult = buildClaudeForceSalvageResult({
|
|
317
|
+
tracker,
|
|
318
|
+
scratchpad,
|
|
319
|
+
providerConfig,
|
|
320
|
+
sdkCostUSD: costUSD,
|
|
321
|
+
inputTokens,
|
|
322
|
+
outputTokens,
|
|
323
|
+
turns,
|
|
324
|
+
softLimit,
|
|
325
|
+
});
|
|
326
|
+
messageQueue.close();
|
|
327
|
+
abortController.abort();
|
|
328
|
+
break;
|
|
329
|
+
}
|
|
330
|
+
// Fire the warning nudge at most once per distinct input-token
|
|
331
|
+
// high-watermark. We push a user message into the queue so the
|
|
332
|
+
// next turn of the conversation will address the budget-pressure
|
|
333
|
+
// prompt. If the nudge response is itself a valid final answer,
|
|
334
|
+
// the supervision loop on the NEXT `result` message will return
|
|
335
|
+
// `ok`. High-watermark guard prevents re-nudging if inputTokens
|
|
336
|
+
// stays the same across two assistant messages.
|
|
337
|
+
if (watchdogStatus === 'warning' && inputTokens > lastWarnedInputTokens) {
|
|
338
|
+
lastWarnedInputTokens = inputTokens;
|
|
339
|
+
const warning = buildBudgetPressureNudge({ inputTokens, softLimit });
|
|
340
|
+
emit({
|
|
341
|
+
kind: 'injection',
|
|
342
|
+
injectionType: 'watchdog_warning',
|
|
343
|
+
turn: turns,
|
|
344
|
+
contentLengthChars: warning.length,
|
|
345
|
+
});
|
|
346
|
+
messageQueue.push(userMessage(warning));
|
|
347
|
+
}
|
|
348
|
+
// --- Periodic re-grounding (best-effort in streaming-input
|
|
349
|
+
// mode): inject a reminder every RE_GROUNDING_INTERVAL_TURNS
|
|
350
|
+
// turns via the same queue. The iterator keeps reading until
|
|
351
|
+
// the CLI subprocess decides to emit a final result after it
|
|
352
|
+
// processes the new user message. ---
|
|
353
|
+
if (turns > 0 && turns % RE_GROUNDING_INTERVAL_TURNS === 0) {
|
|
354
|
+
const reground = buildReGroundingMessage({
|
|
355
|
+
originalPromptExcerpt: prompt,
|
|
356
|
+
currentTurn: turns,
|
|
357
|
+
maxTurns,
|
|
358
|
+
toolCallsSoFar: tracker.getToolCalls().length,
|
|
359
|
+
filesReadSoFar: tracker.getReads().length,
|
|
360
|
+
});
|
|
361
|
+
emit({
|
|
362
|
+
kind: 'injection',
|
|
363
|
+
injectionType: 'reground',
|
|
364
|
+
turn: turns,
|
|
365
|
+
contentLengthChars: reground.length,
|
|
366
|
+
});
|
|
367
|
+
messageQueue.push(userMessage(reground));
|
|
368
|
+
}
|
|
61
369
|
}
|
|
62
370
|
if (msg.type === 'result') {
|
|
63
371
|
if ('result' in msg) {
|
|
64
372
|
output = msg.result;
|
|
65
373
|
}
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
//
|
|
374
|
+
const hitMaxTurns = 'subtype' in msg && msg.subtype === 'error_max_turns';
|
|
375
|
+
// Extract usage from modelUsage or usage, then ACCUMULATE into
|
|
376
|
+
// the running inputTokens/outputTokens. Supervision retries in
|
|
377
|
+
// streaming-input mode push a new user message into the queue
|
|
378
|
+
// and the SDK emits a fresh `result` message per top-level user
|
|
379
|
+
// turn — we want the cumulative usage across every result we
|
|
380
|
+
// see, not just the last one. Accumulation keeps the watchdog
|
|
381
|
+
// soft-limit check honest across retries and produces correct
|
|
382
|
+
// totals on any termination path.
|
|
383
|
+
let turnInputTokens = 0;
|
|
384
|
+
let turnOutputTokens = 0;
|
|
70
385
|
if ('modelUsage' in msg && msg.modelUsage) {
|
|
71
386
|
for (const model of Object.values(msg.modelUsage)) {
|
|
72
|
-
|
|
73
|
-
|
|
387
|
+
turnInputTokens += model.inputTokens ?? 0;
|
|
388
|
+
turnOutputTokens += model.outputTokens ?? 0;
|
|
74
389
|
}
|
|
75
390
|
}
|
|
76
391
|
else if ('usage' in msg && msg.usage) {
|
|
77
392
|
const u = msg.usage;
|
|
78
|
-
|
|
79
|
-
|
|
393
|
+
turnInputTokens = u['input_tokens'] ?? 0;
|
|
394
|
+
turnOutputTokens = u['output_tokens'] ?? 0;
|
|
80
395
|
}
|
|
396
|
+
inputTokens += turnInputTokens;
|
|
397
|
+
outputTokens += turnOutputTokens;
|
|
81
398
|
if ('total_cost_usd' in msg && typeof msg.total_cost_usd === 'number') {
|
|
82
399
|
costUSD = msg.total_cost_usd;
|
|
83
400
|
}
|
|
401
|
+
// --- turn_complete: one event per result message (which
|
|
402
|
+
// corresponds to one top-level assistant turn from the SDK's
|
|
403
|
+
// perspective). Fires after usage aggregation so the cumulative
|
|
404
|
+
// counters are up-to-date.
|
|
405
|
+
emit({
|
|
406
|
+
kind: 'turn_complete',
|
|
407
|
+
turn: turns,
|
|
408
|
+
cumulativeInputTokens: inputTokens,
|
|
409
|
+
cumulativeOutputTokens: outputTokens,
|
|
410
|
+
});
|
|
411
|
+
// --- Watchdog check on the result message as well: input tokens
|
|
412
|
+
// have just jumped and we may now be in force_salvage territory.
|
|
413
|
+
// The post-result site ONLY handles force_salvage. `warning` is
|
|
414
|
+
// intentionally ignored here — the assistant-message-cadence site
|
|
415
|
+
// above is the single place that logs warnings AND pushes the
|
|
416
|
+
// nudge into the queue. Logging `warning` here without pushing a
|
|
417
|
+
// nudge would be misleading (suggests action that didn't happen).
|
|
418
|
+
const postResultWatchdog = checkWatchdogThreshold(inputTokens, softLimit);
|
|
419
|
+
if (postResultWatchdog === 'force_salvage') {
|
|
420
|
+
logWatchdogEvent(postResultWatchdog, {
|
|
421
|
+
provider: 'claude',
|
|
422
|
+
model: providerConfig.model,
|
|
423
|
+
turn: turns,
|
|
424
|
+
inputTokens,
|
|
425
|
+
softLimit,
|
|
426
|
+
scratchpadChars: scratchpad.toString().length,
|
|
427
|
+
});
|
|
428
|
+
emit({
|
|
429
|
+
kind: 'injection',
|
|
430
|
+
injectionType: 'watchdog_force_salvage',
|
|
431
|
+
turn: turns,
|
|
432
|
+
contentLengthChars: 0,
|
|
433
|
+
});
|
|
434
|
+
completedResult = buildClaudeForceSalvageResult({
|
|
435
|
+
tracker,
|
|
436
|
+
scratchpad,
|
|
437
|
+
providerConfig,
|
|
438
|
+
sdkCostUSD: costUSD,
|
|
439
|
+
inputTokens,
|
|
440
|
+
outputTokens,
|
|
441
|
+
turns,
|
|
442
|
+
softLimit,
|
|
443
|
+
});
|
|
444
|
+
messageQueue.close();
|
|
445
|
+
abortController.abort();
|
|
446
|
+
break;
|
|
447
|
+
}
|
|
448
|
+
// --- Max-turns: don't supervise a max-turns termination,
|
|
449
|
+
// build the max_turns result directly and exit. ---
|
|
450
|
+
if (hitMaxTurns) {
|
|
451
|
+
completedResult = buildClaudeMaxTurnsResult({
|
|
452
|
+
tracker,
|
|
453
|
+
scratchpad,
|
|
454
|
+
providerConfig,
|
|
455
|
+
sdkCostUSD: costUSD,
|
|
456
|
+
inputTokens,
|
|
457
|
+
outputTokens,
|
|
458
|
+
turns,
|
|
459
|
+
maxTurns,
|
|
460
|
+
lastOutput: output,
|
|
461
|
+
});
|
|
462
|
+
messageQueue.close();
|
|
463
|
+
break;
|
|
464
|
+
}
|
|
465
|
+
// --- Supervision: validate the captured output. Valid output
|
|
466
|
+
// is an immediate ok-exit. Degenerate output either re-prompts
|
|
467
|
+
// (and keeps reading the iterator) or — if the retry budget is
|
|
468
|
+
// spent / same-output early-out fires — exits as incomplete. ---
|
|
469
|
+
const validation = validateCompletion(output);
|
|
470
|
+
if (validation.valid) {
|
|
471
|
+
completedResult = buildClaudeOkResult({
|
|
472
|
+
tracker,
|
|
473
|
+
scratchpad,
|
|
474
|
+
providerConfig,
|
|
475
|
+
sdkCostUSD: costUSD,
|
|
476
|
+
inputTokens,
|
|
477
|
+
outputTokens,
|
|
478
|
+
turns,
|
|
479
|
+
output,
|
|
480
|
+
});
|
|
481
|
+
messageQueue.close();
|
|
482
|
+
break;
|
|
483
|
+
}
|
|
484
|
+
// Same-output early-out: don't burn another retry on identical
|
|
485
|
+
// garbage. Compare only when we have a previous degenerate.
|
|
486
|
+
if (lastDegenerateOutput !== null &&
|
|
487
|
+
sameDegenerateOutput(output, lastDegenerateOutput)) {
|
|
488
|
+
completedResult = buildClaudeIncompleteResult({
|
|
489
|
+
tracker,
|
|
490
|
+
scratchpad,
|
|
491
|
+
providerConfig,
|
|
492
|
+
sdkCostUSD: costUSD,
|
|
493
|
+
inputTokens,
|
|
494
|
+
outputTokens,
|
|
495
|
+
turns,
|
|
496
|
+
});
|
|
497
|
+
messageQueue.close();
|
|
498
|
+
break;
|
|
499
|
+
}
|
|
500
|
+
lastDegenerateOutput = output;
|
|
501
|
+
supervisionRetries++;
|
|
502
|
+
if (supervisionRetries >= MAX_SUPERVISION_RETRIES) {
|
|
503
|
+
completedResult = buildClaudeIncompleteResult({
|
|
504
|
+
tracker,
|
|
505
|
+
scratchpad,
|
|
506
|
+
providerConfig,
|
|
507
|
+
sdkCostUSD: costUSD,
|
|
508
|
+
inputTokens,
|
|
509
|
+
outputTokens,
|
|
510
|
+
turns,
|
|
511
|
+
});
|
|
512
|
+
messageQueue.close();
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
// Push the re-prompt and continue reading the iterator.
|
|
516
|
+
const rePrompt = buildRePrompt(validation);
|
|
517
|
+
emit({
|
|
518
|
+
kind: 'injection',
|
|
519
|
+
injectionType: injectionTypeFor(validation.kind),
|
|
520
|
+
turn: turns,
|
|
521
|
+
contentLengthChars: rePrompt.length,
|
|
522
|
+
});
|
|
523
|
+
messageQueue.push(userMessage(rePrompt));
|
|
84
524
|
}
|
|
85
525
|
}
|
|
86
526
|
}
|
|
87
527
|
catch (err) {
|
|
528
|
+
// Preserve partial usage — the scratchpad may have buffered text
|
|
529
|
+
// from turns that ran before the throw. Route the thrown error
|
|
530
|
+
// through the shared classifier so the escalation orchestrator can
|
|
531
|
+
// distinguish abort / network / HTTP-error / generic failure modes.
|
|
532
|
+
const { status, reason } = classifyError(err);
|
|
533
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
534
|
+
emit({ kind: 'done', status });
|
|
535
|
+
const hasSalvage = !scratchpad.isEmpty();
|
|
88
536
|
return {
|
|
89
|
-
output: `Sub-agent error: ${
|
|
90
|
-
status
|
|
91
|
-
usage: {
|
|
537
|
+
output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${msg}`,
|
|
538
|
+
status,
|
|
539
|
+
usage: {
|
|
540
|
+
inputTokens,
|
|
541
|
+
outputTokens,
|
|
542
|
+
totalTokens: inputTokens + outputTokens,
|
|
543
|
+
costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, costUSD),
|
|
544
|
+
},
|
|
92
545
|
turns,
|
|
93
|
-
|
|
94
|
-
|
|
546
|
+
filesRead: tracker.getReads(),
|
|
547
|
+
filesWritten: tracker.getWrites(),
|
|
548
|
+
toolCalls: tracker.getToolCalls(),
|
|
549
|
+
outputIsDiagnostic: !hasSalvage,
|
|
550
|
+
escalationLog: [],
|
|
551
|
+
error: msg || reason,
|
|
95
552
|
};
|
|
96
553
|
}
|
|
554
|
+
// Every `break` inside the iterator above assigned `completedResult`
|
|
555
|
+
// before exiting. If the iterator drained without any break (e.g. the
|
|
556
|
+
// SDK closed the stream cleanly without ever emitting a final
|
|
557
|
+
// `result`), synthesize an incomplete result so the caller always
|
|
558
|
+
// gets a meaningful diagnostic instead of undefined.
|
|
559
|
+
if (completedResult) {
|
|
560
|
+
emit({ kind: 'done', status: completedResult.status });
|
|
561
|
+
return completedResult;
|
|
562
|
+
}
|
|
563
|
+
const drained = buildClaudeIncompleteResult({
|
|
564
|
+
tracker,
|
|
565
|
+
scratchpad,
|
|
566
|
+
providerConfig,
|
|
567
|
+
sdkCostUSD: costUSD,
|
|
568
|
+
inputTokens,
|
|
569
|
+
outputTokens,
|
|
570
|
+
turns,
|
|
571
|
+
});
|
|
572
|
+
emit({ kind: 'done', status: drained.status });
|
|
573
|
+
return drained;
|
|
574
|
+
};
|
|
575
|
+
return withTimeout(run(), timeoutMs, () => {
|
|
576
|
+
emit({ kind: 'done', status: 'timeout' });
|
|
577
|
+
const hasSalvage = !scratchpad.isEmpty();
|
|
97
578
|
return {
|
|
98
|
-
output:
|
|
99
|
-
status:
|
|
579
|
+
output: hasSalvage ? scratchpad.latest() : `Agent timed out after ${timeoutMs}ms.`,
|
|
580
|
+
status: 'timeout',
|
|
581
|
+
filesRead: tracker.getReads(),
|
|
582
|
+
filesWritten: tracker.getWrites(),
|
|
583
|
+
toolCalls: tracker.getToolCalls(),
|
|
100
584
|
usage: {
|
|
101
585
|
inputTokens,
|
|
102
586
|
outputTokens,
|
|
103
587
|
totalTokens: inputTokens + outputTokens,
|
|
104
|
-
costUSD,
|
|
588
|
+
costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, costUSD),
|
|
105
589
|
},
|
|
106
590
|
turns,
|
|
107
|
-
|
|
591
|
+
outputIsDiagnostic: !hasSalvage,
|
|
592
|
+
escalationLog: [],
|
|
108
593
|
};
|
|
594
|
+
}, abortController);
|
|
595
|
+
}
|
|
596
|
+
/**
 * Chooses the cost figure reported for a Claude run.
 *
 * The value returned by `computeCostUSD` for the given token counts and
 * provider config takes precedence; the SDK-reported cost is used only when
 * that value is `null` or `undefined`.
 *
 * @param providerConfig Provider configuration forwarded to `computeCostUSD`.
 * @param inputTokens Input token count for the run.
 * @param outputTokens Output token count for the run.
 * @param sdkCost Cost reported by the SDK, used as the fallback.
 * @returns The computed cost when available, otherwise `sdkCost`.
 */
function effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCost) {
    const configuredCost = computeCostUSD(inputTokens, outputTokens, providerConfig);
    // Only null/undefined fall through; a computed cost of 0 is kept as-is.
    if (configuredCost === null || configuredCost === undefined) {
        return sdkCost;
    }
    return configuredCost;
}
|
|
600
|
+
/**
 * Assembles the result object for a run that finished with status `ok`.
 *
 * @param args Object carrying `tracker`, `providerConfig`, `sdkCostUSD`,
 *   `inputTokens`, `outputTokens`, `turns` and the final `output` text.
 * @returns The result record: output, status, usage (with effective cost),
 *   turns, the tracker's reads/writes/tool calls, and an empty escalation log.
 */
function buildClaudeOkResult(args) {
    const { tracker, providerConfig, sdkCostUSD, inputTokens, outputTokens, turns, output } = args;
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCostUSD),
    };
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const toolCalls = tracker.getToolCalls();
    return {
        output,
        status: 'ok',
        usage,
        turns,
        filesRead,
        filesWritten,
        toolCalls,
        // An `ok` result always holds a real model answer, never a diagnostic.
        outputIsDiagnostic: false,
        escalationLog: [],
    };
}
|
|
620
|
+
/**
 * Builds a result with status `incomplete`.
 *
 * Used on the supervision-exhausted path (retry cap hit or same-output
 * early-out) and when the stream drains without a completed result. The
 * latest scratchpad text is preferred as the output; when the scratchpad is
 * empty, the output is the text from `buildClaudeIncompleteDiagnostic` and
 * `outputIsDiagnostic` is set accordingly.
 *
 * @param args Object carrying `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens` and `turns`.
 * @returns The `incomplete` result record.
 */
function buildClaudeIncompleteResult(args) {
    const { tracker, scratchpad, providerConfig, sdkCostUSD, inputTokens, outputTokens, turns } = args;
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const salvaged = !scratchpad.isEmpty();
    let output;
    if (salvaged) {
        output = scratchpad.latest();
    }
    else {
        output = buildClaudeIncompleteDiagnostic({
            turns,
            inputTokens,
            outputTokens,
            filesRead,
            filesWritten,
        });
    }
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCostUSD),
    };
    const toolCalls = tracker.getToolCalls();
    return {
        output,
        status: 'incomplete',
        usage,
        turns,
        filesRead,
        filesWritten,
        toolCalls,
        outputIsDiagnostic: !salvaged,
        escalationLog: [],
    };
}
|
|
654
|
+
/**
 * Builds the `incomplete` result for a sub-agent that was forcibly
 * terminated at the input-token soft limit.
 *
 * The latest scratchpad text is used as the output when there is any;
 * otherwise the output is a bracketed notice naming the input token count
 * and the soft limit, and `outputIsDiagnostic` is true.
 *
 * @param args Object carrying `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens`, `turns` and `softLimit`.
 * @returns The `incomplete` result record.
 */
function buildClaudeForceSalvageResult(args) {
    const { tracker, scratchpad, providerConfig, sdkCostUSD, inputTokens, outputTokens, turns, softLimit } = args;
    const salvaged = !scratchpad.isEmpty();
    let output;
    if (salvaged) {
        output = scratchpad.latest();
    }
    else {
        output = `[claude sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`;
    }
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCostUSD),
    };
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const toolCalls = tracker.getToolCalls();
    return {
        output,
        status: 'incomplete',
        usage,
        turns,
        filesRead,
        filesWritten,
        toolCalls,
        outputIsDiagnostic: !salvaged,
        escalationLog: [],
    };
}
|
|
676
|
+
/**
 * Builds the result for a run that stopped with status `max_turns`.
 *
 * Output preference order: latest scratchpad text, then `lastOutput`, then
 * the fixed `Agent exceeded max turns (N).` message.
 *
 * @param args Object carrying `tracker`, `scratchpad`, `providerConfig`,
 *   `sdkCostUSD`, `inputTokens`, `outputTokens`, `turns`, `maxTurns` and
 *   `lastOutput`.
 * @returns The `max_turns` result record.
 */
function buildClaudeMaxTurnsResult(args) {
    const { tracker, scratchpad, providerConfig, sdkCostUSD, inputTokens, outputTokens, turns, maxTurns, lastOutput } = args;
    const salvaged = !scratchpad.isEmpty();
    // `lastOutput` is the model's last streamed text before the max-turns
    // boundary, i.e. real model content rather than a diagnostic template.
    // So the output only counts as a diagnostic when both the scratchpad
    // and `lastOutput` are empty and the fixed fallback message is used.
    let output;
    if (salvaged) {
        output = scratchpad.latest();
    }
    else if (lastOutput) {
        output = lastOutput;
    }
    else {
        output = `Agent exceeded max turns (${maxTurns}).`;
    }
    const outputIsDiagnostic = !(salvaged || lastOutput);
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: effectiveClaudeCost(providerConfig, inputTokens, outputTokens, sdkCostUSD),
    };
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const toolCalls = tracker.getToolCalls();
    return {
        output,
        status: 'max_turns',
        usage,
        turns,
        filesRead,
        filesWritten,
        toolCalls,
        outputIsDiagnostic,
        escalationLog: [],
    };
}
|
|
705
|
+
/**
 * Renders the multi-line diagnostic text used when a Claude sub-agent ends
 * without producing a final answer.
 *
 * The text lists turns used, input/output token counts, and the number of
 * files read and written. Each file count is followed by a parenthesised
 * list of names, capped at ten entries with a "… N more" suffix.
 *
 * @param opts Object with `turns`, `inputTokens`, `outputTokens`,
 *   `filesRead` and `filesWritten` (the last two are lists of names).
 * @returns The diagnostic message, lines joined with `\n`.
 */
function buildClaudeIncompleteDiagnostic(opts) {
    const MAX_SHOWN = 10;
    // Produces the " (a, b, …)" suffix for a file list; empty lists add nothing.
    const listSuffix = (files) => {
        const total = files.length;
        if (total === 0) {
            return '';
        }
        if (total > MAX_SHOWN) {
            const shown = files.slice(0, MAX_SHOWN).join(', ');
            return ` (${shown}, … ${total - MAX_SHOWN} more)`;
        }
        return ` (${files.join(', ')})`;
    };
    const { turns, inputTokens, outputTokens, filesRead, filesWritten } = opts;
    const explanation = 'The query stream ended without ever emitting a result message. This usually means the agent loop exited prematurely or the SDK lost the final message.';
    const stats = [
        `Turns used: ${turns}`,
        `Input tokens: ${inputTokens}`,
        `Output tokens: ${outputTokens}`,
        `Files read: ${filesRead.length}${listSuffix(filesRead)}`,
        `Files written: ${filesWritten.length}${listSuffix(filesWritten)}`,
    ];
    const lines = [
        '[claude sub-agent terminated without producing a final answer]',
        '',
        explanation,
        '',
        ...stats,
        '',
        'Recommended action: re-dispatch with a tighter brief, or check Claude Agent SDK logs.',
    ];
    return lines.join('\n');
}
|
|
118
729
|
//# sourceMappingURL=claude-runner.js.map
|