@zhixuan92/multi-model-agent-core 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +0 -6
  2. package/dist/config/schema.d.ts +73 -45
  3. package/dist/config/schema.d.ts.map +1 -1
  4. package/dist/config/schema.js +14 -0
  5. package/dist/config/schema.js.map +1 -1
  6. package/dist/context/context-block-store.d.ts +75 -0
  7. package/dist/context/context-block-store.d.ts.map +1 -0
  8. package/dist/context/context-block-store.js +82 -0
  9. package/dist/context/context-block-store.js.map +1 -0
  10. package/dist/context/expand-context-blocks.d.ts +20 -0
  11. package/dist/context/expand-context-blocks.d.ts.map +1 -0
  12. package/dist/context/expand-context-blocks.js +46 -0
  13. package/dist/context/expand-context-blocks.js.map +1 -0
  14. package/dist/delegate-with-escalation.d.ts +34 -0
  15. package/dist/delegate-with-escalation.d.ts.map +1 -0
  16. package/dist/delegate-with-escalation.js +172 -0
  17. package/dist/delegate-with-escalation.js.map +1 -0
  18. package/dist/index.d.ts +4 -1
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +3 -0
  21. package/dist/index.js.map +1 -1
  22. package/dist/model-profiles.json +42 -4
  23. package/dist/provider.d.ts.map +1 -1
  24. package/dist/provider.js +7 -1
  25. package/dist/provider.js.map +1 -1
  26. package/dist/routing/model-profiles.d.ts +9 -4
  27. package/dist/routing/model-profiles.d.ts.map +1 -1
  28. package/dist/routing/model-profiles.js +8 -0
  29. package/dist/routing/model-profiles.js.map +1 -1
  30. package/dist/run-tasks.d.ts +26 -2
  31. package/dist/run-tasks.d.ts.map +1 -1
  32. package/dist/run-tasks.js +61 -19
  33. package/dist/run-tasks.js.map +1 -1
  34. package/dist/runners/claude-runner.d.ts.map +1 -1
  35. package/dist/runners/claude-runner.js +721 -32
  36. package/dist/runners/claude-runner.js.map +1 -1
  37. package/dist/runners/codex-runner.d.ts.map +1 -1
  38. package/dist/runners/codex-runner.js +541 -48
  39. package/dist/runners/codex-runner.js.map +1 -1
  40. package/dist/runners/error-classification.d.ts +30 -0
  41. package/dist/runners/error-classification.d.ts.map +1 -0
  42. package/dist/runners/error-classification.js +72 -0
  43. package/dist/runners/error-classification.js.map +1 -0
  44. package/dist/runners/injection-type.d.ts +22 -0
  45. package/dist/runners/injection-type.d.ts.map +1 -0
  46. package/dist/runners/injection-type.js +34 -0
  47. package/dist/runners/injection-type.js.map +1 -0
  48. package/dist/runners/openai-runner.d.ts +5 -0
  49. package/dist/runners/openai-runner.d.ts.map +1 -1
  50. package/dist/runners/openai-runner.js +608 -36
  51. package/dist/runners/openai-runner.js.map +1 -1
  52. package/dist/runners/prevention.d.ts +41 -0
  53. package/dist/runners/prevention.d.ts.map +1 -0
  54. package/dist/runners/prevention.js +68 -0
  55. package/dist/runners/prevention.js.map +1 -0
  56. package/dist/runners/supervision.d.ts +137 -0
  57. package/dist/runners/supervision.d.ts.map +1 -0
  58. package/dist/runners/supervision.js +345 -0
  59. package/dist/runners/supervision.js.map +1 -0
  60. package/dist/tools/claude-adapter.d.ts.map +1 -1
  61. package/dist/tools/claude-adapter.js +6 -3
  62. package/dist/tools/claude-adapter.js.map +1 -1
  63. package/dist/tools/definitions.d.ts +3 -1
  64. package/dist/tools/definitions.d.ts.map +1 -1
  65. package/dist/tools/definitions.js +57 -5
  66. package/dist/tools/definitions.js.map +1 -1
  67. package/dist/tools/openai-adapter.d.ts.map +1 -1
  68. package/dist/tools/openai-adapter.js +6 -3
  69. package/dist/tools/openai-adapter.js.map +1 -1
  70. package/dist/tools/scratchpad.d.ts +28 -0
  71. package/dist/tools/scratchpad.d.ts.map +1 -0
  72. package/dist/tools/scratchpad.js +49 -0
  73. package/dist/tools/scratchpad.js.map +1 -0
  74. package/dist/tools/tracker.d.ts +42 -2
  75. package/dist/tools/tracker.d.ts.map +1 -1
  76. package/dist/tools/tracker.js +63 -5
  77. package/dist/tools/tracker.js.map +1 -1
  78. package/dist/types.d.ts +261 -2
  79. package/dist/types.d.ts.map +1 -1
  80. package/dist/types.js +43 -1
  81. package/dist/types.js.map +1 -1
  82. package/package.json +7 -3
@@ -1,8 +1,16 @@
1
- import { Agent, run as agentRun, setTracingDisabled, OpenAIChatCompletionsModel, MaxTurnsExceededError } from '@openai/agents';
2
- import { withTimeout } from '../types.js';
1
+ import { Agent, run as agentRun, setTracingDisabled, OpenAIChatCompletionsModel, MaxTurnsExceededError, } from '@openai/agents';
2
+ import { createHash } from 'node:crypto';
3
+ import { withTimeout, computeCostUSD, computeSavedCostUSD, } from '../types.js';
4
+ import { trimProgressTrace } from './supervision.js';
5
+ import { injectionTypeFor } from './injection-type.js';
3
6
  import { FileTracker } from '../tools/tracker.js';
4
7
  import { createToolImplementations } from '../tools/definitions.js';
5
8
  import { createOpenAITools } from '../tools/openai-adapter.js';
9
+ import { TextScratchpad } from '../tools/scratchpad.js';
10
+ import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
11
+ import { validateCompletion, validateCoverage, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, THINKING_DIAGNOSTIC_MARKER, } from './supervision.js';
12
+ import { classifyError } from './error-classification.js';
13
+ import { findModelProfile } from '../routing/model-profiles.js';
6
14
  // Disable tracing — not all OpenAI-compatible providers support it
7
15
  setTracingDisabled(true);
8
16
  /**
@@ -12,9 +20,60 @@ setTracingDisabled(true);
12
20
  * chain-of-thought inline wrapped in `<think>...</think>` tags. These are
13
21
  * scratch-pad content and should not surface to the caller. Stripping is
14
22
  * non-greedy, multi-line, and handles multiple blocks.
23
+ *
24
+ * If the entire input was reasoning (stripping leaves nothing), return an
25
+ * explicit marker instead of an empty string. Silently swallowing
26
+ * "all thinking, no answer" responses leaves the caller with `output: ""`
27
+ * and no idea what happened — see the openai-runner empty-output diagnostic.
15
28
  */
16
29
  export function stripThinkingTags(text) {
17
- return text.replace(/<think>[\s\S]*?<\/think>\s*/gi, '').trimStart();
30
+ if (!text)
31
+ return '';
32
+ const stripped = text.replace(/<think>[\s\S]*?<\/think>\s*/gi, '').trimStart();
33
+ if (!stripped && /<think>[\s\S]*?<\/think>/i.test(text)) {
34
+ return THINKING_DIAGNOSTIC_MARKER;
35
+ }
36
+ return stripped;
37
+ }
38
+ /**
39
+ * MAX_SUPERVISION_RETRIES (declared below): hard cap on supervision re-prompts before we give up and salvage. Three is
40
+ * the value chosen in the spec (A.2.2): enough room for the model to recover
41
+ * from a one-off fragment but not so many that a wedged model can burn the
42
+ * budget via repeated re-prompts.
43
+ */
44
+ /** Maximum turns for each continuation (reprompt/reground/watchdog-warning) in the
45
+ * supervision loop. Higher than the old hardcoded 1 so the model can call a tool
46
+ * and reply to the tool result without immediately exhausting the sub-budget. */
47
+ const SUPERVISION_CONTINUATION_BUDGET = 5;
48
+ const MAX_SUPERVISION_RETRIES = 3;
49
+ /**
50
+ * Extract every assistant text emission from a single `agentRun(...)` result.
51
+ * See the SDK introspection finding in supervision.ts: `result.newItems` is a
52
+ * discriminated union and entries of type `"message_output_item"` wrap an
53
+ * `AssistantMessageItem` whose `content` is a list of `{ type: 'output_text',
54
+ * text }` / `refusal` / `audio` / `image` parts. We concatenate every
55
+ * `output_text` part from every assistant `message_output_item`. Refusals
56
+ * and non-text parts are ignored (they have no salvage value for a
57
+ * text-in-text-out sub-agent).
58
+ */
59
+ function extractAssistantText(newItems) {
60
+ const chunks = [];
61
+ for (const item of newItems) {
62
+ if (item.type !== 'message_output_item')
63
+ continue;
64
+ const raw = item.rawItem;
65
+ if (raw.role !== 'assistant')
66
+ continue;
67
+ const content = raw.content;
68
+ if (!Array.isArray(content))
69
+ continue;
70
+ for (const part of content) {
71
+ if (part.type === 'output_text' && typeof part.text === 'string') {
72
+ chunks.push(part.text);
73
+ }
74
+ }
75
+ }
76
+ return chunks.join('');
18
77
  }
19
78
  export async function runOpenAI(prompt, options, runner) {
20
79
  const maxTurns = options.maxTurns ?? runner.providerConfig.maxTurns ?? runner.defaults.maxTurns;
@@ -24,7 +83,41 @@ export async function runOpenAI(prompt, options, runner) {
24
83
  const effort = options.effort ?? runner.providerConfig.effort;
25
84
  const sandboxPolicy = options.sandboxPolicy ?? runner.providerConfig.sandboxPolicy ?? 'cwd-only';
26
85
  const abortController = new AbortController();
27
- const tracker = new FileTracker();
86
+ // --- Task timing + parent model (Task 9) --------------------------------
87
+ const taskStartMs = Date.now();
88
+ const parentModel = options.parentModel;
89
+ // --- Progress trace capture (Task 10) ---------------------------------
90
+ const shouldCaptureTrace = options.includeProgressTrace ?? false;
91
+ const traceBuffer = [];
92
+ // --- Progress event emission (Task 9) -----------------------------------
93
+ //
94
+ // `onProgress` is already wrapped in `safeSink` by the orchestrator
95
+ // (Task 8), so any throw from the consumer callback is swallowed
96
+ // upstream and cannot corrupt this loop. We do not need to wrap it
97
+ // again here.
98
+ const onProgress = options.onProgress;
99
+ const emit = (event) => {
100
+ if (shouldCaptureTrace)
101
+ traceBuffer.push(event);
102
+ if (onProgress)
103
+ onProgress(event);
104
+ };
105
+ // Hoisted out of `run()` so the withTimeout callback (which runs in a
106
+ // different microtask chain) can still read partial usage from the last
107
+ // successful agentRun. `run()` updates this on every turn. Declared
108
+ // here (before the tracker) so the FileTracker callback closure can
109
+ // reference it without a TDZ issue at construction.
110
+ let currentResult;
111
+ // The tracker fires `onToolCall` synchronously inside every
112
+ // `trackToolCall(...)` — which itself is called from inside a tool
113
+ // implementation during an `agentRun` turn. That means `currentResult`
114
+ // may still hold the PREVIOUS turn's request count when the callback
115
+ // fires. We read it with an optional chain + fallback and attribute
116
+ // the tool call to the in-flight turn (previous turn + 1).
117
+ const tracker = new FileTracker((summary) => {
118
+ const inflightTurn = (currentResult?.state.usage.requests ?? 0) + 1;
119
+ emit({ kind: 'tool_call', turn: inflightTurn, toolSummary: summary });
120
+ });
28
121
  const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
29
122
  const fileTools = toolMode === 'full' ? createOpenAITools(toolImpls, sandboxPolicy) : [];
30
123
  // Add hosted tools (web_search, image_generation, etc.) if configured — only when tools are enabled
@@ -33,60 +126,539 @@ export async function runOpenAI(prompt, options, runner) {
33
126
  ? (runner.providerConfig.hostedTools ?? []).map(t => ({ type: t }))
34
127
  : [];
35
128
  const tools = [...fileTools, ...hostedTools];
129
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
36
130
  const model = new OpenAIChatCompletionsModel(runner.client, runner.providerConfig.model);
131
+ // --- Prevention layer: system prompt + budget hint ---
132
+ //
133
+ // buildSystemPrompt() is deliberately static and parameter-free. The Task 1
134
+ // review rejected speculative `providerLabel` / `maxTurns` parameters — the
135
+ // system prompt is generic ~400 tokens of discipline that applies to every
136
+ // provider. Per-turn budget information is threaded through buildBudgetHint
137
+ // (prepended to the first user prompt) and buildReGroundingMessage
138
+ // (injected every RE_GROUNDING_INTERVAL_TURNS turns).
139
+ const systemPrompt = buildSystemPrompt();
140
+ const budgetHint = buildBudgetHint({ maxTurns });
141
+ const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
142
+ // --- onInitialRequest (Task 12) ----------------------------------------
143
+ //
144
+ // Fire once per attempt with the canonical orchestrator-side initial
145
+ // brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
146
+ // literal request body the `@openai/agents` SDK transmits — the SDK
147
+ // wraps our systemPrompt in the Agent `instructions` field and our
148
+ // user prompt in a messages array. We hash the canonical form instead
149
+ // so the hash is cross-runner stable: the same canonical brief on any
150
+ // of the three runners produces the same hash, even though each SDK's
151
+ // wire format differs. This answers "did the orchestrator send the
152
+ // same brief across retries?" — not "were the literal wire bytes
153
+ // identical?". See `AttemptRecord.initialPromptHash` in types.ts for
154
+ // the full caveat. We guard with try/catch because the orchestrator
155
+ // owns the callback and a throw would corrupt its closure (symmetry
156
+ // with safeSink around onProgress).
157
+ if (options.onInitialRequest) {
158
+ const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
159
+ try {
160
+ options.onInitialRequest({
161
+ lengthChars: canonicalInitialBrief.length,
162
+ sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
163
+ });
164
+ }
165
+ catch {
166
+ // Swallow — a broken callback must not affect dispatch.
167
+ }
168
+ }
37
169
  const agent = new Agent({
38
170
  name: 'sub-agent',
39
171
  model,
40
- instructions: 'You are a helpful assistant. Complete the task given to you. Use the provided tools when needed.',
172
+ instructions: systemPrompt,
41
173
  tools,
42
174
  ...(effort && effort !== 'none' && {
43
175
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
44
176
  modelSettings: { reasoning: { effort: effort } },
45
177
  }),
46
178
  });
179
+ // --- Watchdog: resolve the input-token soft limit once per run ---
180
+ const profile = findModelProfile(runner.providerConfig.model);
181
+ const softLimit = resolveInputTokenSoftLimit(runner.providerConfig, profile);
182
+ // --- Scratchpad: buffers assistant text across every agentRun() call so
183
+ // that every termination path (ok/incomplete/max_turns/error/timeout/
184
+ // force_salvage) can return the best text we heard, even if the final
185
+ // message is junk. ---
186
+ const scratchpad = new TextScratchpad();
187
+ /**
188
+ * Build an AgentInputItem[] for continuing `prev` with a new user message.
189
+ *
190
+ * @openai/agents does NOT expose a `conversation:` option on `run()`. It
191
+ * accepts `string | AgentInputItem[] | RunState` as the input. The idiomatic
192
+ * way to "continue" a completed run with a new user turn is to pass
193
+ * `[...prev.history, { role: 'user', content: newText }]` as the input on
194
+ * the next call. `result.history` is the full conversation (system prompt
195
+ * + original input + every new item generated during the run), typed as
196
+ * `AgentInputItem[]`. See node_modules/@openai/agents-core/dist/run.d.ts
197
+ * line 182 and result.d.ts line 84 (history getter).
198
+ */
199
+ const continueWith = (prev, nextUserMessage) => {
200
+ const history = prev.history;
201
+ return [
202
+ ...history,
203
+ { role: 'user', content: nextUserMessage },
204
+ ];
205
+ };
206
+ /**
207
+ * Local helper: run one agent turn and buffer its assistant text into
208
+ * the scratchpad. Closes over `agent`, `abortController`, `scratchpad`
209
+ * and `emit` so every call site in `run()` is just one line AND every
210
+ * turn automatically emits the correct `turn_start` / `text_emission`
211
+ * / `turn_complete` progress events.
212
+ *
213
+ * Event ordering:
214
+ * 1. `turn_start` — fires BEFORE agentRun. Turn number is the NEXT
215
+ * request count (prev + 1) because the SDK won't bump
216
+ * `state.usage.requests` until the call completes.
217
+ * 2. `text_emission` — fires AFTER scratchpad.append, only when the
218
+ * stripped assistant text is non-empty. Skipping empty emissions
219
+ * keeps the event stream useful (empty-text turns are observable
220
+ * via `turn_complete` alone).
221
+ * 3. `turn_complete` — fires AFTER agentRun, with the post-call
222
+ * cumulative usage from `result.state.usage`.
223
+ */
224
+ const runTurnAndBuffer = async (input, turnBudget) => {
225
+ const nextTurn = (currentResult?.state.usage.requests ?? 0) + 1;
226
+ emit({ kind: 'turn_start', turn: nextTurn, provider: 'openai-compatible' });
227
+ const result = (await agentRun(agent, input, {
228
+ maxTurns: turnBudget,
229
+ signal: abortController.signal,
230
+ }));
231
+ const text = stripThinkingTags(extractAssistantText(result.newItems));
232
+ scratchpad.append(result.state.usage.requests, text);
233
+ if (text.length > 0) {
234
+ emit({
235
+ kind: 'text_emission',
236
+ turn: result.state.usage.requests,
237
+ chars: text.length,
238
+ preview: text.slice(0, 200),
239
+ });
240
+ }
241
+ emit({
242
+ kind: 'turn_complete',
243
+ turn: result.state.usage.requests,
244
+ cumulativeInputTokens: result.state.usage.inputTokens,
245
+ cumulativeOutputTokens: result.state.usage.outputTokens,
246
+ });
247
+ return result;
248
+ };
47
249
  const run = async () => {
48
250
  try {
49
- const result = await agentRun(agent, prompt, { maxTurns, signal: abortController.signal });
50
- const usage = result.state.usage;
51
- return {
52
- output: stripThinkingTags(result.finalOutput ?? ''),
53
- status: 'ok',
54
- usage: {
55
- inputTokens: usage.inputTokens,
56
- outputTokens: usage.outputTokens,
57
- totalTokens: usage.totalTokens,
58
- costUSD: null,
59
- },
60
- turns: usage.requests,
61
- files: tracker.getFiles(),
62
- };
251
+ currentResult = await runTurnAndBuffer(promptWithBudgetHint, maxTurns);
252
+ let supervisionRetries = 0;
253
+ // Continuation-exhausted flag: set by the loop when runContinuationTurn
254
+ // reports a MaxTurnsExceededError on a watchdog-warning, re-prompt, or
255
+ // re-ground continuation. The break then lands in the exhausted handler so we
256
+ // don't conflate a continuation sub-budget exhaustion with the user-declared maxTurns limit.
257
+ let supervisionExhausted = false;
258
+ // Initialized to `null` (NOT ''): on the first turn there is no
259
+ // previous degenerate output to compare against, so the
260
+ // same-output early-out must be skipped. Initialising to ''
261
+ // would cause `sameDegenerateOutput('', '')` to fire on a first-
262
+ // turn empty output and break the loop before retries run.
263
+ let lastDegenerateOutput = null;
264
+ // Track the input-token count at which we last fired a warning
265
+ // nudge. This prevents nudging twice in a row for the same
266
+ // `currentResult` when validation still fails after a nudge
267
+ // response: the next loop iteration will see
268
+ // `currentInputTokens <= lastWarnedInputTokens` and fall through
269
+ // to validation / re-prompt instead of re-issuing the nudge.
270
+ let lastWarnedInputTokens = -1;
271
+ let lastValidationKind = undefined;
272
+ /**
273
+ * Wraps a continuation turn (re-prompt or re-ground) that uses a small
274
+ * fixed budget. Catches MaxTurnsExceededError from the SDK and returns a
275
+ * discriminated union so callers can handle exhaustion without conflating it
276
+ * with the user-declared maxTurns limit.
277
+ */
278
+ async function runContinuationTurn(currentResult, instruction, budget) {
279
+ try {
280
+ const result = await runTurnAndBuffer(continueWith(currentResult, instruction), budget);
281
+ return { ok: true, result };
282
+ }
283
+ catch (err) {
284
+ if (err instanceof MaxTurnsExceededError) {
285
+ return { ok: false, cause: err, label: 'continuation_exhausted', turnAtFailure: currentResult.state.usage.requests };
286
+ }
287
+ throw err;
288
+ }
289
+ }
290
+ // Supervision loop. On each iteration we:
291
+ // 1. Check the watchdog (may force-terminate or nudge)
292
+ // 2. Validate the final message (may re-prompt)
293
+ // 3. After a re-prompt, inject re-grounding when the turn count is a multiple of RE_GROUNDING_INTERVAL_TURNS
294
+ // A single pass where validateCompletion returns `valid` is the clean
295
+ // exit. Otherwise we either re-prompt (and loop) or salvage.
296
+ // eslint-disable-next-line no-constant-condition
297
+ while (true) {
298
+ // --- Watchdog check ---
299
+ const currentInputTokens = currentResult.state.usage.inputTokens;
300
+ const watchdogStatus = checkWatchdogThreshold(currentInputTokens, softLimit);
301
+ if (watchdogStatus !== 'ok') {
302
+ logWatchdogEvent(watchdogStatus, {
303
+ provider: 'openai-compatible',
304
+ model: runner.providerConfig.model,
305
+ turn: currentResult.state.usage.requests,
306
+ inputTokens: currentInputTokens,
307
+ softLimit,
308
+ scratchpadChars: scratchpad.toString().length,
309
+ });
310
+ }
311
+ if (watchdogStatus === 'force_salvage') {
312
+ // `watchdog_force_salvage` is not an injected message — no
313
+ // re-prompt is sent — but observers still want to see exactly
314
+ // why the run is being killed. We emit the event with a
315
+ // `contentLengthChars` of 0 to reflect the "nothing was
316
+ // injected, we just terminated" semantics.
317
+ emit({
318
+ kind: 'injection',
319
+ injectionType: 'watchdog_force_salvage',
320
+ turn: currentResult.state.usage.requests,
321
+ contentLengthChars: 0,
322
+ });
323
+ const salvaged = buildForceSalvageResult(currentResult, scratchpad, tracker, runner.providerConfig, softLimit, Date.now() - taskStartMs, parentModel, shouldCaptureTrace ? traceBuffer : undefined);
324
+ emit({ kind: 'done', status: salvaged.status });
325
+ return salvaged;
326
+ }
327
+ // Warning-band nudge: fire at most once per distinct input-token
328
+ // level. We dispatch the nudge turn, append to the scratchpad,
329
+ // record the new high-watermark, and then FALL THROUGH to the
330
+ // validation block below — the nudge response might itself be a
331
+ // perfectly valid final answer, so we must validate it in the
332
+ // SAME iteration. Without the fall-through, a valid nudge
333
+ // response would be thrown away and the loop would grind until
334
+ // force_salvage (pre-fix bug #1).
335
+ if (watchdogStatus === 'warning' && currentInputTokens > lastWarnedInputTokens) {
336
+ const warning = buildBudgetPressureNudge({
337
+ inputTokens: currentInputTokens,
338
+ softLimit,
339
+ });
340
+ emit({
341
+ kind: 'injection',
342
+ injectionType: 'watchdog_warning',
343
+ turn: currentResult.state.usage.requests,
344
+ contentLengthChars: warning.length,
345
+ });
346
+ lastWarnedInputTokens = currentInputTokens;
347
+ const warningCont = await runContinuationTurn(currentResult, warning, SUPERVISION_CONTINUATION_BUDGET);
348
+ if (!warningCont.ok) {
349
+ supervisionExhausted = true;
350
+ break;
351
+ }
352
+ currentResult = warningCont.result;
353
+ }
354
+ // --- Validation check ---
355
+ const stripped = stripThinkingTags(currentResult.finalOutput ?? '');
356
+ const validation = validateCompletion(stripped);
357
+ // NEW: coverage check — only runs when syntactic validation passes
358
+ if (validation.valid && options.expectedCoverage) {
359
+ const coverageValidation = validateCoverage(stripped, options.expectedCoverage);
360
+ if (!coverageValidation.valid) {
361
+ // Treat identically to a degenerate validation — same retry logic
362
+ validation.valid = false;
363
+ validation.kind = coverageValidation.kind;
364
+ validation.reason = coverageValidation.reason;
365
+ }
366
+ }
367
+ if (validation.valid) {
368
+ const ok = buildOkResult(stripped, currentResult, tracker, runner.providerConfig, Date.now() - taskStartMs, parentModel, shouldCaptureTrace ? traceBuffer : undefined);
369
+ emit({ kind: 'done', status: ok.status });
370
+ return ok;
371
+ }
372
+ // Track last validation kind so the exhausted handler can report it.
373
+ lastValidationKind = validation.kind;
374
+ // Degenerate. Apply same-output early-out (only when we have a
375
+ // prior degenerate output to compare against) and retry budget.
376
+ if (lastDegenerateOutput !== null && sameDegenerateOutput(stripped, lastDegenerateOutput))
377
+ break;
378
+ lastDegenerateOutput = stripped;
379
+ supervisionRetries++;
380
+ if (supervisionRetries >= MAX_SUPERVISION_RETRIES)
381
+ break;
382
+ // --- Re-prompt the model to recover ---
383
+ const rePrompt = buildRePrompt(validation);
384
+ emit({
385
+ kind: 'injection',
386
+ injectionType: injectionTypeFor(validation.kind),
387
+ turn: currentResult.state.usage.requests,
388
+ contentLengthChars: rePrompt.length,
389
+ });
390
+ // Give the model a small budget to recover: each re-prompt gets up to
391
+ // SUPERVISION_CONTINUATION_BUDGET turns for the "emit your final answer" nudge.
392
+ const rePromptCont = await runContinuationTurn(currentResult, rePrompt, SUPERVISION_CONTINUATION_BUDGET);
393
+ if (!rePromptCont.ok) {
394
+ supervisionExhausted = true;
395
+ break;
396
+ }
397
+ currentResult = rePromptCont.result;
398
+ // --- Periodic re-grounding ---
399
+ const turnsSoFar = currentResult.state.usage.requests;
400
+ if (turnsSoFar > 0 && turnsSoFar % RE_GROUNDING_INTERVAL_TURNS === 0) {
401
+ const reground = buildReGroundingMessage({
402
+ originalPromptExcerpt: prompt,
403
+ currentTurn: turnsSoFar,
404
+ maxTurns,
405
+ toolCallsSoFar: tracker.getToolCalls().length,
406
+ filesReadSoFar: tracker.getReads().length,
407
+ });
408
+ emit({
409
+ kind: 'injection',
410
+ injectionType: 'reground',
411
+ turn: currentResult.state.usage.requests,
412
+ contentLengthChars: reground.length,
413
+ });
414
+ const regroundCont = await runContinuationTurn(currentResult, reground, SUPERVISION_CONTINUATION_BUDGET);
415
+ if (!regroundCont.ok) {
416
+ supervisionExhausted = true;
417
+ break;
418
+ }
419
+ currentResult = regroundCont.result;
420
+ }
421
+ }
422
+ // Supervision exhausted (either retry budget or same-output early-out or
423
+ // continuation-exhausted break). Salvage from the scratchpad if we have
424
+ // anything; otherwise return the existing incomplete diagnostic.
425
+ const exhaustedReason = supervisionExhausted
426
+ ? `supervision continuation sub-budget exhausted at turn ${currentResult.state.usage.requests}`
427
+ : `supervision loop exhausted after ${supervisionRetries} re-prompts (last kind: ${lastValidationKind ?? 'unknown'})`;
428
+ const exhausted = buildSupervisionExhaustedResult(currentResult, scratchpad, tracker, runner.providerConfig, Date.now() - taskStartMs, parentModel, shouldCaptureTrace ? traceBuffer : undefined, { reason: exhaustedReason });
429
+ emit({ kind: 'done', status: exhausted.status });
430
+ return exhausted;
63
431
  }
64
432
  catch (err) {
65
433
  if (err instanceof MaxTurnsExceededError) {
434
+ // max_turns path: prefer scratchpad salvage over the bare diagnostic.
435
+ // Preserve whatever partial usage we accumulated in the last
436
+ // successful agentRun so the caller sees real numbers, not zeros.
437
+ const filesRead = tracker.getReads();
438
+ const filesWritten = tracker.getWrites();
439
+ const toolCalls = tracker.getToolCalls();
440
+ const partial = partialUsage(currentResult, runner.providerConfig);
441
+ const savedCostUSD = computeSavedCostUSD(partial.costUSD, partial.inputTokens, partial.outputTokens, parentModel);
442
+ emit({ kind: 'done', status: 'max_turns' });
443
+ const hasSalvage = !scratchpad.isEmpty();
444
+ const turnsAtFailure = currentResult?.state.usage.requests ?? maxTurns;
66
445
  return {
67
- output: `Agent exceeded max turns (${maxTurns}).`,
446
+ output: hasSalvage
447
+ ? scratchpad.latest()
448
+ : `Agent exceeded max turns (${maxTurns}).`,
68
449
  status: 'max_turns',
69
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0, costUSD: null },
70
- turns: maxTurns,
71
- files: tracker.getFiles(),
450
+ error: `agent exhausted user-declared maxTurns limit (${maxTurns}) after ${turnsAtFailure} turns`,
451
+ usage: { ...partial, savedCostUSD },
452
+ turns: turnsAtFailure,
453
+ filesRead,
454
+ directoriesListed: tracker.getDirectoriesListed(),
455
+ filesWritten,
456
+ toolCalls,
457
+ outputIsDiagnostic: !hasSalvage,
458
+ escalationLog: [],
459
+ durationMs: Date.now() - taskStartMs,
460
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
72
461
  };
73
462
  }
463
+ // Classify the thrown error into a finer-grained RunStatus so the
464
+ // escalation orchestrator (and downstream observers) can distinguish
465
+ // abort / network / HTTP-error / generic failure modes. We still
466
+ // surface the original error message as the `error` field — the
467
+ // classifier's `reason` is deliberately a stable category label and
468
+ // NOT the human-readable message.
469
+ const { status, reason } = classifyError(err);
470
+ const msg = err instanceof Error ? err.message : String(err);
471
+ emit({ kind: 'done', status });
472
+ const hasSalvage = !scratchpad.isEmpty();
473
+ const partial = partialUsage(currentResult, runner.providerConfig);
474
+ const savedCostUSD = computeSavedCostUSD(partial.costUSD, partial.inputTokens, partial.outputTokens, parentModel);
74
475
  return {
75
- output: `Sub-agent error: ${err instanceof Error ? err.message : String(err)}`,
76
- status: 'error',
77
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0, costUSD: null },
78
- turns: 0,
79
- files: tracker.getFiles(),
80
- error: err instanceof Error ? err.message : String(err),
476
+ output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${msg}`,
477
+ status,
478
+ usage: { ...partial, savedCostUSD },
479
+ turns: currentResult?.state.usage.requests ?? 0,
480
+ filesRead: tracker.getReads(),
481
+ directoriesListed: tracker.getDirectoriesListed(),
482
+ filesWritten: tracker.getWrites(),
483
+ toolCalls: tracker.getToolCalls(),
484
+ outputIsDiagnostic: !hasSalvage,
485
+ escalationLog: [],
486
+ error: msg || reason,
487
+ durationMs: Date.now() - taskStartMs,
488
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
81
489
  };
82
490
  }
83
491
  };
84
- return withTimeout(run(), timeoutMs, () => ({
85
- output: `Agent timed out after ${timeoutMs}ms.`,
86
- status: 'timeout',
87
- files: tracker.getFiles(),
88
- usage: { inputTokens: 0, outputTokens: 0, totalTokens: 0, costUSD: null },
89
- turns: maxTurns,
90
- }), abortController);
492
+ return withTimeout(run(), timeoutMs, () => {
493
+ emit({ kind: 'done', status: 'timeout' });
494
+ const hasSalvage = !scratchpad.isEmpty();
495
+ const partial = partialUsage(currentResult, runner.providerConfig);
496
+ const savedCostUSD = computeSavedCostUSD(partial.costUSD, partial.inputTokens, partial.outputTokens, parentModel);
497
+ return {
498
+ output: hasSalvage
499
+ ? scratchpad.latest()
500
+ : `Agent timed out after ${timeoutMs}ms.`,
501
+ status: 'timeout',
502
+ filesRead: tracker.getReads(),
503
+ directoriesListed: tracker.getDirectoriesListed(),
504
+ filesWritten: tracker.getWrites(),
505
+ toolCalls: tracker.getToolCalls(),
506
+ // Preserve partial usage from the last successful agentRun so the
507
+ // caller sees real numbers, not zeros, on a timeout.
508
+ usage: { ...partial, savedCostUSD },
509
+ turns: currentResult?.state.usage.requests ?? maxTurns,
510
+ outputIsDiagnostic: !hasSalvage,
511
+ escalationLog: [],
512
+ durationMs: Date.now() - taskStartMs,
513
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
514
+ };
515
+ }, abortController);
516
+ }
517
+ // --- Helpers: canonical return-shape builders -------------------------------
518
/**
 * Assemble the canonical result object for a run that ended with status 'ok'.
 *
 * Token counts and the turn count are read from `currentResult.state.usage`;
 * file and tool activity is read from `tracker`. Cost figures come from
 * `computeCostUSD` / `computeSavedCostUSD`.
 *
 * @param output          Text to expose as the result's `output`.
 * @param currentResult   Run result exposing `state.usage`.
 * @param tracker         Object exposing getReads / getDirectoriesListed /
 *                        getWrites / getToolCalls.
 * @param providerConfig  Forwarded to `computeCostUSD`.
 * @param durationMs      Copied onto the result unchanged.
 * @param parentModel     Forwarded to `computeSavedCostUSD`.
 * @param traceBuffer     When truthy, a trimmed copy is attached as
 *                        `progressTrace`; otherwise the key is omitted.
 * @returns The result object with status 'ok'.
 */
function buildOkResult(output, currentResult, tracker, providerConfig, durationMs, parentModel, traceBuffer) {
    const { inputTokens, outputTokens, totalTokens, requests } = currentResult.state.usage;
    const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
    const result = {
        output,
        status: 'ok',
        usage: { inputTokens, outputTokens, totalTokens, costUSD, savedCostUSD },
        turns: requests,
        filesRead: tracker.getReads(),
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten: tracker.getWrites(),
        toolCalls: tracker.getToolCalls(),
        // `ok` always carries a real model answer — never a diagnostic.
        outputIsDiagnostic: false,
        escalationLog: [],
        durationMs,
    };
    // Attach the trace last so it stays the final key, and only when one was captured.
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
544
/**
 * Assemble the canonical result object with status 'incomplete'.
 *
 * The output is the scratchpad's latest text when the scratchpad is not
 * empty; otherwise it is the diagnostic text produced by
 * `buildIncompleteDiagnostic`, and `outputIsDiagnostic` is set to true.
 *
 * @param currentResult   Run result exposing `state.usage`.
 * @param scratchpad      Object exposing isEmpty() and latest().
 * @param tracker         Object exposing getReads / getDirectoriesListed /
 *                        getWrites / getToolCalls.
 * @param providerConfig  Forwarded to `computeCostUSD`.
 * @param durationMs      Copied onto the result unchanged.
 * @param parentModel     Forwarded to `computeSavedCostUSD`.
 * @param traceBuffer     When truthy, a trimmed copy is attached as
 *                        `progressTrace`.
 * @param opts            Optional; a truthy `opts.reason` is exposed as `error`.
 * @returns The result object with status 'incomplete'.
 */
function buildSupervisionExhaustedResult(currentResult, scratchpad, tracker, providerConfig, durationMs, parentModel, traceBuffer, opts) {
    const { inputTokens, outputTokens, totalTokens, requests } = currentResult.state.usage;
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const toolCalls = tracker.getToolCalls();
    const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
    const salvaged = !scratchpad.isEmpty();
    let output;
    if (salvaged) {
        output = scratchpad.latest();
    }
    else {
        output = buildIncompleteDiagnostic({
            providerLabel: 'openai-compatible',
            turns: requests,
            inputTokens,
            outputTokens,
            filesRead,
            filesWritten,
        });
    }
    const result = {
        output,
        status: 'incomplete',
        usage: { inputTokens, outputTokens, totalTokens, costUSD, savedCostUSD },
        turns: requests,
        filesRead,
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten,
        toolCalls,
        outputIsDiagnostic: !salvaged,
        escalationLog: [],
    };
    // Optional keys are added in sequence so `error` stays ahead of
    // `durationMs` and `progressTrace` stays last.
    if (opts?.reason) {
        result.error = opts.reason;
    }
    result.durationMs = durationMs;
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
583
/**
 * Assemble the canonical 'incomplete' result for a forcibly terminated run.
 *
 * The output is the scratchpad's latest text when the scratchpad is not
 * empty; otherwise it is a one-line bracketed notice quoting the input-token
 * count and `softLimit`, and `outputIsDiagnostic` is set to true.
 *
 * @param currentResult   Run result exposing `state.usage`.
 * @param scratchpad      Object exposing isEmpty() and latest().
 * @param tracker         Object exposing getReads / getDirectoriesListed /
 *                        getWrites / getToolCalls.
 * @param providerConfig  Forwarded to `computeCostUSD`.
 * @param softLimit       Value quoted in the fallback notice.
 * @param durationMs      Copied onto the result unchanged.
 * @param parentModel     Forwarded to `computeSavedCostUSD`.
 * @param traceBuffer     When truthy, a trimmed copy is attached as
 *                        `progressTrace`.
 * @returns The result object with status 'incomplete'.
 */
function buildForceSalvageResult(currentResult, scratchpad, tracker, providerConfig, softLimit, durationMs, parentModel, traceBuffer) {
    const { inputTokens, outputTokens, totalTokens, requests } = currentResult.state.usage;
    const filesRead = tracker.getReads();
    const filesWritten = tracker.getWrites();
    const toolCalls = tracker.getToolCalls();
    const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
    const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
    const salvaged = !scratchpad.isEmpty();
    let output;
    if (salvaged) {
        output = scratchpad.latest();
    }
    else {
        output = `[openai-compatible sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`;
    }
    const result = {
        output,
        status: 'incomplete',
        usage: { inputTokens, outputTokens, totalTokens, costUSD, savedCostUSD },
        turns: requests,
        filesRead,
        directoriesListed: tracker.getDirectoriesListed(),
        filesWritten,
        toolCalls,
        outputIsDiagnostic: !salvaged,
        escalationLog: [],
        durationMs,
    };
    // Attach the trace last so it stays the final key, and only when one was captured.
    if (traceBuffer) {
        result.progressTrace = trimProgressTrace(traceBuffer);
    }
    return result;
}
614
/**
 * Produce the multi-line diagnostic text used in place of a final answer
 * when a run ended without usable output. The text reports the provider
 * label, turns used, input/output token counts, and how many files were read
 * and written (with a formatted name list whenever a count is non-zero), and
 * closes with a recommended next step.
 *
 * @param opts  Object with providerLabel, turns, inputTokens, outputTokens,
 *              filesRead and filesWritten (the last two are arrays).
 * @returns The diagnostic lines joined with '\n'.
 */
function buildIncompleteDiagnostic(opts) {
    const { providerLabel, turns, inputTokens, outputTokens, filesRead, filesWritten } = opts;
    // Count, followed by the parenthesised name list only when there is something to list.
    const countWithNames = (files) => {
        if (files.length > 0) {
            return `${files.length} (${formatFileList(files)})`;
        }
        return `${files.length}`;
    };
    const header = `[${providerLabel} sub-agent terminated without producing a final answer]`;
    const explanation = 'The agent loop ended on a message with no tool calls and no plain-text content. This usually means one of:';
    const causes = [
        ' • the model emitted only <think> reasoning, then stopped',
        ' • the model produced a conversational fragment instead of a final answer',
        ' • a tool call was malformed and the SDK treated the response as terminal',
    ];
    const stats = [
        `Turns used: ${turns}`,
        `Input tokens: ${inputTokens}`,
        `Output tokens: ${outputTokens}`,
        `Files read: ${countWithNames(filesRead)}`,
        `Files written: ${countWithNames(filesWritten)}`,
    ];
    const advice = 'Recommended action: re-dispatch with a tighter, more explicit brief, or escalate to a higher-tier provider.';
    return [header, '', explanation, ...causes, '', ...stats, '', advice].join('\n');
}
640
/**
 * Render a list of file names as one comma-separated string. Lists of up to
 * ten entries are shown in full; longer lists show the first ten followed by
 * ", … N more", where N is the number of entries left out.
 *
 * @param files  Array of file names.
 * @returns The formatted string ('' for an empty array).
 */
function formatFileList(files) {
    const limit = 10;
    const fitsWithinLimit = files.length <= limit;
    const shown = fitsWithinLimit ? files : files.slice(0, limit);
    const joined = shown.join(', ');
    return fitsWithinLimit ? joined : `${joined}, … ${files.length - limit} more`;
}
646
/**
 * Extract the usage figures accumulated on `result`, for return paths that
 * did not finish normally. With a falsy `result` every token count is 0 and
 * the cost is null; otherwise the counts are copied from
 * `result.state.usage` and the cost is computed with `computeCostUSD`.
 *
 * @param result          Run result exposing `state.usage`, or a falsy value
 *                        when there is none.
 * @param providerConfig  Forwarded to `computeCostUSD`.
 * @returns An object with inputTokens, outputTokens, totalTokens and costUSD.
 */
function partialUsage(result, providerConfig) {
    if (result) {
        const { inputTokens, outputTokens, totalTokens } = result.state.usage;
        const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
        return { inputTokens, outputTokens, totalTokens, costUSD };
    }
    // Nothing was recorded: report zeros and an unknown cost.
    return { inputTokens: 0, outputTokens: 0, totalTokens: 0, costUSD: null };
}
92
664
  //# sourceMappingURL=openai-runner.js.map