@zhixuan92/multi-model-agent-core 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. package/README.md +0 -6
  2. package/dist/config/schema.d.ts +27 -0
  3. package/dist/config/schema.d.ts.map +1 -1
  4. package/dist/config/schema.js +13 -0
  5. package/dist/config/schema.js.map +1 -1
  6. package/dist/context/context-block-store.d.ts +75 -0
  7. package/dist/context/context-block-store.d.ts.map +1 -0
  8. package/dist/context/context-block-store.js +82 -0
  9. package/dist/context/context-block-store.js.map +1 -0
  10. package/dist/context/expand-context-blocks.d.ts +20 -0
  11. package/dist/context/expand-context-blocks.d.ts.map +1 -0
  12. package/dist/context/expand-context-blocks.js +46 -0
  13. package/dist/context/expand-context-blocks.js.map +1 -0
  14. package/dist/delegate-with-escalation.d.ts +34 -0
  15. package/dist/delegate-with-escalation.d.ts.map +1 -0
  16. package/dist/delegate-with-escalation.js +168 -0
  17. package/dist/delegate-with-escalation.js.map +1 -0
  18. package/dist/index.d.ts +4 -1
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +3 -0
  21. package/dist/index.js.map +1 -1
  22. package/dist/model-profiles.json +8 -4
  23. package/dist/provider.d.ts.map +1 -1
  24. package/dist/provider.js +7 -1
  25. package/dist/provider.js.map +1 -1
  26. package/dist/routing/model-profiles.d.ts +1 -0
  27. package/dist/routing/model-profiles.d.ts.map +1 -1
  28. package/dist/routing/model-profiles.js +4 -0
  29. package/dist/routing/model-profiles.js.map +1 -1
  30. package/dist/run-tasks.d.ts +26 -2
  31. package/dist/run-tasks.d.ts.map +1 -1
  32. package/dist/run-tasks.js +61 -19
  33. package/dist/run-tasks.js.map +1 -1
  34. package/dist/runners/claude-runner.d.ts.map +1 -1
  35. package/dist/runners/claude-runner.js +643 -32
  36. package/dist/runners/claude-runner.js.map +1 -1
  37. package/dist/runners/codex-runner.d.ts.map +1 -1
  38. package/dist/runners/codex-runner.js +473 -48
  39. package/dist/runners/codex-runner.js.map +1 -1
  40. package/dist/runners/error-classification.d.ts +30 -0
  41. package/dist/runners/error-classification.d.ts.map +1 -0
  42. package/dist/runners/error-classification.js +72 -0
  43. package/dist/runners/error-classification.js.map +1 -0
  44. package/dist/runners/injection-type.d.ts +17 -0
  45. package/dist/runners/injection-type.d.ts.map +1 -0
  46. package/dist/runners/injection-type.js +27 -0
  47. package/dist/runners/injection-type.js.map +1 -0
  48. package/dist/runners/openai-runner.d.ts +5 -0
  49. package/dist/runners/openai-runner.d.ts.map +1 -1
  50. package/dist/runners/openai-runner.js +508 -36
  51. package/dist/runners/openai-runner.js.map +1 -1
  52. package/dist/runners/prevention.d.ts +41 -0
  53. package/dist/runners/prevention.d.ts.map +1 -0
  54. package/dist/runners/prevention.js +68 -0
  55. package/dist/runners/prevention.js.map +1 -0
  56. package/dist/runners/supervision.d.ts +130 -0
  57. package/dist/runners/supervision.d.ts.map +1 -0
  58. package/dist/runners/supervision.js +238 -0
  59. package/dist/runners/supervision.js.map +1 -0
  60. package/dist/tools/claude-adapter.d.ts.map +1 -1
  61. package/dist/tools/claude-adapter.js +6 -3
  62. package/dist/tools/claude-adapter.js.map +1 -1
  63. package/dist/tools/definitions.d.ts +3 -1
  64. package/dist/tools/definitions.d.ts.map +1 -1
  65. package/dist/tools/definitions.js +56 -5
  66. package/dist/tools/definitions.js.map +1 -1
  67. package/dist/tools/openai-adapter.d.ts.map +1 -1
  68. package/dist/tools/openai-adapter.js +6 -3
  69. package/dist/tools/openai-adapter.js.map +1 -1
  70. package/dist/tools/scratchpad.d.ts +28 -0
  71. package/dist/tools/scratchpad.d.ts.map +1 -0
  72. package/dist/tools/scratchpad.js +49 -0
  73. package/dist/tools/scratchpad.js.map +1 -0
  74. package/dist/tools/tracker.d.ts +38 -2
  75. package/dist/tools/tracker.d.ts.map +1 -1
  76. package/dist/tools/tracker.js +54 -5
  77. package/dist/tools/tracker.js.map +1 -1
  78. package/dist/types.d.ts +184 -2
  79. package/dist/types.d.ts.map +1 -1
  80. package/dist/types.js +17 -1
  81. package/dist/types.js.map +1 -1
  82. package/package.json +9 -15
@@ -1,9 +1,16 @@
1
1
  import OpenAI from 'openai';
2
2
  import { z } from 'zod';
3
+ import { createHash } from 'node:crypto';
3
4
  import { getCodexAuth } from '../auth/codex-oauth.js';
4
- import { withTimeout } from '../types.js';
5
+ import { withTimeout, computeCostUSD, } from '../types.js';
5
6
  import { FileTracker } from '../tools/tracker.js';
6
7
  import { createToolImplementations } from '../tools/definitions.js';
8
+ import { TextScratchpad } from '../tools/scratchpad.js';
9
+ import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
10
+ import { validateCompletion, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, } from './supervision.js';
11
+ import { injectionTypeFor } from './injection-type.js';
12
+ import { classifyError } from './error-classification.js';
13
+ import { findModelProfile } from '../routing/model-profiles.js';
7
14
  // CODEX_DEBUG=1 causes the runner to log raw HTTP request/response bodies to
8
15
  // stderr. Those bodies routinely include the user's prompt, file contents,
9
16
  // tool arguments, and other sensitive data — fine for local debugging,
@@ -16,6 +23,11 @@ if (process.env.CODEX_DEBUG === '1') {
16
23
  'bodies (including prompts and file contents) will be logged to stderr. ' +
17
24
  'Disable in any environment where logs may be retained or shared.');
18
25
  }
26
+ /**
27
+ * Hard cap on supervision re-prompts before we give up and salvage. Three is
28
+ * the value chosen in the spec (A.2.2); mirrors openai-runner and claude-runner.
29
+ */
30
+ const MAX_SUPERVISION_RETRIES = 3;
19
31
  export function createCodexClient(capture) {
20
32
  const debug = process.env.CODEX_DEBUG === '1';
21
33
  // A custom fetch that tees error-response bodies into `capture`.
@@ -146,7 +158,33 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
146
158
  const sandboxPolicy = options.sandboxPolicy ?? providerConfig.sandboxPolicy ?? 'cwd-only';
147
159
  const effort = options.effort ?? providerConfig.effort;
148
160
  const abortController = new AbortController();
149
- const tracker = new FileTracker();
161
+ // --- Progress event emission (Task 11) ----------------------------------
162
+ //
163
+ // `onProgress` is already wrapped in `safeSink` by the orchestrator
164
+ // (Task 8), so any throw from the consumer callback is swallowed
165
+ // upstream and cannot corrupt this loop. We do not need to wrap it
166
+ // again here.
167
+ const onProgress = options.onProgress;
168
+ const emit = (event) => {
169
+ if (onProgress)
170
+ onProgress(event);
171
+ };
172
+ // Accumulated state (hoisted so the timeout callback can read partial
173
+ // progress, AND so the FileTracker callback closure — constructed below
174
+ // — can read the running turn count at firing time).
175
+ //
176
+ // Turn attribution for tool calls: in codex-runner, tool calls fire in
177
+ // the tool-execution loop AFTER the model's stream for that turn has
178
+ // completed but BEFORE the next iteration of `while` starts. The `turns`
179
+ // variable already reflects the current turn at that point (it was
180
+ // incremented at the top of the iteration), so the callback can read it
181
+ // directly — no +1 offset.
182
+ let inputTokens = 0;
183
+ let outputTokens = 0;
184
+ let turns = 0;
185
+ const tracker = new FileTracker((summary) => {
186
+ emit({ kind: 'tool_call', turn: turns, toolSummary: summary });
187
+ });
150
188
  const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
151
189
  const codexTools = toolMode === 'full' ? buildCodexTools(toolImpls, sandboxPolicy) : [];
152
190
  const toolsByName = new Map(codexTools.map(t => [t.name, t]));
@@ -167,29 +205,108 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
167
205
  ? configuredHostedTools.map(t => ({ type: t }))
168
206
  : [];
169
207
  const allTools = [...responsesTools, ...hostedTools];
170
- // Accumulated state (hoisted so the timeout callback can read partial progress)
171
- let inputTokens = 0;
172
- let outputTokens = 0;
173
- let turns = 0;
208
+ // --- Prevention layer: system prompt + budget hint ---
209
+ //
210
+ // buildSystemPrompt() is deliberately static and parameter-free (same
211
+ // decision as openai-runner and claude-runner: Task 1 review rejected
212
+ // provider/maxTurns options). The budget hint is prepended to the user
213
+ // prompt so the model sees it as part of its task brief, while the system
214
+ // prompt is threaded through the Responses API `instructions` field.
215
+ const systemPrompt = buildSystemPrompt();
216
+ const budgetHint = buildBudgetHint({ maxTurns });
217
+ const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
218
+ // --- onInitialRequest (Task 12) ----------------------------------------
219
+ //
220
+ // Fire once per attempt with the canonical orchestrator-side initial
221
+ // brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
222
+ // literal request body the OpenAI Responses API transmits — codex
223
+ // sends the systemPrompt via the Responses API `instructions` field
224
+ // and the user prompt as a structured `input` message array. We hash
225
+ // the canonical form instead so the hash is cross-runner stable:
226
+ // openai-runner and claude-runner compute the same hash from the same
227
+ // canonical string. See `AttemptRecord.initialPromptHash` in types.ts
228
+ // for the full wire-level caveat.
229
+ if (options.onInitialRequest) {
230
+ const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
231
+ try {
232
+ options.onInitialRequest({
233
+ lengthChars: canonicalInitialBrief.length,
234
+ sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
235
+ });
236
+ }
237
+ catch {
238
+ // Swallow — a broken callback must not affect dispatch.
239
+ }
240
+ }
241
+ // --- Scratchpad: buffers every text emission the codex backend streams
242
+ // through our loop. Every termination path (ok / incomplete / max_turns /
243
+ // error / timeout / force_salvage) salvages `scratchpad.latest()` when
244
+ // the final message is empty or degenerate. ---
245
+ const scratchpad = new TextScratchpad();
246
+ // --- Watchdog: resolve the input-token soft limit once per run ---
247
+ const profile = findModelProfile(providerConfig.model);
248
+ const softLimit = resolveInputTokenSoftLimit(providerConfig, profile);
174
249
  const run = async () => {
175
250
  const capture = {};
176
251
  const client = createCodexClient(capture);
177
252
  const input = [
178
253
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
179
- { role: 'user', content: prompt },
254
+ { role: 'user', content: promptWithBudgetHint },
180
255
  ];
181
256
  let output = '';
257
+ // --- Abort-path investigation (plan Step 2) ---------------------------
258
+ //
259
+ // The 2026-04-10 Fate dispatch captured an error "Request was aborted |
260
+ // last response status: completed". The "completed" suffix was
261
+ // misleading: it was captured from a PREVIOUS successful turn, not the
262
+ // failed one. Mechanism:
263
+ //
264
+ // 1. Turn N's stream emits `response.completed` with status
265
+ // `'completed'`. We update `lastResponseStatus = 'completed'`.
266
+ // 2. Turn N+1 starts; `client.responses.create(...)` opens a new
267
+ // stream, but the abort signal fires before any
268
+ // `response.completed` event is received.
269
+ // 3. The thrown error is caught below. The catch branch reads
270
+ // `lastResponseStatus` — which is STILL `'completed'` from turn N
271
+ // — and appends it as "last response status: completed", making
272
+ // the error look like it originated from a successful response.
273
+ //
274
+ // Fix: track which turn the status was captured on. If the status was
275
+ // NOT captured on the current (failed) turn, swap the suffix for a
276
+ // note attributing it to the earlier turn, so it is never mistaken for
277
+ // this failure's status. Users saw the misleading suffix and wasted time
278
+ // debugging a phantom "the request completed but was aborted" condition
279
+ // that doesn't exist.
182
280
  let lastResponseStatus = null;
281
+ let lastResponseStatusTurn = null;
282
+ // --- Supervision / watchdog bookkeeping ---
283
+ let supervisionRetries = 0;
284
+ // Initialised to `null` (NOT ''): on the first turn there is no
285
+ // previous degenerate output to compare against, so the same-output
286
+ // early-out must be skipped. Initialising to '' would cause
287
+ // sameDegenerateOutput('', '') to fire on a first-turn empty output
288
+ // and break the loop before any retries run. See openai-runner
289
+ // regression #5.
290
+ let lastDegenerateOutput = null;
291
+ // High-watermark guard for the watchdog warning nudge — fire at most
292
+ // once per distinct input-token level. Mirrors openai-runner and
293
+ // claude-runner.
294
+ let lastWarnedInputTokens = -1;
183
295
  try {
184
296
  while (turns < maxTurns) {
185
297
  turns++;
298
+ // Emit turn_start AFTER incrementing so `turn` matches the 1-indexed
299
+ // turn number we use everywhere else in this runner (the scratchpad
300
+ // append, watchdog logs, error diagnostics, result.turns).
301
+ emit({ kind: 'turn_start', turn: turns, provider: 'codex' });
186
302
  // Codex backend requires streaming. The Codex backend's
187
303
  // `response.completed` event does NOT populate `response.output` —
188
304
  // we must accumulate content from individual stream events.
189
- // `instructions` is required (mirrors gumi-agent's proven shape).
305
+ // `instructions` carries the prevention-layer system prompt; the
306
+ // per-run budget hint is already prepended to the first user input.
190
307
  const stream = await client.responses.create({
191
308
  model: providerConfig.model,
192
- instructions: prompt,
309
+ instructions: systemPrompt,
193
310
  input,
194
311
  stream: true,
195
312
  store: false,
@@ -241,8 +358,10 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
241
358
  inputTokens += r.usage.input_tokens ?? 0;
242
359
  outputTokens += r.usage.output_tokens ?? 0;
243
360
  }
244
- if (r?.status)
361
+ if (r?.status) {
245
362
  lastResponseStatus = r.status;
363
+ lastResponseStatusTurn = turns;
364
+ }
246
365
  }
247
366
  }
248
367
  if (process.env.CODEX_DEBUG === '1') {
@@ -256,6 +375,20 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
256
375
  if (!sawCompleted) {
257
376
  throw new Error('Codex stream ended without a response.completed event');
258
377
  }
378
+ // Buffer this turn's text into the scratchpad BEFORE any exit so
379
+ // every termination path (including supervision exhaustion and
380
+ // force_salvage) can salvage it. Codex does not emit <think> tags
381
+ // by default, so there is no stripping step here.
382
+ if (textThisTurn) {
383
+ scratchpad.append(turns, textThisTurn);
384
+ emit({
385
+ kind: 'text_emission',
386
+ turn: turns,
387
+ chars: textThisTurn.length,
388
+ preview: textThisTurn.slice(0, 200),
389
+ });
390
+ output = textThisTurn;
391
+ }
259
392
  // Replay only function_call items into the next turn's input.
260
393
  //
261
394
  // We send `store: false` to the Responses API, which means the server
@@ -287,26 +420,156 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
287
420
  });
288
421
  }
289
422
  }
290
- if (textThisTurn) {
291
- output = textThisTurn;
423
+ // --- Watchdog checks after tokens are updated -------------------
424
+ const watchdogStatus = checkWatchdogThreshold(inputTokens, softLimit);
425
+ if (watchdogStatus !== 'ok') {
426
+ logWatchdogEvent(watchdogStatus, {
427
+ provider: 'codex',
428
+ model: providerConfig.model,
429
+ turn: turns,
430
+ inputTokens,
431
+ softLimit,
432
+ scratchpadChars: scratchpad.toString().length,
433
+ });
292
434
  }
293
- else if (toolCalls.length === 0) {
294
- output = `[codex returned no text items streamed: ${itemTypesSeen.join(', ') || '(none)'}]`;
435
+ if (watchdogStatus === 'force_salvage') {
436
+ // `watchdog_force_salvage` is not an injected message — no
437
+ // re-prompt is sent — but observers still want to see exactly
438
+ // why the run is being killed. Emit with contentLengthChars: 0
439
+ // to reflect the "nothing was injected, we just terminated"
440
+ // semantics (mirrors openai-runner and claude-runner).
441
+ emit({
442
+ kind: 'injection',
443
+ injectionType: 'watchdog_force_salvage',
444
+ turn: turns,
445
+ contentLengthChars: 0,
446
+ });
447
+ const salvaged = buildCodexForceSalvageResult({
448
+ tracker,
449
+ scratchpad,
450
+ providerConfig,
451
+ inputTokens,
452
+ outputTokens,
453
+ turns,
454
+ softLimit,
455
+ });
456
+ emit({ kind: 'done', status: salvaged.status });
457
+ return salvaged;
458
+ }
459
+ // Warning-band nudge: fire at most once per distinct input-token
460
+ // high-watermark. Pushed as a user message so the next turn of
461
+ // the codex loop addresses the budget-pressure prompt. We use
462
+ // the shared prevention helper (NOT an inline string) so every
463
+ // runner emits byte-identical wording.
464
+ if (watchdogStatus === 'warning' && inputTokens > lastWarnedInputTokens) {
465
+ lastWarnedInputTokens = inputTokens;
466
+ const warning = buildBudgetPressureNudge({ inputTokens, softLimit });
467
+ input.push({
468
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
469
+ role: 'user',
470
+ content: warning,
471
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
472
+ });
473
+ emit({
474
+ kind: 'injection',
475
+ injectionType: 'watchdog_warning',
476
+ turn: turns,
477
+ contentLengthChars: warning.length,
478
+ });
295
479
  }
296
- // If the model made no tool calls, it's done
480
+ // --- Periodic re-grounding inside the loop ---------------------
481
+ if (turns > 0 && turns % RE_GROUNDING_INTERVAL_TURNS === 0) {
482
+ const reground = buildReGroundingMessage({
483
+ originalPromptExcerpt: prompt,
484
+ currentTurn: turns,
485
+ maxTurns,
486
+ toolCallsSoFar: tracker.getToolCalls().length,
487
+ filesReadSoFar: tracker.getReads().length,
488
+ });
489
+ input.push({
490
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
491
+ role: 'user',
492
+ content: reground,
493
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
494
+ });
495
+ emit({
496
+ kind: 'injection',
497
+ injectionType: 'reground',
498
+ turn: turns,
499
+ contentLengthChars: reground.length,
500
+ });
501
+ }
502
+ // --- turn_complete: one event per while-iteration. Fires after the
503
+ // watchdog + re-grounding checks have run (so cumulative token
504
+ // counts and any injection events are already on the wire) and
505
+ // BEFORE the supervision branching / tool-execution loop. Every
506
+ // continue/return in the branches below happens AFTER this event,
507
+ // so the sequence "turn_start ... text_emission ... turn_complete"
508
+ // is guaranteed per iteration.
509
+ emit({
510
+ kind: 'turn_complete',
511
+ turn: turns,
512
+ cumulativeInputTokens: inputTokens,
513
+ cumulativeOutputTokens: outputTokens,
514
+ });
515
+ // If the model made no tool calls, the turn ended with either a
516
+ // final answer or a degenerate emission. Wrap in the supervision
517
+ // state machine: valid text is an immediate ok-exit; degenerate
518
+ // either re-prompts (and continues the loop) or — if the retry
519
+ // budget is spent / same-output early-out fires — exits as
520
+ // incomplete with scratchpad salvage.
297
521
  if (toolCalls.length === 0) {
298
- return {
299
- output,
300
- status: 'ok',
301
- usage: {
522
+ const stripped = textThisTurn; // codex does not emit <think> tags
523
+ const validation = validateCompletion(stripped);
524
+ if (validation.valid) {
525
+ const ok = buildCodexOkResult({
526
+ tracker,
527
+ scratchpad,
528
+ providerConfig,
302
529
  inputTokens,
303
530
  outputTokens,
304
- totalTokens: inputTokens + outputTokens,
305
- costUSD: null,
306
- },
307
- turns,
308
- files: tracker.getFiles(),
309
- };
531
+ turns,
532
+ output: stripped,
533
+ });
534
+ emit({ kind: 'done', status: ok.status });
535
+ return ok;
536
+ }
537
+ // Same-output early-out: only compare when we have a previous
538
+ // degenerate output. First-turn degeneracy must still get
539
+ // retries — see openai-runner regression #5.
540
+ if ((lastDegenerateOutput !== null &&
541
+ sameDegenerateOutput(stripped, lastDegenerateOutput)) ||
542
+ supervisionRetries >= MAX_SUPERVISION_RETRIES) {
543
+ const exhausted = buildCodexIncompleteResult({
544
+ tracker,
545
+ scratchpad,
546
+ providerConfig,
547
+ inputTokens,
548
+ outputTokens,
549
+ turns,
550
+ });
551
+ emit({ kind: 'done', status: exhausted.status });
552
+ return exhausted;
553
+ }
554
+ // Inject the re-prompt as the next user input and continue
555
+ // the loop. The next turn of the codex backend will respond
556
+ // to the re-prompt directly.
557
+ lastDegenerateOutput = stripped;
558
+ supervisionRetries++;
559
+ const rePrompt = buildRePrompt(validation);
560
+ input.push({
561
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
562
+ role: 'user',
563
+ content: rePrompt,
564
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
565
+ });
566
+ emit({
567
+ kind: 'injection',
568
+ injectionType: injectionTypeFor(validation.kind),
569
+ turn: turns,
570
+ contentLengthChars: rePrompt.length,
571
+ });
572
+ continue;
310
573
  }
311
574
  // Execute tool calls and feed outputs back
312
575
  for (const call of toolCalls) {
@@ -331,19 +594,19 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
331
594
  });
332
595
  }
333
596
  }
334
- // Max turns exhausted
335
- return {
336
- output: output || `Agent exceeded max turns (${maxTurns}).`,
337
- status: 'max_turns',
338
- usage: {
339
- inputTokens,
340
- outputTokens,
341
- totalTokens: inputTokens + outputTokens,
342
- costUSD: null,
343
- },
597
+ // Max turns exhausted — salvage any buffered text.
598
+ const maxTurnsResult = buildCodexMaxTurnsResult({
599
+ tracker,
600
+ scratchpad,
601
+ providerConfig,
602
+ inputTokens,
603
+ outputTokens,
344
604
  turns,
345
- files: tracker.getFiles(),
346
- };
605
+ maxTurns,
606
+ lastOutput: output,
607
+ });
608
+ emit({ kind: 'done', status: maxTurnsResult.status });
609
+ return maxTurnsResult;
347
610
  }
348
611
  catch (err) {
349
612
  // OpenAI SDK's APIError carries status/body/headers — surface them
@@ -374,30 +637,192 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
374
637
  }
375
638
  if (e?.requestID)
376
639
  pieces.push(`req_id=${e.requestID}`);
377
- if (lastResponseStatus)
640
+ // Only include `last response status` when it was captured on the
641
+ // CURRENT (failing) turn — otherwise it belongs to a previous,
642
+ // separate request and appending it is actively misleading. See the
643
+ // abort-path investigation comment at the top of `run()`.
644
+ if (lastResponseStatus && lastResponseStatusTurn === turns) {
378
645
  pieces.push(`last response status: ${lastResponseStatus}`);
646
+ }
647
+ else if (lastResponseStatus && lastResponseStatusTurn !== turns) {
648
+ pieces.push(`note: a previous request (turn ${lastResponseStatusTurn}) completed with status ` +
649
+ `"${lastResponseStatus}" — it is unrelated to this failure`);
650
+ }
379
651
  const detailed = pieces.join(' | ') || String(err);
652
+ // Classify the thrown error into a finer-grained RunStatus. Task 7
653
+ // introduces api_aborted / api_error / network_error alongside the
654
+ // catch-all 'error' status. The turn-scoped `lastResponseStatus`
655
+ // disambiguation above is ORTHOGONAL to this classification: the
656
+ // `detailed` message is still the rich operator-facing diagnostic,
657
+ // and `classifyError` only decides which RunStatus bucket the
658
+ // failure lands in.
659
+ const { status } = classifyError(err);
660
+ // Salvage: if the scratchpad has buffered text from earlier turns,
661
+ // return it as the output. Pre-Task-5 behavior returned only the
662
+ // error string, losing 30k+ tokens of work on abort.
663
+ emit({ kind: 'done', status });
664
+ const hasSalvage = !scratchpad.isEmpty();
380
665
  return {
381
- output: `Sub-agent error: ${detailed}`,
382
- status: 'error',
666
+ output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${detailed}`,
667
+ status,
383
668
  usage: {
384
669
  inputTokens,
385
670
  outputTokens,
386
671
  totalTokens: inputTokens + outputTokens,
387
- costUSD: null,
672
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
388
673
  },
389
674
  turns,
390
- files: tracker.getFiles(),
675
+ filesRead: tracker.getReads(),
676
+ filesWritten: tracker.getWrites(),
677
+ toolCalls: tracker.getToolCalls(),
678
+ outputIsDiagnostic: !hasSalvage,
679
+ escalationLog: [],
391
680
  error: detailed,
392
681
  };
393
682
  }
394
683
  };
395
- return withTimeout(run(), timeoutMs, () => ({
396
- output: `Agent timed out after ${timeoutMs}ms.`,
397
- status: 'timeout',
398
- files: tracker.getFiles(),
399
- usage: { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens, costUSD: null },
684
+ return withTimeout(run(), timeoutMs, () => {
685
+ emit({ kind: 'done', status: 'timeout' });
686
+ const hasSalvage = !scratchpad.isEmpty();
687
+ return {
688
+ // Preserve any text the scratchpad buffered before the timeout fired.
689
+ // Partial usage is read from the running accumulators hoisted above —
690
+ // hardcoded zeros would discard every token counted on partial turns.
691
+ output: hasSalvage ? scratchpad.latest() : `Agent timed out after ${timeoutMs}ms.`,
692
+ status: 'timeout',
693
+ filesRead: tracker.getReads(),
694
+ filesWritten: tracker.getWrites(),
695
+ toolCalls: tracker.getToolCalls(),
696
+ usage: {
697
+ inputTokens,
698
+ outputTokens,
699
+ totalTokens: inputTokens + outputTokens,
700
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
701
+ },
702
+ turns,
703
+ outputIsDiagnostic: !hasSalvage,
704
+ escalationLog: [],
705
+ };
706
+ }, abortController);
707
+ }
708
+ function buildCodexOkResult(args) {
709
+ const { tracker, providerConfig, inputTokens, outputTokens, turns, output } = args;
710
+ return {
711
+ output,
712
+ status: 'ok',
713
+ usage: {
714
+ inputTokens,
715
+ outputTokens,
716
+ totalTokens: inputTokens + outputTokens,
717
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
718
+ },
719
+ turns,
720
+ filesRead: tracker.getReads(),
721
+ filesWritten: tracker.getWrites(),
722
+ toolCalls: tracker.getToolCalls(),
723
+ // `ok` always carries a real model answer — never a diagnostic.
724
+ outputIsDiagnostic: false,
725
+ escalationLog: [],
726
+ };
727
+ }
728
+ /**
729
+ * Supervision-exhausted path: retry cap hit or same-output early-out. Prefer
730
+ * scratchpad salvage; fall back to the incomplete diagnostic.
731
+ */
732
+ function buildCodexIncompleteResult(args) {
733
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns } = args;
734
+ const filesRead = tracker.getReads();
735
+ const filesWritten = tracker.getWrites();
736
+ const hasSalvage = !scratchpad.isEmpty();
737
+ return {
738
+ output: hasSalvage
739
+ ? scratchpad.latest()
740
+ : buildCodexIncompleteDiagnostic({
741
+ turns,
742
+ inputTokens,
743
+ outputTokens,
744
+ filesRead,
745
+ filesWritten,
746
+ }),
747
+ status: 'incomplete',
748
+ usage: {
749
+ inputTokens,
750
+ outputTokens,
751
+ totalTokens: inputTokens + outputTokens,
752
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
753
+ },
754
+ turns,
755
+ filesRead,
756
+ filesWritten,
757
+ toolCalls: tracker.getToolCalls(),
758
+ outputIsDiagnostic: !hasSalvage,
759
+ escalationLog: [],
760
+ };
761
+ }
762
+ function buildCodexForceSalvageResult(args) {
763
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, softLimit } = args;
764
+ const hasSalvage = !scratchpad.isEmpty();
765
+ return {
766
+ output: hasSalvage
767
+ ? scratchpad.latest()
768
+ : `[codex sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`,
769
+ status: 'incomplete',
770
+ usage: {
771
+ inputTokens,
772
+ outputTokens,
773
+ totalTokens: inputTokens + outputTokens,
774
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
775
+ },
400
776
  turns,
401
- }), abortController);
777
+ filesRead: tracker.getReads(),
778
+ filesWritten: tracker.getWrites(),
779
+ toolCalls: tracker.getToolCalls(),
780
+ outputIsDiagnostic: !hasSalvage,
781
+ escalationLog: [],
782
+ };
783
+ }
784
+ function buildCodexMaxTurnsResult(args) {
785
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, maxTurns, lastOutput } = args;
786
+ const hasSalvage = !scratchpad.isEmpty();
787
+ // Note: `lastOutput` here is the model's final text for the max-turns
788
+ // boundary — real model content, not a diagnostic template. Only the
789
+ // `Agent exceeded max turns…` fallback (empty scratchpad AND empty
790
+ // lastOutput) is a diagnostic.
791
+ const output = hasSalvage
792
+ ? scratchpad.latest()
793
+ : (lastOutput || `Agent exceeded max turns (${maxTurns}).`);
794
+ const outputIsDiagnostic = !hasSalvage && !lastOutput;
795
+ return {
796
+ output,
797
+ status: 'max_turns',
798
+ usage: {
799
+ inputTokens,
800
+ outputTokens,
801
+ totalTokens: inputTokens + outputTokens,
802
+ costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
803
+ },
804
+ turns,
805
+ filesRead: tracker.getReads(),
806
+ filesWritten: tracker.getWrites(),
807
+ toolCalls: tracker.getToolCalls(),
808
+ outputIsDiagnostic,
809
+ escalationLog: [],
810
+ };
811
+ }
812
+ function buildCodexIncompleteDiagnostic(opts) {
813
+ return [
814
+ '[codex sub-agent terminated without producing a final answer]',
815
+ '',
816
+ 'The model emitted no tool calls and no usable text on its final turn, and',
817
+ 'supervision re-prompts did not recover a valid response.',
818
+ '',
819
+ `Turns used: ${opts.turns}`,
820
+ `Input tokens: ${opts.inputTokens}`,
821
+ `Output tokens: ${opts.outputTokens}`,
822
+ `Files read: ${opts.filesRead.length}`,
823
+ `Files written: ${opts.filesWritten.length}`,
824
+ '',
825
+ 'Recommended action: re-dispatch with a tighter brief, or escalate provider tier.',
826
+ ].join('\n');
402
827
  }
403
828
  //# sourceMappingURL=codex-runner.js.map