@zhixuan92/multi-model-agent-core 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +0 -6
  2. package/dist/config/schema.d.ts +73 -45
  3. package/dist/config/schema.d.ts.map +1 -1
  4. package/dist/config/schema.js +14 -0
  5. package/dist/config/schema.js.map +1 -1
  6. package/dist/context/context-block-store.d.ts +75 -0
  7. package/dist/context/context-block-store.d.ts.map +1 -0
  8. package/dist/context/context-block-store.js +82 -0
  9. package/dist/context/context-block-store.js.map +1 -0
  10. package/dist/context/expand-context-blocks.d.ts +20 -0
  11. package/dist/context/expand-context-blocks.d.ts.map +1 -0
  12. package/dist/context/expand-context-blocks.js +46 -0
  13. package/dist/context/expand-context-blocks.js.map +1 -0
  14. package/dist/delegate-with-escalation.d.ts +34 -0
  15. package/dist/delegate-with-escalation.d.ts.map +1 -0
  16. package/dist/delegate-with-escalation.js +172 -0
  17. package/dist/delegate-with-escalation.js.map +1 -0
  18. package/dist/index.d.ts +4 -1
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +3 -0
  21. package/dist/index.js.map +1 -1
  22. package/dist/model-profiles.json +42 -4
  23. package/dist/provider.d.ts.map +1 -1
  24. package/dist/provider.js +7 -1
  25. package/dist/provider.js.map +1 -1
  26. package/dist/routing/model-profiles.d.ts +9 -4
  27. package/dist/routing/model-profiles.d.ts.map +1 -1
  28. package/dist/routing/model-profiles.js +8 -0
  29. package/dist/routing/model-profiles.js.map +1 -1
  30. package/dist/run-tasks.d.ts +26 -2
  31. package/dist/run-tasks.d.ts.map +1 -1
  32. package/dist/run-tasks.js +61 -19
  33. package/dist/run-tasks.js.map +1 -1
  34. package/dist/runners/claude-runner.d.ts.map +1 -1
  35. package/dist/runners/claude-runner.js +721 -32
  36. package/dist/runners/claude-runner.js.map +1 -1
  37. package/dist/runners/codex-runner.d.ts.map +1 -1
  38. package/dist/runners/codex-runner.js +541 -48
  39. package/dist/runners/codex-runner.js.map +1 -1
  40. package/dist/runners/error-classification.d.ts +30 -0
  41. package/dist/runners/error-classification.d.ts.map +1 -0
  42. package/dist/runners/error-classification.js +72 -0
  43. package/dist/runners/error-classification.js.map +1 -0
  44. package/dist/runners/injection-type.d.ts +22 -0
  45. package/dist/runners/injection-type.d.ts.map +1 -0
  46. package/dist/runners/injection-type.js +34 -0
  47. package/dist/runners/injection-type.js.map +1 -0
  48. package/dist/runners/openai-runner.d.ts +5 -0
  49. package/dist/runners/openai-runner.d.ts.map +1 -1
  50. package/dist/runners/openai-runner.js +608 -36
  51. package/dist/runners/openai-runner.js.map +1 -1
  52. package/dist/runners/prevention.d.ts +41 -0
  53. package/dist/runners/prevention.d.ts.map +1 -0
  54. package/dist/runners/prevention.js +68 -0
  55. package/dist/runners/prevention.js.map +1 -0
  56. package/dist/runners/supervision.d.ts +137 -0
  57. package/dist/runners/supervision.d.ts.map +1 -0
  58. package/dist/runners/supervision.js +345 -0
  59. package/dist/runners/supervision.js.map +1 -0
  60. package/dist/tools/claude-adapter.d.ts.map +1 -1
  61. package/dist/tools/claude-adapter.js +6 -3
  62. package/dist/tools/claude-adapter.js.map +1 -1
  63. package/dist/tools/definitions.d.ts +3 -1
  64. package/dist/tools/definitions.d.ts.map +1 -1
  65. package/dist/tools/definitions.js +57 -5
  66. package/dist/tools/definitions.js.map +1 -1
  67. package/dist/tools/openai-adapter.d.ts.map +1 -1
  68. package/dist/tools/openai-adapter.js +6 -3
  69. package/dist/tools/openai-adapter.js.map +1 -1
  70. package/dist/tools/scratchpad.d.ts +28 -0
  71. package/dist/tools/scratchpad.d.ts.map +1 -0
  72. package/dist/tools/scratchpad.js +49 -0
  73. package/dist/tools/scratchpad.js.map +1 -0
  74. package/dist/tools/tracker.d.ts +42 -2
  75. package/dist/tools/tracker.d.ts.map +1 -1
  76. package/dist/tools/tracker.js +63 -5
  77. package/dist/tools/tracker.js.map +1 -1
  78. package/dist/types.d.ts +261 -2
  79. package/dist/types.d.ts.map +1 -1
  80. package/dist/types.js +43 -1
  81. package/dist/types.js.map +1 -1
  82. package/package.json +7 -3
@@ -1,9 +1,16 @@
1
1
  import OpenAI from 'openai';
2
2
  import { z } from 'zod';
3
+ import { createHash } from 'node:crypto';
3
4
  import { getCodexAuth } from '../auth/codex-oauth.js';
4
- import { withTimeout } from '../types.js';
5
+ import { withTimeout, computeCostUSD, computeSavedCostUSD, } from '../types.js';
5
6
  import { FileTracker } from '../tools/tracker.js';
6
7
  import { createToolImplementations } from '../tools/definitions.js';
8
+ import { TextScratchpad } from '../tools/scratchpad.js';
9
+ import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
10
+ import { validateCompletion, validateCoverage, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, trimProgressTrace, } from './supervision.js';
11
+ import { injectionTypeFor } from './injection-type.js';
12
+ import { classifyError } from './error-classification.js';
13
+ import { findModelProfile } from '../routing/model-profiles.js';
7
14
  // CODEX_DEBUG=1 causes the runner to log raw HTTP request/response bodies to
8
15
  // stderr. Those bodies routinely include the user's prompt, file contents,
9
16
  // tool arguments, and other sensitive data — fine for local debugging,
@@ -16,6 +23,11 @@ if (process.env.CODEX_DEBUG === '1') {
16
23
  'bodies (including prompts and file contents) will be logged to stderr. ' +
17
24
  'Disable in any environment where logs may be retained or shared.');
18
25
  }
26
+ /**
27
+ * Hard cap on supervision re-prompts before we give up and salvage. Three is
28
+ * the value chosen in the spec (A.2.2); mirrors openai-runner and claude-runner.
29
+ */
30
+ const MAX_SUPERVISION_RETRIES = 3;
19
31
  export function createCodexClient(capture) {
20
32
  const debug = process.env.CODEX_DEBUG === '1';
21
33
  // A custom fetch that tees error-response bodies into `capture`.
@@ -146,7 +158,37 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
146
158
  const sandboxPolicy = options.sandboxPolicy ?? providerConfig.sandboxPolicy ?? 'cwd-only';
147
159
  const effort = options.effort ?? providerConfig.effort;
148
160
  const abortController = new AbortController();
149
- const tracker = new FileTracker();
161
+ // --- Progress event emission (Task 11) ----------------------------------
162
+ //
163
+ // `onProgress` is already wrapped in `safeSink` by the orchestrator
164
+ // (Task 8), so any throw from the consumer callback is swallowed
165
+ // upstream and cannot corrupt this loop. We do not need to wrap it
166
+ // again here.
167
+ const onProgress = options.onProgress;
168
+ const shouldCaptureTrace = options.includeProgressTrace ?? false;
169
+ const traceBuffer = [];
170
+ const emit = (event) => {
171
+ if (shouldCaptureTrace)
172
+ traceBuffer.push(event);
173
+ if (onProgress)
174
+ onProgress(event);
175
+ };
176
+ // Accumulated state (hoisted so the timeout callback can read partial
177
+ // progress, AND so the FileTracker callback closure — constructed below
178
+ // — can read the running turn count at firing time).
179
+ //
180
+ // Turn attribution for tool calls: in codex-runner, tool calls fire in
181
+ // the tool-execution loop AFTER the model's stream for that turn has
182
+ // completed but BEFORE the next iteration of `while` starts. The `turns`
183
+ // variable already reflects the current turn at that point (it was
184
+ // incremented at the top of the iteration), so the callback can read it
185
+ // directly — no +1 offset.
186
+ let inputTokens = 0;
187
+ let outputTokens = 0;
188
+ let turns = 0;
189
+ const tracker = new FileTracker((summary) => {
190
+ emit({ kind: 'tool_call', turn: turns, toolSummary: summary });
191
+ });
150
192
  const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
151
193
  const codexTools = toolMode === 'full' ? buildCodexTools(toolImpls, sandboxPolicy) : [];
152
194
  const toolsByName = new Map(codexTools.map(t => [t.name, t]));
@@ -167,29 +209,111 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
167
209
  ? configuredHostedTools.map(t => ({ type: t }))
168
210
  : [];
169
211
  const allTools = [...responsesTools, ...hostedTools];
170
- // Accumulated state (hoisted so the timeout callback can read partial progress)
171
- let inputTokens = 0;
172
- let outputTokens = 0;
173
- let turns = 0;
212
+ // --- Prevention layer: system prompt + budget hint ---
213
+ //
214
+ // buildSystemPrompt() is deliberately static and parameter-free (same
215
+ // decision as openai-runner and claude-runner: Task 1 review rejected
216
+ // provider/maxTurns options). The budget hint is prepended to the user
217
+ // prompt so the model sees it as part of its task brief, while the system
218
+ // prompt is threaded through the Responses API `instructions` field.
219
+ const systemPrompt = buildSystemPrompt();
220
+ const budgetHint = buildBudgetHint({ maxTurns });
221
+ const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
222
+ // --- onInitialRequest (Task 12) ----------------------------------------
223
+ //
224
+ // Fire once per attempt with the canonical orchestrator-side initial
225
+ // brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
226
+ // literal request body the OpenAI Responses API transmits — codex
227
+ // sends the systemPrompt via the Responses API `instructions` field
228
+ // and the user prompt as a structured `input` message array. We hash
229
+ // the canonical form instead so the hash is cross-runner stable:
230
+ // openai-runner and claude-runner compute the same hash from the same
231
+ // canonical string. See `AttemptRecord.initialPromptHash` in types.ts
232
+ // for the full wire-level caveat.
233
+ if (options.onInitialRequest) {
234
+ const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
235
+ try {
236
+ options.onInitialRequest({
237
+ lengthChars: canonicalInitialBrief.length,
238
+ sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
239
+ });
240
+ }
241
+ catch {
242
+ // Swallow — a broken callback must not affect dispatch.
243
+ }
244
+ }
245
+ // --- Scratchpad: buffers every text emission the codex backend streams
246
+ // through our loop. Every termination path (ok / incomplete / max_turns /
247
+ // error / timeout / force_salvage) salvages `scratchpad.latest()` when
248
+ // the final message is empty or degenerate. ---
249
+ const scratchpad = new TextScratchpad();
250
+ // --- Watchdog: resolve the input-token soft limit once per run ---
251
+ const profile = findModelProfile(providerConfig.model);
252
+ const softLimit = resolveInputTokenSoftLimit(providerConfig, profile);
253
+ // --- Task timing + parent model (Task 9) --------------------------------
254
+ const taskStartMs = Date.now();
255
+ const parentModel = options.parentModel;
174
256
  const run = async () => {
175
257
  const capture = {};
176
258
  const client = createCodexClient(capture);
177
259
  const input = [
178
260
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
179
- { role: 'user', content: prompt },
261
+ { role: 'user', content: promptWithBudgetHint },
180
262
  ];
181
263
  let output = '';
264
+ // --- Abort-path investigation (plan Step 2) ---------------------------
265
+ //
266
+ // The 2026-04-10 Fate dispatch captured an error "Request was aborted |
267
+ // last response status: completed". The "completed" suffix was
268
+ // misleading: it was captured from a PREVIOUS successful turn, not the
269
+ // failed one. Mechanism:
270
+ //
271
+ // 1. Turn N's stream emits `response.completed` with status
272
+ // `'completed'`. We update `lastResponseStatus = 'completed'`.
273
+ // 2. Turn N+1 starts; `client.responses.create(...)` opens a new
274
+ // stream, but the abort signal fires before any
275
+ // `response.completed` event is received.
276
+ // 3. The thrown error is caught below. The catch branch reads
277
+ // `lastResponseStatus` — which is STILL `'completed'` from turn N
278
+ // — and appends it as "last response status: completed", making
279
+ // the error look like it originated from a successful response.
280
+ //
281
+ // Fix: track which turn the status was captured on. If the status was
282
+ // NOT captured on the current (failed) turn, replace the suffix with a
283
+ // note that explicitly attributes the status to the earlier, already-
284
+ // concluded turn. Users saw the misleading suffix and wasted time
285
+ // debugging a phantom "the request completed but was aborted" condition
286
+ // that doesn't exist.
182
287
  let lastResponseStatus = null;
288
+ let lastResponseStatusTurn = null;
289
+ // --- Supervision / watchdog bookkeeping ---
290
+ let supervisionRetries = 0;
291
+ // Initialised to `null` (NOT ''): on the first turn there is no
292
+ // previous degenerate output to compare against, so the same-output
293
+ // early-out must be skipped. Initialising to '' would cause
294
+ // sameDegenerateOutput('', '') to fire on a first-turn empty output
295
+ // and break the loop before any retries run. See openai-runner
296
+ // regression #5.
297
+ let lastDegenerateOutput = null;
298
+ // High-watermark guard for the watchdog warning nudge — fire at most
299
+ // once per distinct input-token level. Mirrors openai-runner and
300
+ // claude-runner.
301
+ let lastWarnedInputTokens = -1;
183
302
  try {
184
303
  while (turns < maxTurns) {
185
304
  turns++;
305
+ // Emit turn_start AFTER incrementing so `turn` matches the 1-indexed
306
+ // turn number we use everywhere else in this runner (the scratchpad
307
+ // append, watchdog logs, error diagnostics, result.turns).
308
+ emit({ kind: 'turn_start', turn: turns, provider: 'codex' });
186
309
  // Codex backend requires streaming. The Codex backend's
187
310
  // `response.completed` event does NOT populate `response.output` —
188
311
  // we must accumulate content from individual stream events.
189
- // `instructions` is required (mirrors gumi-agent's proven shape).
312
+ // `instructions` carries the prevention-layer system prompt; the
313
+ // per-run budget hint is already prepended to the first user input.
190
314
  const stream = await client.responses.create({
191
315
  model: providerConfig.model,
192
- instructions: prompt,
316
+ instructions: systemPrompt,
193
317
  input,
194
318
  stream: true,
195
319
  store: false,
@@ -241,8 +365,10 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
241
365
  inputTokens += r.usage.input_tokens ?? 0;
242
366
  outputTokens += r.usage.output_tokens ?? 0;
243
367
  }
244
- if (r?.status)
368
+ if (r?.status) {
245
369
  lastResponseStatus = r.status;
370
+ lastResponseStatusTurn = turns;
371
+ }
246
372
  }
247
373
  }
248
374
  if (process.env.CODEX_DEBUG === '1') {
@@ -256,6 +382,20 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
256
382
  if (!sawCompleted) {
257
383
  throw new Error('Codex stream ended without a response.completed event');
258
384
  }
385
+ // Buffer this turn's text into the scratchpad BEFORE any exit so
386
+ // every termination path (including supervision exhaustion and
387
+ // force_salvage) can salvage it. Codex does not emit <think> tags
388
+ // by default, so there is no stripping step here.
389
+ if (textThisTurn) {
390
+ scratchpad.append(turns, textThisTurn);
391
+ emit({
392
+ kind: 'text_emission',
393
+ turn: turns,
394
+ chars: textThisTurn.length,
395
+ preview: textThisTurn.slice(0, 200),
396
+ });
397
+ output = textThisTurn;
398
+ }
259
399
  // Replay only function_call items into the next turn's input.
260
400
  //
261
401
  // We send `store: false` to the Responses API, which means the server
@@ -287,26 +427,175 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
287
427
  });
288
428
  }
289
429
  }
290
- if (textThisTurn) {
291
- output = textThisTurn;
430
+ // --- Watchdog checks after tokens are updated -------------------
431
+ const watchdogStatus = checkWatchdogThreshold(inputTokens, softLimit);
432
+ if (watchdogStatus !== 'ok') {
433
+ logWatchdogEvent(watchdogStatus, {
434
+ provider: 'codex',
435
+ model: providerConfig.model,
436
+ turn: turns,
437
+ inputTokens,
438
+ softLimit,
439
+ scratchpadChars: scratchpad.toString().length,
440
+ });
292
441
  }
293
- else if (toolCalls.length === 0) {
294
- output = `[codex returned no text items streamed: ${itemTypesSeen.join(', ') || '(none)'}]`;
442
+ if (watchdogStatus === 'force_salvage') {
443
+ // `watchdog_force_salvage` is not an injected message — no
444
+ // re-prompt is sent — but observers still want to see exactly
445
+ // why the run is being killed. Emit with contentLengthChars: 0
446
+ // to reflect the "nothing was injected, we just terminated"
447
+ // semantics (mirrors openai-runner and claude-runner).
448
+ emit({
449
+ kind: 'injection',
450
+ injectionType: 'watchdog_force_salvage',
451
+ turn: turns,
452
+ contentLengthChars: 0,
453
+ });
454
+ const salvaged = buildCodexForceSalvageResult({
455
+ tracker,
456
+ scratchpad,
457
+ providerConfig,
458
+ inputTokens,
459
+ outputTokens,
460
+ turns,
461
+ softLimit,
462
+ durationMs: Date.now() - taskStartMs,
463
+ parentModel,
464
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
465
+ });
466
+ emit({ kind: 'done', status: salvaged.status });
467
+ return salvaged;
468
+ }
469
+ // Warning-band nudge: fire at most once per distinct input-token
470
+ // high-watermark. Pushed as a user message so the next turn of
471
+ // the codex loop addresses the budget-pressure prompt. We use
472
+ // the shared prevention helper (NOT an inline string) so every
473
+ // runner emits byte-identical wording.
474
+ if (watchdogStatus === 'warning' && inputTokens > lastWarnedInputTokens) {
475
+ lastWarnedInputTokens = inputTokens;
476
+ const warning = buildBudgetPressureNudge({ inputTokens, softLimit });
477
+ input.push({
478
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
479
+ role: 'user',
480
+ content: warning,
481
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
482
+ });
483
+ emit({
484
+ kind: 'injection',
485
+ injectionType: 'watchdog_warning',
486
+ turn: turns,
487
+ contentLengthChars: warning.length,
488
+ });
295
489
  }
296
- // If the model made no tool calls, it's done
490
+ // --- Periodic re-grounding inside the loop ---------------------
491
+ if (turns > 0 && turns % RE_GROUNDING_INTERVAL_TURNS === 0) {
492
+ const reground = buildReGroundingMessage({
493
+ originalPromptExcerpt: prompt,
494
+ currentTurn: turns,
495
+ maxTurns,
496
+ toolCallsSoFar: tracker.getToolCalls().length,
497
+ filesReadSoFar: tracker.getReads().length,
498
+ });
499
+ input.push({
500
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
501
+ role: 'user',
502
+ content: reground,
503
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
504
+ });
505
+ emit({
506
+ kind: 'injection',
507
+ injectionType: 'reground',
508
+ turn: turns,
509
+ contentLengthChars: reground.length,
510
+ });
511
+ }
512
+ // --- turn_complete: one event per while-iteration. Fires after the
513
+ // watchdog + re-grounding checks have run (so cumulative token
514
+ // counts and any injection events are already on the wire) and
515
+ // BEFORE the supervision branching / tool-execution loop. Every
516
+ // continue/return in the branches below happens AFTER this event,
517
+ // so the sequence "turn_start ... text_emission ... turn_complete"
518
+ // is guaranteed per iteration.
519
+ emit({
520
+ kind: 'turn_complete',
521
+ turn: turns,
522
+ cumulativeInputTokens: inputTokens,
523
+ cumulativeOutputTokens: outputTokens,
524
+ });
525
+ // If the model made no tool calls, the turn ended with either a
526
+ // final answer or a degenerate emission. Wrap in the supervision
527
+ // state machine: valid text is an immediate ok-exit; degenerate
528
+ // either re-prompts (and continues the loop) or — if the retry
529
+ // budget is spent / same-output early-out fires — exits as
530
+ // incomplete with scratchpad salvage.
297
531
  if (toolCalls.length === 0) {
298
- return {
299
- output,
300
- status: 'ok',
301
- usage: {
532
+ const stripped = textThisTurn; // codex does not emit <think> tags
533
+ const validation = validateCompletion(stripped);
534
+ // NEW: coverage check — only when syntactic validation passes
535
+ if (validation.valid && options.expectedCoverage) {
536
+ const coverageValidation = validateCoverage(stripped, options.expectedCoverage);
537
+ if (!coverageValidation.valid) {
538
+ validation.valid = false;
539
+ validation.kind = coverageValidation.kind;
540
+ validation.reason = coverageValidation.reason;
541
+ }
542
+ }
543
+ if (validation.valid) {
544
+ const ok = buildCodexOkResult({
545
+ tracker,
546
+ scratchpad,
547
+ providerConfig,
302
548
  inputTokens,
303
549
  outputTokens,
304
- totalTokens: inputTokens + outputTokens,
305
- costUSD: null,
306
- },
307
- turns,
308
- files: tracker.getFiles(),
309
- };
550
+ turns,
551
+ output: stripped,
552
+ durationMs: Date.now() - taskStartMs,
553
+ parentModel,
554
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
555
+ });
556
+ emit({ kind: 'done', status: ok.status });
557
+ return ok;
558
+ }
559
+ // Same-output early-out: only compare when we have a previous
560
+ // degenerate output. First-turn degeneracy must still get
561
+ // retries — see openai-runner regression #5.
562
+ if ((lastDegenerateOutput !== null &&
563
+ sameDegenerateOutput(stripped, lastDegenerateOutput)) ||
564
+ supervisionRetries >= MAX_SUPERVISION_RETRIES) {
565
+ const exhausted = buildCodexIncompleteResult({
566
+ tracker,
567
+ scratchpad,
568
+ providerConfig,
569
+ inputTokens,
570
+ outputTokens,
571
+ turns,
572
+ reason: `supervision loop exhausted after ${supervisionRetries} re-prompts (last kind: ${validation.kind ?? 'unknown'})`,
573
+ durationMs: Date.now() - taskStartMs,
574
+ parentModel,
575
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
576
+ });
577
+ emit({ kind: 'done', status: exhausted.status });
578
+ return exhausted;
579
+ }
580
+ // Inject the re-prompt as the next user input and continue
581
+ // the loop. The next turn of the codex backend will respond
582
+ // to the re-prompt directly.
583
+ lastDegenerateOutput = stripped;
584
+ supervisionRetries++;
585
+ const rePrompt = buildRePrompt(validation);
586
+ input.push({
587
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
588
+ role: 'user',
589
+ content: rePrompt,
590
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
591
+ });
592
+ emit({
593
+ kind: 'injection',
594
+ injectionType: injectionTypeFor(validation.kind),
595
+ turn: turns,
596
+ contentLengthChars: rePrompt.length,
597
+ });
598
+ continue;
310
599
  }
311
600
  // Execute tool calls and feed outputs back
312
601
  for (const call of toolCalls) {
@@ -331,19 +620,23 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
331
620
  });
332
621
  }
333
622
  }
334
- // Max turns exhausted
335
- return {
336
- output: output || `Agent exceeded max turns (${maxTurns}).`,
337
- status: 'max_turns',
338
- usage: {
339
- inputTokens,
340
- outputTokens,
341
- totalTokens: inputTokens + outputTokens,
342
- costUSD: null,
343
- },
623
+ // Max turns exhausted — salvage any buffered text.
624
+ const maxTurnsResult = buildCodexMaxTurnsResult({
625
+ tracker,
626
+ scratchpad,
627
+ providerConfig,
628
+ inputTokens,
629
+ outputTokens,
344
630
  turns,
345
- files: tracker.getFiles(),
346
- };
631
+ maxTurns,
632
+ lastOutput: output,
633
+ reason: `hand-rolled loop exited after completing ${turns} of ${maxTurns} user-declared turns without producing a clean final answer`,
634
+ durationMs: Date.now() - taskStartMs,
635
+ parentModel,
636
+ traceBuffer: shouldCaptureTrace ? traceBuffer : undefined,
637
+ });
638
+ emit({ kind: 'done', status: maxTurnsResult.status });
639
+ return maxTurnsResult;
347
640
  }
348
641
  catch (err) {
349
642
  // OpenAI SDK's APIError carries status/body/headers — surface them
@@ -374,30 +667,230 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
374
667
  }
375
668
  if (e?.requestID)
376
669
  pieces.push(`req_id=${e.requestID}`);
377
- if (lastResponseStatus)
670
+ // Only include `last response status` when it was captured on the
671
+ // CURRENT (failing) turn — otherwise it belongs to a previous,
672
+ // separate request and appending it is actively misleading. See the
673
+ // abort-path investigation comment at the top of `run()`.
674
+ if (lastResponseStatus && lastResponseStatusTurn === turns) {
378
675
  pieces.push(`last response status: ${lastResponseStatus}`);
676
+ }
677
+ else if (lastResponseStatus && lastResponseStatusTurn !== turns) {
678
+ pieces.push(`note: a previous request (turn ${lastResponseStatusTurn}) completed with status ` +
679
+ `"${lastResponseStatus}" — it is unrelated to this failure`);
680
+ }
379
681
  const detailed = pieces.join(' | ') || String(err);
682
+ // Classify the thrown error into a finer-grained RunStatus. Task 7
683
+ // introduces api_aborted / api_error / network_error alongside the
684
+ // catch-all 'error' status. The turn-scoped `lastResponseStatus`
685
+ // disambiguation above is ORTHOGONAL to this classification: the
686
+ // `detailed` message is still the rich operator-facing diagnostic,
687
+ // and `classifyError` only decides which RunStatus bucket the
688
+ // failure lands in.
689
+ const { status } = classifyError(err);
690
+ // Salvage: if the scratchpad has buffered text from earlier turns,
691
+ // return it as the output. Pre-Task-5 behavior returned only the
692
+ // error string, losing 30k+ tokens of work on abort.
693
+ emit({ kind: 'done', status });
694
+ const hasSalvage = !scratchpad.isEmpty();
695
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
696
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
380
697
  return {
381
- output: `Sub-agent error: ${detailed}`,
382
- status: 'error',
698
+ output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${detailed}`,
699
+ status,
383
700
  usage: {
384
701
  inputTokens,
385
702
  outputTokens,
386
703
  totalTokens: inputTokens + outputTokens,
387
- costUSD: null,
704
+ costUSD,
705
+ savedCostUSD,
388
706
  },
389
707
  turns,
390
- files: tracker.getFiles(),
708
+ filesRead: tracker.getReads(),
709
+ directoriesListed: tracker.getDirectoriesListed(),
710
+ filesWritten: tracker.getWrites(),
711
+ toolCalls: tracker.getToolCalls(),
712
+ outputIsDiagnostic: !hasSalvage,
713
+ escalationLog: [],
391
714
  error: detailed,
715
+ durationMs: Date.now() - taskStartMs,
716
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
392
717
  };
393
718
  }
394
719
  };
395
- return withTimeout(run(), timeoutMs, () => ({
396
- output: `Agent timed out after ${timeoutMs}ms.`,
397
- status: 'timeout',
398
- files: tracker.getFiles(),
399
- usage: { inputTokens, outputTokens, totalTokens: inputTokens + outputTokens, costUSD: null },
720
+ return withTimeout(run(), timeoutMs, () => {
721
+ emit({ kind: 'done', status: 'timeout' });
722
+ const hasSalvage = !scratchpad.isEmpty();
723
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
724
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
725
+ return {
726
+ // Preserve any text the scratchpad buffered before the timeout fired.
727
+ // Partial usage is read from the running accumulators hoisted above —
728
+ // hardcoded zeros would discard every token counted on partial turns.
729
+ output: hasSalvage ? scratchpad.latest() : `Agent timed out after ${timeoutMs}ms.`,
730
+ status: 'timeout',
731
+ filesRead: tracker.getReads(),
732
+ directoriesListed: tracker.getDirectoriesListed(),
733
+ filesWritten: tracker.getWrites(),
734
+ toolCalls: tracker.getToolCalls(),
735
+ usage: {
736
+ inputTokens,
737
+ outputTokens,
738
+ totalTokens: inputTokens + outputTokens,
739
+ costUSD,
740
+ savedCostUSD,
741
+ },
742
+ turns,
743
+ outputIsDiagnostic: !hasSalvage,
744
+ escalationLog: [],
745
+ durationMs: Date.now() - taskStartMs,
746
+ ...(shouldCaptureTrace && { progressTrace: trimProgressTrace(traceBuffer) }),
747
+ };
748
+ }, abortController);
749
+ }
750
+ function buildCodexOkResult(args) {
751
+ const { tracker, providerConfig, inputTokens, outputTokens, turns, output, durationMs, parentModel, traceBuffer } = args;
752
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
753
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
754
+ return {
755
+ output,
756
+ status: 'ok',
757
+ usage: {
758
+ inputTokens,
759
+ outputTokens,
760
+ totalTokens: inputTokens + outputTokens,
761
+ costUSD,
762
+ savedCostUSD,
763
+ },
764
+ turns,
765
+ filesRead: tracker.getReads(),
766
+ directoriesListed: tracker.getDirectoriesListed(),
767
+ filesWritten: tracker.getWrites(),
768
+ toolCalls: tracker.getToolCalls(),
769
+ // `ok` always carries a real model answer — never a diagnostic.
770
+ outputIsDiagnostic: false,
771
+ escalationLog: [],
772
+ durationMs,
773
+ ...(traceBuffer && { progressTrace: trimProgressTrace(traceBuffer) }),
774
+ };
775
+ }
776
+ /**
777
+ * Supervision-exhausted path: retry cap hit or same-output early-out. Prefer
778
+ * scratchpad salvage; fall back to the incomplete diagnostic.
779
+ */
780
+ function buildCodexIncompleteResult(args) {
781
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, reason, durationMs, parentModel, traceBuffer } = args;
782
+ const filesRead = tracker.getReads();
783
+ const filesWritten = tracker.getWrites();
784
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
785
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
786
+ const hasSalvage = !scratchpad.isEmpty();
787
+ return {
788
+ output: hasSalvage
789
+ ? scratchpad.latest()
790
+ : buildCodexIncompleteDiagnostic({
791
+ turns,
792
+ inputTokens,
793
+ outputTokens,
794
+ filesRead,
795
+ filesWritten,
796
+ }),
797
+ status: 'incomplete',
798
+ usage: {
799
+ inputTokens,
800
+ outputTokens,
801
+ totalTokens: inputTokens + outputTokens,
802
+ costUSD,
803
+ savedCostUSD,
804
+ },
400
805
  turns,
401
- }), abortController);
806
+ filesRead,
807
+ directoriesListed: tracker.getDirectoriesListed(),
808
+ filesWritten,
809
+ toolCalls: tracker.getToolCalls(),
810
+ outputIsDiagnostic: !hasSalvage,
811
+ escalationLog: [],
812
+ error: reason,
813
+ durationMs,
814
+ ...(traceBuffer && { progressTrace: trimProgressTrace(traceBuffer) }),
815
+ };
816
+ }
817
+ function buildCodexForceSalvageResult(args) {
818
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, softLimit, durationMs, parentModel, traceBuffer } = args;
819
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
820
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
821
+ const hasSalvage = !scratchpad.isEmpty();
822
+ return {
823
+ output: hasSalvage
824
+ ? scratchpad.latest()
825
+ : `[codex sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`,
826
+ status: 'incomplete',
827
+ usage: {
828
+ inputTokens,
829
+ outputTokens,
830
+ totalTokens: inputTokens + outputTokens,
831
+ costUSD,
832
+ savedCostUSD,
833
+ },
834
+ turns,
835
+ filesRead: tracker.getReads(),
836
+ directoriesListed: tracker.getDirectoriesListed(),
837
+ filesWritten: tracker.getWrites(),
838
+ toolCalls: tracker.getToolCalls(),
839
+ outputIsDiagnostic: !hasSalvage,
840
+ escalationLog: [],
841
+ durationMs,
842
+ ...(traceBuffer && { progressTrace: trimProgressTrace(traceBuffer) }),
843
+ };
844
+ }
845
+ function buildCodexMaxTurnsResult(args) {
846
+ const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, maxTurns, lastOutput, reason, durationMs, parentModel, traceBuffer } = args;
847
+ const hasSalvage = !scratchpad.isEmpty();
848
+ // Note: `lastOutput` here is the model's final text for the max-turns
849
+ // boundary — real model content, not a diagnostic template. Only the
850
+ // `Agent exceeded max turns…` fallback (empty scratchpad AND empty
851
+ // lastOutput) is a diagnostic.
852
+ const output = hasSalvage
853
+ ? scratchpad.latest()
854
+ : (lastOutput || `Agent exceeded max turns (${maxTurns}).`);
855
+ const outputIsDiagnostic = !hasSalvage && !lastOutput;
856
+ const costUSD = computeCostUSD(inputTokens, outputTokens, providerConfig);
857
+ const savedCostUSD = computeSavedCostUSD(costUSD, inputTokens, outputTokens, parentModel);
858
+ return {
859
+ output,
860
+ status: 'max_turns',
861
+ usage: {
862
+ inputTokens,
863
+ outputTokens,
864
+ totalTokens: inputTokens + outputTokens,
865
+ costUSD,
866
+ savedCostUSD,
867
+ },
868
+ turns,
869
+ filesRead: tracker.getReads(),
870
+ directoriesListed: tracker.getDirectoriesListed(),
871
+ filesWritten: tracker.getWrites(),
872
+ toolCalls: tracker.getToolCalls(),
873
+ outputIsDiagnostic,
874
+ escalationLog: [],
875
+ error: reason,
876
+ durationMs,
877
+ ...(traceBuffer && { progressTrace: trimProgressTrace(traceBuffer) }),
878
+ };
879
+ }
880
+ function buildCodexIncompleteDiagnostic(opts) {
881
+ return [
882
+ '[codex sub-agent terminated without producing a final answer]',
883
+ '',
884
+ 'The model emitted no tool calls and no usable text on its final turn, and',
885
+ 'supervision re-prompts did not recover a valid response.',
886
+ '',
887
+ `Turns used: ${opts.turns}`,
888
+ `Input tokens: ${opts.inputTokens}`,
889
+ `Output tokens: ${opts.outputTokens}`,
890
+ `Files read: ${opts.filesRead.length}`,
891
+ `Files written: ${opts.filesWritten.length}`,
892
+ '',
893
+ 'Recommended action: re-dispatch with a tighter brief, or escalate provider tier.',
894
+ ].join('\n');
402
895
  }
403
896
  //# sourceMappingURL=codex-runner.js.map