@zhixuan92/multi-model-agent-core 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -6
- package/dist/config/schema.d.ts +27 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +13 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/context/context-block-store.d.ts +75 -0
- package/dist/context/context-block-store.d.ts.map +1 -0
- package/dist/context/context-block-store.js +82 -0
- package/dist/context/context-block-store.js.map +1 -0
- package/dist/context/expand-context-blocks.d.ts +20 -0
- package/dist/context/expand-context-blocks.d.ts.map +1 -0
- package/dist/context/expand-context-blocks.js +46 -0
- package/dist/context/expand-context-blocks.js.map +1 -0
- package/dist/delegate-with-escalation.d.ts +34 -0
- package/dist/delegate-with-escalation.d.ts.map +1 -0
- package/dist/delegate-with-escalation.js +168 -0
- package/dist/delegate-with-escalation.js.map +1 -0
- package/dist/index.d.ts +4 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -1
- package/dist/model-profiles.json +8 -4
- package/dist/provider.d.ts.map +1 -1
- package/dist/provider.js +7 -1
- package/dist/provider.js.map +1 -1
- package/dist/routing/model-profiles.d.ts +1 -0
- package/dist/routing/model-profiles.d.ts.map +1 -1
- package/dist/routing/model-profiles.js +4 -0
- package/dist/routing/model-profiles.js.map +1 -1
- package/dist/run-tasks.d.ts +26 -2
- package/dist/run-tasks.d.ts.map +1 -1
- package/dist/run-tasks.js +61 -19
- package/dist/run-tasks.js.map +1 -1
- package/dist/runners/claude-runner.d.ts.map +1 -1
- package/dist/runners/claude-runner.js +643 -32
- package/dist/runners/claude-runner.js.map +1 -1
- package/dist/runners/codex-runner.d.ts.map +1 -1
- package/dist/runners/codex-runner.js +473 -48
- package/dist/runners/codex-runner.js.map +1 -1
- package/dist/runners/error-classification.d.ts +30 -0
- package/dist/runners/error-classification.d.ts.map +1 -0
- package/dist/runners/error-classification.js +72 -0
- package/dist/runners/error-classification.js.map +1 -0
- package/dist/runners/injection-type.d.ts +17 -0
- package/dist/runners/injection-type.d.ts.map +1 -0
- package/dist/runners/injection-type.js +27 -0
- package/dist/runners/injection-type.js.map +1 -0
- package/dist/runners/openai-runner.d.ts +5 -0
- package/dist/runners/openai-runner.d.ts.map +1 -1
- package/dist/runners/openai-runner.js +508 -36
- package/dist/runners/openai-runner.js.map +1 -1
- package/dist/runners/prevention.d.ts +41 -0
- package/dist/runners/prevention.d.ts.map +1 -0
- package/dist/runners/prevention.js +68 -0
- package/dist/runners/prevention.js.map +1 -0
- package/dist/runners/supervision.d.ts +130 -0
- package/dist/runners/supervision.d.ts.map +1 -0
- package/dist/runners/supervision.js +238 -0
- package/dist/runners/supervision.js.map +1 -0
- package/dist/tools/claude-adapter.d.ts.map +1 -1
- package/dist/tools/claude-adapter.js +6 -3
- package/dist/tools/claude-adapter.js.map +1 -1
- package/dist/tools/definitions.d.ts +3 -1
- package/dist/tools/definitions.d.ts.map +1 -1
- package/dist/tools/definitions.js +56 -5
- package/dist/tools/definitions.js.map +1 -1
- package/dist/tools/openai-adapter.d.ts.map +1 -1
- package/dist/tools/openai-adapter.js +6 -3
- package/dist/tools/openai-adapter.js.map +1 -1
- package/dist/tools/scratchpad.d.ts +28 -0
- package/dist/tools/scratchpad.d.ts.map +1 -0
- package/dist/tools/scratchpad.js +49 -0
- package/dist/tools/scratchpad.js.map +1 -0
- package/dist/tools/tracker.d.ts +38 -2
- package/dist/tools/tracker.d.ts.map +1 -1
- package/dist/tools/tracker.js +54 -5
- package/dist/tools/tracker.js.map +1 -1
- package/dist/types.d.ts +184 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +17 -1
- package/dist/types.js.map +1 -1
- package/package.json +9 -15
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
import OpenAI from 'openai';
|
|
2
2
|
import { z } from 'zod';
|
|
3
|
+
import { createHash } from 'node:crypto';
|
|
3
4
|
import { getCodexAuth } from '../auth/codex-oauth.js';
|
|
4
|
-
import { withTimeout } from '../types.js';
|
|
5
|
+
import { withTimeout, computeCostUSD, } from '../types.js';
|
|
5
6
|
import { FileTracker } from '../tools/tracker.js';
|
|
6
7
|
import { createToolImplementations } from '../tools/definitions.js';
|
|
8
|
+
import { TextScratchpad } from '../tools/scratchpad.js';
|
|
9
|
+
import { buildSystemPrompt, buildBudgetHint, buildReGroundingMessage, buildBudgetPressureNudge, RE_GROUNDING_INTERVAL_TURNS, } from './prevention.js';
|
|
10
|
+
import { validateCompletion, buildRePrompt, sameDegenerateOutput, resolveInputTokenSoftLimit, checkWatchdogThreshold, logWatchdogEvent, } from './supervision.js';
|
|
11
|
+
import { injectionTypeFor } from './injection-type.js';
|
|
12
|
+
import { classifyError } from './error-classification.js';
|
|
13
|
+
import { findModelProfile } from '../routing/model-profiles.js';
|
|
7
14
|
// CODEX_DEBUG=1 causes the runner to log raw HTTP request/response bodies to
|
|
8
15
|
// stderr. Those bodies routinely include the user's prompt, file contents,
|
|
9
16
|
// tool arguments, and other sensitive data — fine for local debugging,
|
|
@@ -16,6 +23,11 @@ if (process.env.CODEX_DEBUG === '1') {
|
|
|
16
23
|
'bodies (including prompts and file contents) will be logged to stderr. ' +
|
|
17
24
|
'Disable in any environment where logs may be retained or shared.');
|
|
18
25
|
}
|
|
26
|
+
/**
|
|
27
|
+
* Hard cap on supervision re-prompts before we give up and salvage. Three is
|
|
28
|
+
* the value chosen in the spec (A.2.2); mirrors openai-runner and claude-runner.
|
|
29
|
+
*/
|
|
30
|
+
const MAX_SUPERVISION_RETRIES = 3;
|
|
19
31
|
export function createCodexClient(capture) {
|
|
20
32
|
const debug = process.env.CODEX_DEBUG === '1';
|
|
21
33
|
// A custom fetch that tees error-response bodies into `capture`.
|
|
@@ -146,7 +158,33 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
146
158
|
const sandboxPolicy = options.sandboxPolicy ?? providerConfig.sandboxPolicy ?? 'cwd-only';
|
|
147
159
|
const effort = options.effort ?? providerConfig.effort;
|
|
148
160
|
const abortController = new AbortController();
|
|
149
|
-
|
|
161
|
+
// --- Progress event emission (Task 11) ----------------------------------
|
|
162
|
+
//
|
|
163
|
+
// `onProgress` is already wrapped in `safeSink` by the orchestrator
|
|
164
|
+
// (Task 8), so any throw from the consumer callback is swallowed
|
|
165
|
+
// upstream and cannot corrupt this loop. We do not need to wrap it
|
|
166
|
+
// again here.
|
|
167
|
+
const onProgress = options.onProgress;
|
|
168
|
+
const emit = (event) => {
|
|
169
|
+
if (onProgress)
|
|
170
|
+
onProgress(event);
|
|
171
|
+
};
|
|
172
|
+
// Accumulated state (hoisted so the timeout callback can read partial
|
|
173
|
+
// progress, AND so the FileTracker callback closure — constructed below
|
|
174
|
+
// — can read the running turn count at firing time).
|
|
175
|
+
//
|
|
176
|
+
// Turn attribution for tool calls: in codex-runner, tool calls fire in
|
|
177
|
+
// the tool-execution loop AFTER the model's stream for that turn has
|
|
178
|
+
// completed but BEFORE the next iteration of `while` starts. The `turns`
|
|
179
|
+
// variable already reflects the current turn at that point (it was
|
|
180
|
+
// incremented at the top of the iteration), so the callback can read it
|
|
181
|
+
// directly — no +1 offset.
|
|
182
|
+
let inputTokens = 0;
|
|
183
|
+
let outputTokens = 0;
|
|
184
|
+
let turns = 0;
|
|
185
|
+
const tracker = new FileTracker((summary) => {
|
|
186
|
+
emit({ kind: 'tool_call', turn: turns, toolSummary: summary });
|
|
187
|
+
});
|
|
150
188
|
const toolImpls = createToolImplementations(tracker, cwd, sandboxPolicy, abortController.signal);
|
|
151
189
|
const codexTools = toolMode === 'full' ? buildCodexTools(toolImpls, sandboxPolicy) : [];
|
|
152
190
|
const toolsByName = new Map(codexTools.map(t => [t.name, t]));
|
|
@@ -167,29 +205,108 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
167
205
|
? configuredHostedTools.map(t => ({ type: t }))
|
|
168
206
|
: [];
|
|
169
207
|
const allTools = [...responsesTools, ...hostedTools];
|
|
170
|
-
//
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
208
|
+
// --- Prevention layer: system prompt + budget hint ---
|
|
209
|
+
//
|
|
210
|
+
// buildSystemPrompt() is deliberately static and parameter-free (same
|
|
211
|
+
// decision as openai-runner and claude-runner: Task 1 review rejected
|
|
212
|
+
// provider/maxTurns options). The budget hint is prepended to the user
|
|
213
|
+
// prompt so the model sees it as part of its task brief, while the system
|
|
214
|
+
// prompt is threaded through the Responses API `instructions` field.
|
|
215
|
+
const systemPrompt = buildSystemPrompt();
|
|
216
|
+
const budgetHint = buildBudgetHint({ maxTurns });
|
|
217
|
+
const promptWithBudgetHint = `${budgetHint}\n\n${prompt}`;
|
|
218
|
+
// --- onInitialRequest (Task 12) ----------------------------------------
|
|
219
|
+
//
|
|
220
|
+
// Fire once per attempt with the canonical orchestrator-side initial
|
|
221
|
+
// brief: `${systemPrompt}\n\n${promptWithBudgetHint}`. This is NOT the
|
|
222
|
+
// literal request body the OpenAI Responses API transmits — codex
|
|
223
|
+
// sends the systemPrompt via the Responses API `instructions` field
|
|
224
|
+
// and the user prompt as a structured `input` message array. We hash
|
|
225
|
+
// the canonical form instead so the hash is cross-runner stable:
|
|
226
|
+
// openai-runner and claude-runner compute the same hash from the same
|
|
227
|
+
// canonical string. See `AttemptRecord.initialPromptHash` in types.ts
|
|
228
|
+
// for the full wire-level caveat.
|
|
229
|
+
if (options.onInitialRequest) {
|
|
230
|
+
const canonicalInitialBrief = `${systemPrompt}\n\n${promptWithBudgetHint}`;
|
|
231
|
+
try {
|
|
232
|
+
options.onInitialRequest({
|
|
233
|
+
lengthChars: canonicalInitialBrief.length,
|
|
234
|
+
sha256: createHash('sha256').update(canonicalInitialBrief).digest('hex'),
|
|
235
|
+
});
|
|
236
|
+
}
|
|
237
|
+
catch {
|
|
238
|
+
// Swallow — a broken callback must not affect dispatch.
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
// --- Scratchpad: buffers every text emission the codex backend streams
|
|
242
|
+
// through our loop. Every termination path (ok / incomplete / max_turns /
|
|
243
|
+
// error / timeout / force_salvage) salvages `scratchpad.latest()` when
|
|
244
|
+
// the final message is empty or degenerate. ---
|
|
245
|
+
const scratchpad = new TextScratchpad();
|
|
246
|
+
// --- Watchdog: resolve the input-token soft limit once per run ---
|
|
247
|
+
const profile = findModelProfile(providerConfig.model);
|
|
248
|
+
const softLimit = resolveInputTokenSoftLimit(providerConfig, profile);
|
|
174
249
|
const run = async () => {
|
|
175
250
|
const capture = {};
|
|
176
251
|
const client = createCodexClient(capture);
|
|
177
252
|
const input = [
|
|
178
253
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
179
|
-
{ role: 'user', content:
|
|
254
|
+
{ role: 'user', content: promptWithBudgetHint },
|
|
180
255
|
];
|
|
181
256
|
let output = '';
|
|
257
|
+
// --- Abort-path investigation (plan Step 2) ---------------------------
|
|
258
|
+
//
|
|
259
|
+
// The 2026-04-10 Fate dispatch captured an error "Request was aborted |
|
|
260
|
+
// last response status: completed". The "completed" suffix was
|
|
261
|
+
// misleading: it was captured from a PREVIOUS successful turn, not the
|
|
262
|
+
// failed one. Mechanism:
|
|
263
|
+
//
|
|
264
|
+
// 1. Turn N's stream emits `response.completed` with status
|
|
265
|
+
// `'completed'`. We update `lastResponseStatus = 'completed'`.
|
|
266
|
+
// 2. Turn N+1 starts; `client.responses.create(...)` opens a new
|
|
267
|
+
// stream, but the abort signal fires before any
|
|
268
|
+
// `response.completed` event is received.
|
|
269
|
+
// 3. The thrown error is caught below. The catch branch reads
|
|
270
|
+
// `lastResponseStatus` — which is STILL `'completed'` from turn N
|
|
271
|
+
// — and appends it as "last response status: completed", making
|
|
272
|
+
// the error look like it originated from a successful response.
|
|
273
|
+
//
|
|
274
|
+
// Fix: track which turn the status was captured on. If the status was
|
|
275
|
+
// NOT captured on the current (failed) turn, drop the suffix. That way
|
|
276
|
+
// we never emit a status that belongs to a different, already-
|
|
277
|
+
// concluded request. Users saw the misleading suffix and wasted time
|
|
278
|
+
// debugging a phantom "the request completed but was aborted" condition
|
|
279
|
+
// that doesn't exist.
|
|
182
280
|
let lastResponseStatus = null;
|
|
281
|
+
let lastResponseStatusTurn = null;
|
|
282
|
+
// --- Supervision / watchdog bookkeeping ---
|
|
283
|
+
let supervisionRetries = 0;
|
|
284
|
+
// Initialised to `null` (NOT ''): on the first turn there is no
|
|
285
|
+
// previous degenerate output to compare against, so the same-output
|
|
286
|
+
// early-out must be skipped. Initialising to '' would cause
|
|
287
|
+
// sameDegenerateOutput('', '') to fire on a first-turn empty output
|
|
288
|
+
// and break the loop before any retries run. See openai-runner
|
|
289
|
+
// regression #5.
|
|
290
|
+
let lastDegenerateOutput = null;
|
|
291
|
+
// High-watermark guard for the watchdog warning nudge — fire at most
|
|
292
|
+
// once per distinct input-token level. Mirrors openai-runner and
|
|
293
|
+
// claude-runner.
|
|
294
|
+
let lastWarnedInputTokens = -1;
|
|
183
295
|
try {
|
|
184
296
|
while (turns < maxTurns) {
|
|
185
297
|
turns++;
|
|
298
|
+
// Emit turn_start AFTER incrementing so `turn` matches the 1-indexed
|
|
299
|
+
// turn number we use everywhere else in this runner (the scratchpad
|
|
300
|
+
// append, watchdog logs, error diagnostics, result.turns).
|
|
301
|
+
emit({ kind: 'turn_start', turn: turns, provider: 'codex' });
|
|
186
302
|
// Codex backend requires streaming. The Codex backend's
|
|
187
303
|
// `response.completed` event does NOT populate `response.output` —
|
|
188
304
|
// we must accumulate content from individual stream events.
|
|
189
|
-
// `instructions`
|
|
305
|
+
// `instructions` carries the prevention-layer system prompt; the
|
|
306
|
+
// per-run budget hint is already prepended to the first user input.
|
|
190
307
|
const stream = await client.responses.create({
|
|
191
308
|
model: providerConfig.model,
|
|
192
|
-
instructions:
|
|
309
|
+
instructions: systemPrompt,
|
|
193
310
|
input,
|
|
194
311
|
stream: true,
|
|
195
312
|
store: false,
|
|
@@ -241,8 +358,10 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
241
358
|
inputTokens += r.usage.input_tokens ?? 0;
|
|
242
359
|
outputTokens += r.usage.output_tokens ?? 0;
|
|
243
360
|
}
|
|
244
|
-
if (r?.status)
|
|
361
|
+
if (r?.status) {
|
|
245
362
|
lastResponseStatus = r.status;
|
|
363
|
+
lastResponseStatusTurn = turns;
|
|
364
|
+
}
|
|
246
365
|
}
|
|
247
366
|
}
|
|
248
367
|
if (process.env.CODEX_DEBUG === '1') {
|
|
@@ -256,6 +375,20 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
256
375
|
if (!sawCompleted) {
|
|
257
376
|
throw new Error('Codex stream ended without a response.completed event');
|
|
258
377
|
}
|
|
378
|
+
// Buffer this turn's text into the scratchpad BEFORE any exit so
|
|
379
|
+
// every termination path (including supervision exhaustion and
|
|
380
|
+
// force_salvage) can salvage it. Codex does not emit <think> tags
|
|
381
|
+
// by default, so there is no stripping step here.
|
|
382
|
+
if (textThisTurn) {
|
|
383
|
+
scratchpad.append(turns, textThisTurn);
|
|
384
|
+
emit({
|
|
385
|
+
kind: 'text_emission',
|
|
386
|
+
turn: turns,
|
|
387
|
+
chars: textThisTurn.length,
|
|
388
|
+
preview: textThisTurn.slice(0, 200),
|
|
389
|
+
});
|
|
390
|
+
output = textThisTurn;
|
|
391
|
+
}
|
|
259
392
|
// Replay only function_call items into the next turn's input.
|
|
260
393
|
//
|
|
261
394
|
// We send `store: false` to the Responses API, which means the server
|
|
@@ -287,26 +420,156 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
287
420
|
});
|
|
288
421
|
}
|
|
289
422
|
}
|
|
290
|
-
|
|
291
|
-
|
|
423
|
+
// --- Watchdog checks after tokens are updated -------------------
|
|
424
|
+
const watchdogStatus = checkWatchdogThreshold(inputTokens, softLimit);
|
|
425
|
+
if (watchdogStatus !== 'ok') {
|
|
426
|
+
logWatchdogEvent(watchdogStatus, {
|
|
427
|
+
provider: 'codex',
|
|
428
|
+
model: providerConfig.model,
|
|
429
|
+
turn: turns,
|
|
430
|
+
inputTokens,
|
|
431
|
+
softLimit,
|
|
432
|
+
scratchpadChars: scratchpad.toString().length,
|
|
433
|
+
});
|
|
292
434
|
}
|
|
293
|
-
|
|
294
|
-
|
|
435
|
+
if (watchdogStatus === 'force_salvage') {
|
|
436
|
+
// `watchdog_force_salvage` is not an injected message — no
|
|
437
|
+
// re-prompt is sent — but observers still want to see exactly
|
|
438
|
+
// why the run is being killed. Emit with contentLengthChars: 0
|
|
439
|
+
// to reflect the "nothing was injected, we just terminated"
|
|
440
|
+
// semantics (mirrors openai-runner and claude-runner).
|
|
441
|
+
emit({
|
|
442
|
+
kind: 'injection',
|
|
443
|
+
injectionType: 'watchdog_force_salvage',
|
|
444
|
+
turn: turns,
|
|
445
|
+
contentLengthChars: 0,
|
|
446
|
+
});
|
|
447
|
+
const salvaged = buildCodexForceSalvageResult({
|
|
448
|
+
tracker,
|
|
449
|
+
scratchpad,
|
|
450
|
+
providerConfig,
|
|
451
|
+
inputTokens,
|
|
452
|
+
outputTokens,
|
|
453
|
+
turns,
|
|
454
|
+
softLimit,
|
|
455
|
+
});
|
|
456
|
+
emit({ kind: 'done', status: salvaged.status });
|
|
457
|
+
return salvaged;
|
|
458
|
+
}
|
|
459
|
+
// Warning-band nudge: fire at most once per distinct input-token
|
|
460
|
+
// high-watermark. Pushed as a user message so the next turn of
|
|
461
|
+
// the codex loop addresses the budget-pressure prompt. We use
|
|
462
|
+
// the shared prevention helper (NOT an inline string) so every
|
|
463
|
+
// runner emits byte-identical wording.
|
|
464
|
+
if (watchdogStatus === 'warning' && inputTokens > lastWarnedInputTokens) {
|
|
465
|
+
lastWarnedInputTokens = inputTokens;
|
|
466
|
+
const warning = buildBudgetPressureNudge({ inputTokens, softLimit });
|
|
467
|
+
input.push({
|
|
468
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
469
|
+
role: 'user',
|
|
470
|
+
content: warning,
|
|
471
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
472
|
+
});
|
|
473
|
+
emit({
|
|
474
|
+
kind: 'injection',
|
|
475
|
+
injectionType: 'watchdog_warning',
|
|
476
|
+
turn: turns,
|
|
477
|
+
contentLengthChars: warning.length,
|
|
478
|
+
});
|
|
295
479
|
}
|
|
296
|
-
//
|
|
480
|
+
// --- Periodic re-grounding inside the loop ---------------------
|
|
481
|
+
if (turns > 0 && turns % RE_GROUNDING_INTERVAL_TURNS === 0) {
|
|
482
|
+
const reground = buildReGroundingMessage({
|
|
483
|
+
originalPromptExcerpt: prompt,
|
|
484
|
+
currentTurn: turns,
|
|
485
|
+
maxTurns,
|
|
486
|
+
toolCallsSoFar: tracker.getToolCalls().length,
|
|
487
|
+
filesReadSoFar: tracker.getReads().length,
|
|
488
|
+
});
|
|
489
|
+
input.push({
|
|
490
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
491
|
+
role: 'user',
|
|
492
|
+
content: reground,
|
|
493
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
494
|
+
});
|
|
495
|
+
emit({
|
|
496
|
+
kind: 'injection',
|
|
497
|
+
injectionType: 'reground',
|
|
498
|
+
turn: turns,
|
|
499
|
+
contentLengthChars: reground.length,
|
|
500
|
+
});
|
|
501
|
+
}
|
|
502
|
+
// --- turn_complete: one event per while-iteration. Fires after the
|
|
503
|
+
// watchdog + re-grounding checks have run (so cumulative token
|
|
504
|
+
// counts and any injection events are already on the wire) and
|
|
505
|
+
// BEFORE the supervision branching / tool-execution loop. Every
|
|
506
|
+
// continue/return in the branches below happens AFTER this event,
|
|
507
|
+
// so the sequence "turn_start ... text_emission ... turn_complete"
|
|
508
|
+
// is guaranteed per iteration.
|
|
509
|
+
emit({
|
|
510
|
+
kind: 'turn_complete',
|
|
511
|
+
turn: turns,
|
|
512
|
+
cumulativeInputTokens: inputTokens,
|
|
513
|
+
cumulativeOutputTokens: outputTokens,
|
|
514
|
+
});
|
|
515
|
+
// If the model made no tool calls, the turn ended with either a
|
|
516
|
+
// final answer or a degenerate emission. Wrap in the supervision
|
|
517
|
+
// state machine: valid text is an immediate ok-exit; degenerate
|
|
518
|
+
// either re-prompts (and continues the loop) or — if the retry
|
|
519
|
+
// budget is spent / same-output early-out fires — exits as
|
|
520
|
+
// incomplete with scratchpad salvage.
|
|
297
521
|
if (toolCalls.length === 0) {
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
522
|
+
const stripped = textThisTurn; // codex does not emit <think> tags
|
|
523
|
+
const validation = validateCompletion(stripped);
|
|
524
|
+
if (validation.valid) {
|
|
525
|
+
const ok = buildCodexOkResult({
|
|
526
|
+
tracker,
|
|
527
|
+
scratchpad,
|
|
528
|
+
providerConfig,
|
|
302
529
|
inputTokens,
|
|
303
530
|
outputTokens,
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
}
|
|
531
|
+
turns,
|
|
532
|
+
output: stripped,
|
|
533
|
+
});
|
|
534
|
+
emit({ kind: 'done', status: ok.status });
|
|
535
|
+
return ok;
|
|
536
|
+
}
|
|
537
|
+
// Same-output early-out: only compare when we have a previous
|
|
538
|
+
// degenerate output. First-turn degeneracy must still get
|
|
539
|
+
// retries — see openai-runner regression #5.
|
|
540
|
+
if ((lastDegenerateOutput !== null &&
|
|
541
|
+
sameDegenerateOutput(stripped, lastDegenerateOutput)) ||
|
|
542
|
+
supervisionRetries >= MAX_SUPERVISION_RETRIES) {
|
|
543
|
+
const exhausted = buildCodexIncompleteResult({
|
|
544
|
+
tracker,
|
|
545
|
+
scratchpad,
|
|
546
|
+
providerConfig,
|
|
547
|
+
inputTokens,
|
|
548
|
+
outputTokens,
|
|
549
|
+
turns,
|
|
550
|
+
});
|
|
551
|
+
emit({ kind: 'done', status: exhausted.status });
|
|
552
|
+
return exhausted;
|
|
553
|
+
}
|
|
554
|
+
// Inject the re-prompt as the next user input and continue
|
|
555
|
+
// the loop. The next turn of the codex backend will respond
|
|
556
|
+
// to the re-prompt directly.
|
|
557
|
+
lastDegenerateOutput = stripped;
|
|
558
|
+
supervisionRetries++;
|
|
559
|
+
const rePrompt = buildRePrompt(validation);
|
|
560
|
+
input.push({
|
|
561
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
562
|
+
role: 'user',
|
|
563
|
+
content: rePrompt,
|
|
564
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
565
|
+
});
|
|
566
|
+
emit({
|
|
567
|
+
kind: 'injection',
|
|
568
|
+
injectionType: injectionTypeFor(validation.kind),
|
|
569
|
+
turn: turns,
|
|
570
|
+
contentLengthChars: rePrompt.length,
|
|
571
|
+
});
|
|
572
|
+
continue;
|
|
310
573
|
}
|
|
311
574
|
// Execute tool calls and feed outputs back
|
|
312
575
|
for (const call of toolCalls) {
|
|
@@ -331,19 +594,19 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
331
594
|
});
|
|
332
595
|
}
|
|
333
596
|
}
|
|
334
|
-
// Max turns exhausted
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
totalTokens: inputTokens + outputTokens,
|
|
342
|
-
costUSD: null,
|
|
343
|
-
},
|
|
597
|
+
// Max turns exhausted — salvage any buffered text.
|
|
598
|
+
const maxTurnsResult = buildCodexMaxTurnsResult({
|
|
599
|
+
tracker,
|
|
600
|
+
scratchpad,
|
|
601
|
+
providerConfig,
|
|
602
|
+
inputTokens,
|
|
603
|
+
outputTokens,
|
|
344
604
|
turns,
|
|
345
|
-
|
|
346
|
-
|
|
605
|
+
maxTurns,
|
|
606
|
+
lastOutput: output,
|
|
607
|
+
});
|
|
608
|
+
emit({ kind: 'done', status: maxTurnsResult.status });
|
|
609
|
+
return maxTurnsResult;
|
|
347
610
|
}
|
|
348
611
|
catch (err) {
|
|
349
612
|
// OpenAI SDK's APIError carries status/body/headers — surface them
|
|
@@ -374,30 +637,192 @@ export async function runCodex(prompt, options, providerConfig, defaults) {
|
|
|
374
637
|
}
|
|
375
638
|
if (e?.requestID)
|
|
376
639
|
pieces.push(`req_id=${e.requestID}`);
|
|
377
|
-
|
|
640
|
+
// Only include `last response status` when it was captured on the
|
|
641
|
+
// CURRENT (failing) turn — otherwise it belongs to a previous,
|
|
642
|
+
// separate request and appending it is actively misleading. See the
|
|
643
|
+
// abort-path investigation comment at the top of `run()`.
|
|
644
|
+
if (lastResponseStatus && lastResponseStatusTurn === turns) {
|
|
378
645
|
pieces.push(`last response status: ${lastResponseStatus}`);
|
|
646
|
+
}
|
|
647
|
+
else if (lastResponseStatus && lastResponseStatusTurn !== turns) {
|
|
648
|
+
pieces.push(`note: a previous request (turn ${lastResponseStatusTurn}) completed with status ` +
|
|
649
|
+
`"${lastResponseStatus}" — it is unrelated to this failure`);
|
|
650
|
+
}
|
|
379
651
|
const detailed = pieces.join(' | ') || String(err);
|
|
652
|
+
// Classify the thrown error into a finer-grained RunStatus. Task 7
|
|
653
|
+
// introduces api_aborted / api_error / network_error alongside the
|
|
654
|
+
// catch-all 'error' status. The turn-scoped `lastResponseStatus`
|
|
655
|
+
// disambiguation above is ORTHOGONAL to this classification: the
|
|
656
|
+
// `detailed` message is still the rich operator-facing diagnostic,
|
|
657
|
+
// and `classifyError` only decides which RunStatus bucket the
|
|
658
|
+
// failure lands in.
|
|
659
|
+
const { status } = classifyError(err);
|
|
660
|
+
// Salvage: if the scratchpad has buffered text from earlier turns,
|
|
661
|
+
// return it as the output. Pre-Task-5 behavior returned only the
|
|
662
|
+
// error string, losing 30k+ tokens of work on abort.
|
|
663
|
+
emit({ kind: 'done', status });
|
|
664
|
+
const hasSalvage = !scratchpad.isEmpty();
|
|
380
665
|
return {
|
|
381
|
-
output: `Sub-agent error: ${detailed}`,
|
|
382
|
-
status
|
|
666
|
+
output: hasSalvage ? scratchpad.latest() : `Sub-agent error: ${detailed}`,
|
|
667
|
+
status,
|
|
383
668
|
usage: {
|
|
384
669
|
inputTokens,
|
|
385
670
|
outputTokens,
|
|
386
671
|
totalTokens: inputTokens + outputTokens,
|
|
387
|
-
costUSD:
|
|
672
|
+
costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
|
|
388
673
|
},
|
|
389
674
|
turns,
|
|
390
|
-
|
|
675
|
+
filesRead: tracker.getReads(),
|
|
676
|
+
filesWritten: tracker.getWrites(),
|
|
677
|
+
toolCalls: tracker.getToolCalls(),
|
|
678
|
+
outputIsDiagnostic: !hasSalvage,
|
|
679
|
+
escalationLog: [],
|
|
391
680
|
error: detailed,
|
|
392
681
|
};
|
|
393
682
|
}
|
|
394
683
|
};
|
|
395
|
-
return withTimeout(run(), timeoutMs, () =>
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
684
|
+
return withTimeout(run(), timeoutMs, () => {
|
|
685
|
+
emit({ kind: 'done', status: 'timeout' });
|
|
686
|
+
const hasSalvage = !scratchpad.isEmpty();
|
|
687
|
+
return {
|
|
688
|
+
// Preserve any text the scratchpad buffered before the timeout fired.
|
|
689
|
+
// Partial usage is read from the running accumulators hoisted above —
|
|
690
|
+
// hardcoded zeros would discard every token counted on partial turns.
|
|
691
|
+
output: hasSalvage ? scratchpad.latest() : `Agent timed out after ${timeoutMs}ms.`,
|
|
692
|
+
status: 'timeout',
|
|
693
|
+
filesRead: tracker.getReads(),
|
|
694
|
+
filesWritten: tracker.getWrites(),
|
|
695
|
+
toolCalls: tracker.getToolCalls(),
|
|
696
|
+
usage: {
|
|
697
|
+
inputTokens,
|
|
698
|
+
outputTokens,
|
|
699
|
+
totalTokens: inputTokens + outputTokens,
|
|
700
|
+
costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
|
|
701
|
+
},
|
|
702
|
+
turns,
|
|
703
|
+
outputIsDiagnostic: !hasSalvage,
|
|
704
|
+
escalationLog: [],
|
|
705
|
+
};
|
|
706
|
+
}, abortController);
|
|
707
|
+
}
|
|
708
|
+
/**
 * Assemble the run result for a turn that ended with a validated final answer.
 *
 * Reads `tracker`, `providerConfig`, `inputTokens`, `outputTokens`, `turns`
 * and `output` from `args`; any other property on `args` is ignored.
 *
 * @param {object} args - Run state collected by the caller.
 * @returns {object} Result with `status: 'ok'`, the usage summary (token
 *   counts plus whatever `computeCostUSD` returns), and the tracker's
 *   reads / writes / tool calls.
 */
function buildCodexOkResult(args) {
    const {
        tracker: fileTracker,
        providerConfig: config,
        inputTokens: tokensIn,
        outputTokens: tokensOut,
        turns: turnCount,
        output: finalText,
    } = args;
    // Usage is computed first, before the tracker is queried.
    const usage = {
        inputTokens: tokensIn,
        outputTokens: tokensOut,
        totalTokens: tokensIn + tokensOut,
        costUSD: computeCostUSD(tokensIn, tokensOut, config),
    };
    const result = {
        output: finalText,
        status: 'ok',
        usage,
        turns: turnCount,
    };
    result.filesRead = fileTracker.getReads();
    result.filesWritten = fileTracker.getWrites();
    result.toolCalls = fileTracker.getToolCalls();
    // An `ok` result is by definition the model's own answer, not a
    // runner-generated diagnostic.
    result.outputIsDiagnostic = false;
    result.escalationLog = [];
    return result;
}
|
|
728
|
+
/**
 * Build the `incomplete` result used when supervision gives up (the caller
 * reaches this when the retry cap is hit or the same degenerate output is
 * seen again).
 *
 * If the scratchpad holds buffered text, its latest entry becomes the output.
 * Otherwise the output is the text produced by
 * `buildCodexIncompleteDiagnostic`, and `outputIsDiagnostic` is set to true.
 *
 * @param {object} args - Reads `tracker`, `scratchpad`, `providerConfig`,
 *   `inputTokens`, `outputTokens` and `turns`.
 * @returns {object} Result with `status: 'incomplete'`.
 */
function buildCodexIncompleteResult(args) {
    const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns } = args;
    // Snapshot the tracker lists once; they feed both the diagnostic and the
    // returned result.
    const reads = tracker.getReads();
    const writes = tracker.getWrites();
    const nothingBuffered = Boolean(scratchpad.isEmpty());
    let text;
    if (nothingBuffered) {
        text = buildCodexIncompleteDiagnostic({
            turns,
            inputTokens,
            outputTokens,
            filesRead: reads,
            filesWritten: writes,
        });
    }
    else {
        text = scratchpad.latest();
    }
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
    };
    return {
        output: text,
        status: 'incomplete',
        usage,
        turns,
        filesRead: reads,
        filesWritten: writes,
        toolCalls: tracker.getToolCalls(),
        outputIsDiagnostic: nothingBuffered,
        escalationLog: [],
    };
}
|
|
762
|
+
/**
 * Build the `incomplete` result for a run that is being terminated early and
 * salvaged (the caller uses this when the watchdog reports `force_salvage`).
 *
 * The latest scratchpad entry is returned when the scratchpad is non-empty.
 * Otherwise a bracketed diagnostic naming the input-token count and the soft
 * limit is returned, and `outputIsDiagnostic` is set to true.
 *
 * @param {object} args - Reads `tracker`, `scratchpad`, `providerConfig`,
 *   `inputTokens`, `outputTokens`, `turns` and `softLimit`.
 * @returns {object} Result with `status: 'incomplete'`.
 */
function buildCodexForceSalvageResult(args) {
    const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, softLimit } = args;
    const bufferIsEmpty = Boolean(scratchpad.isEmpty());
    let text;
    if (bufferIsEmpty) {
        text = `[codex sub-agent forcibly terminated at ${inputTokens} input tokens (soft limit ${softLimit}). No usable text was buffered.]`;
    }
    else {
        text = scratchpad.latest();
    }
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
    };
    const result = { output: text, status: 'incomplete', usage, turns };
    result.filesRead = tracker.getReads();
    result.filesWritten = tracker.getWrites();
    result.toolCalls = tracker.getToolCalls();
    result.outputIsDiagnostic = bufferIsEmpty;
    result.escalationLog = [];
    return result;
}
|
|
784
|
+
/**
 * Build the `max_turns` result returned once the turn budget is used up.
 *
 * Output selection, in priority order:
 *   1. the latest scratchpad entry, when the scratchpad is non-empty;
 *   2. `lastOutput`, when it is truthy;
 *   3. the `Agent exceeded max turns (N).` fallback.
 * Only case 3 is flagged as a diagnostic; the first two are model content.
 *
 * @param {object} args - Reads `tracker`, `scratchpad`, `providerConfig`,
 *   `inputTokens`, `outputTokens`, `turns`, `maxTurns` and `lastOutput`.
 * @returns {object} Result with `status: 'max_turns'`.
 */
function buildCodexMaxTurnsResult(args) {
    const { tracker, scratchpad, providerConfig, inputTokens, outputTokens, turns, maxTurns, lastOutput } = args;
    let finalText;
    let isDiagnostic = false;
    if (!scratchpad.isEmpty()) {
        finalText = scratchpad.latest();
    }
    else if (lastOutput) {
        finalText = lastOutput;
    }
    else {
        // Nothing buffered and no last-turn text: all that is left is the
        // runner's own message.
        finalText = `Agent exceeded max turns (${maxTurns}).`;
        isDiagnostic = true;
    }
    const usage = {
        inputTokens,
        outputTokens,
        totalTokens: inputTokens + outputTokens,
        costUSD: computeCostUSD(inputTokens, outputTokens, providerConfig),
    };
    const result = { output: finalText, status: 'max_turns', usage, turns };
    result.filesRead = tracker.getReads();
    result.filesWritten = tracker.getWrites();
    result.toolCalls = tracker.getToolCalls();
    result.outputIsDiagnostic = isDiagnostic;
    result.escalationLog = [];
    return result;
}
|
|
812
|
+
/**
 * Render the multi-line diagnostic text used when a run ends without a final
 * answer and there is no buffered text to return instead.
 *
 * @param {object} opts
 * @param {*} opts.turns - Interpolated after "Turns used:".
 * @param {*} opts.inputTokens - Interpolated after "Input tokens:".
 * @param {*} opts.outputTokens - Interpolated after "Output tokens:".
 * @param {{length: number}} opts.filesRead - Only `.length` is reported.
 * @param {{length: number}} opts.filesWritten - Only `.length` is reported.
 * @returns {string} Newline-joined message: header, explanation, run
 *   statistics, recommended action.
 */
function buildCodexIncompleteDiagnostic(opts) {
    const header = [
        '[codex sub-agent terminated without producing a final answer]',
        '',
        'The model emitted no tool calls and no usable text on its final turn, and',
        'supervision re-prompts did not recover a valid response.',
        '',
    ];
    const stats = [
        `Turns used: ${opts.turns}`,
        `Input tokens: ${opts.inputTokens}`,
        `Output tokens: ${opts.outputTokens}`,
        `Files read: ${opts.filesRead.length}`,
        `Files written: ${opts.filesWritten.length}`,
    ];
    const footer = [
        '',
        'Recommended action: re-dispatch with a tighter brief, or escalate provider tier.',
    ];
    return [...header, ...stats, ...footer].join('\n');
}
|
|
403
828
|
//# sourceMappingURL=codex-runner.js.map
|