@cat-factory/executor-harness 1.31.12 → 1.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-runner.js +206 -25
- package/dist/agent.js +87 -13
- package/dist/coding-agent.js +3 -1
- package/dist/job.js +59 -0
- package/dist/package-registries.js +51 -0
- package/dist/redact.js +17 -4
- package/package.json +3 -3
- package/src/agent-runner.ts +225 -25
- package/src/agent.ts +87 -12
- package/src/coding-agent.ts +6 -2
- package/src/job.ts +100 -1
- package/src/package-registries.ts +58 -0
- package/src/pi.ts +40 -0
- package/src/redact.ts +18 -4
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { chmod, rm, writeFile } from 'node:fs/promises';
|
|
2
|
+
import { homedir } from 'node:os';
|
|
3
|
+
import { join } from 'node:path';
|
|
4
|
+
import { registerKnownSecrets } from './redact.js';
|
|
5
|
+
// Private package-registry auth for the checkout's installs (npm private orgs,
|
|
6
|
+
// GitHub Packages). The job's allowlisted entries are rendered into the USER
|
|
7
|
+
// `~/.npmrc` — read by npm, pnpm and yarn v1 alike, and inherited by every child
|
|
8
|
+
// process (the agent's own shell installs and the frontend-infra stand-up's) — so
|
|
9
|
+
// the token never rides argv or the checkout. Written per job; a job with NO
|
|
10
|
+
// entries removes any stale file, because warm-pool containers are reused across
|
|
11
|
+
// jobs and must not leak a prior workspace's token.
|
|
12
|
+
/**
 * Location of the per-job npm auth file: `.npmrc` directly inside the current
 * user's home directory (the user npmrc, outside any checkout).
 */
export function npmrcPath() {
  const userHome = homedir();
  return join(userHome, '.npmrc');
}
|
|
16
|
+
/**
 * Render the job's registry entries as npmrc text.
 *
 * Output order: one `<scope>:registry=https://<host>/` line per scope (in entry
 * order), then one `//<host>/:_authToken=<token>` line per distinct host, followed
 * by a trailing newline.
 *
 * @param entries Registry entries, each `{ host, scopes, token }`. An entry with no
 *   `scopes` contributes only its host credential line.
 * @returns The npmrc file content.
 * @throws {Error} When a host, scope or token contains a CR or LF. These values are
 *   written verbatim, so a line break would start a new npmrc line and could inject
 *   arbitrary settings. The message never includes the offending value.
 */
export function renderNpmrc(entries) {
  const assertSingleLine = (label, value) => {
    if (/[\r\n]/.test(String(value))) {
      throw new Error(`package registry ${label} must not contain line breaks`);
    }
  };
  const routes = [];
  const tokensByHost = new Map();
  for (const { host, scopes, token } of entries) {
    assertSingleLine('host', host);
    assertSingleLine('token', token);
    for (const scope of scopes ?? []) {
      assertSingleLine('scope', scope);
      routes.push(`${scope}:registry=https://${host}/`);
    }
    // Last entry wins per host — entries for the same host carry the same vendor
    // token in practice (the backend stores one token per entry).
    tokensByHost.set(host, token);
  }
  const credentials = Array.from(
    tokensByHost,
    ([host, token]) => `//${host}/:_authToken=${token}`,
  );
  return `${[...routes, ...credentials].join('\n')}\n`;
}
|
|
36
|
+
/**
 * Write (or clear) the per-job `~/.npmrc` before the agent runs.
 *
 * The file left by a previous job is always removed first. With no entries that is
 * all that happens. With entries, their tokens are registered with the redaction
 * module and a fresh owner-only (0600) file is written.
 *
 * @param entries Registry entries (`{ host, scopes, token }`), or a nullish/empty
 *   value to only clear the file.
 */
export async function configurePackageRegistries(entries) {
  const path = npmrcPath();
  // Remove before writing rather than overwriting in place: the new tokens then
  // never land in a pre-existing file with looser permissions, and a failed
  // render/write cannot leave an earlier job's credentials behind.
  await rm(path, { force: true });
  if (!entries || entries.length === 0) {
    return;
  }
  registerKnownSecrets(entries.map((entry) => entry.token));
  await writeFile(path, renderNpmrc(entries), { mode: 0o600 });
  // writeFile's mode only applies on create — re-assert it in case the file was
  // recreated by something else between the rm and the write.
  await chmod(path, 0o600);
}
|
package/dist/redact.js
CHANGED
|
@@ -28,12 +28,25 @@ const MIN_HARVEST_LEN = 12;
|
|
|
28
28
|
// `DB_ACCESS_KEY`/`api_key` are covered; `auth` is deliberately excluded so it can't
|
|
29
29
|
// clobber a git `Author:` line. The value is the first whitespace-delimited run.
|
|
30
30
|
const CREDENTIAL_ASSIGNMENT = /\b([A-Za-z0-9_]*(?:password|passwd|pwd|secret|token|key|credential)[A-Za-z0-9_]*\s*[:=]\s*)\S+/gi;
|
|
31
|
+
// Known-secret values registered per JOB (e.g. the job's private-registry tokens),
|
|
32
|
+
// scrubbed on EVERY redaction — including the pattern-only `redactSecrets` call sites
|
|
33
|
+
// that carry no per-call secret list. Accumulating across jobs on a reused container
|
|
34
|
+
// is safe: redaction only ever widens.
|
|
35
|
+
const REGISTERED_SECRETS = new Set();
|
|
36
|
+
/**
 * Add secret values to the module-level registry that `redact` scrubs in addition to
 * its per-call list. Empty values and values shorter than `MIN_REDACT_LEN` are
 * ignored.
 */
export function registerKnownSecrets(values) {
  for (const candidate of values) {
    const eligible = Boolean(candidate) && candidate.length >= MIN_REDACT_LEN;
    if (eligible) {
      REGISTERED_SECRETS.add(candidate);
    }
  }
}
|
|
31
43
|
/**
|
|
32
44
|
* Strip credentials out of any string before it is logged or stored. Applies the
|
|
33
45
|
* pattern rules (URL userinfo `https://user:pass@host`, `x-access-token:<token>`, bare
|
|
34
46
|
* `ghs_`/`ghp_`/`gho_`/`github_pat_` shapes, and credential-named `KEY=value` / `KEY:
|
|
35
|
-
* value` assignments) and then scrubs every supplied known-secret value
|
|
36
|
-
* safe to call on
|
|
47
|
+
* value` assignments) and then scrubs every supplied known-secret value plus the
|
|
48
|
+
* module-registered ones ({@link registerKnownSecrets}). Idempotent — safe to call on
|
|
49
|
+
* already-redacted text.
|
|
37
50
|
*/
|
|
38
51
|
export function redact(input, knownSecrets = []) {
|
|
39
52
|
let out = input
|
|
@@ -41,14 +54,14 @@ export function redact(input, knownSecrets = []) {
|
|
|
41
54
|
.replace(/x-access-token:[^@\s]+/gi, 'x-access-token:***')
|
|
42
55
|
.replace(/\b(gh[pso]_|github_pat_)[A-Za-z0-9_]+/g, '$1***')
|
|
43
56
|
.replace(CREDENTIAL_ASSIGNMENT, '$1***');
|
|
44
|
-
for (const secret of knownSecrets) {
|
|
57
|
+
for (const secret of [...knownSecrets, ...REGISTERED_SECRETS]) {
|
|
45
58
|
// Guard against scrubbing trivially-short values that would mangle output.
|
|
46
59
|
if (secret.length >= MIN_REDACT_LEN)
|
|
47
60
|
out = out.split(secret).join('***');
|
|
48
61
|
}
|
|
49
62
|
return out;
|
|
50
63
|
}
|
|
51
|
-
/** Pattern-
|
|
64
|
+
/**
 * Redact `input` with the pattern rules plus the module-registered secret values.
 * Kept for callers that have no per-call secret list to pass.
 */
export function redactSecrets(input) {
  const scrubbed = redact(input);
  return scrubbed;
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/executor-harness",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.34.0",
|
|
4
4
|
"description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,8 +26,8 @@
|
|
|
26
26
|
"hono": "^4.12.27",
|
|
27
27
|
"typescript": "^6.0.3",
|
|
28
28
|
"vitest": "^4.1.9",
|
|
29
|
-
"@cat-factory/
|
|
30
|
-
"@cat-factory/
|
|
29
|
+
"@cat-factory/server": "0.69.0",
|
|
30
|
+
"@cat-factory/spend": "0.10.77"
|
|
31
31
|
},
|
|
32
32
|
"scripts": {
|
|
33
33
|
"build": "tsc -p tsconfig.json",
|
package/src/agent-runner.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process'
|
|
|
2
2
|
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
|
|
3
3
|
import { tmpdir } from 'node:os'
|
|
4
4
|
import { join } from 'node:path'
|
|
5
|
-
import type { PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
|
|
5
|
+
import type { HarnessCallMetric, PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
|
|
6
6
|
import { killChildProcess, spawnDetached } from './process.js'
|
|
7
7
|
import { redact, secretsToRedact } from './redact.js'
|
|
8
8
|
|
|
@@ -64,6 +64,29 @@ function isObject(value: unknown): value is Record<string, unknown> {
|
|
|
64
64
|
return typeof value === 'object' && value !== null
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
/**
 * Scrub leased-credential occurrences from a telemetry body. With an empty secret
 * list the text is returned untouched and `redact` is not invoked at all.
 */
function redactBody(text: string, secrets: string[]): string {
  if (secrets.length === 0) return text
  return redact(text, secrets)
}
|
|
71
|
+
|
|
72
|
+
/**
 * Fallback token attribution for a CLI that reported only a cumulative total.
 *
 * When none of the captured calls carries any input or output tokens, the whole
 * total is written onto the LAST call (mutating it in place) so the run's tokens are
 * still accounted for, just not split per turn. Does nothing when there is no usage,
 * no calls, or at least one call already has per-turn tokens.
 */
function attributeCumulativeUsage(
  calls: HarnessCallMetric[],
  usage: { inputTokens: number; outputTokens: number } | undefined,
): void {
  if (calls.length === 0 || !usage) return
  for (const call of calls) {
    // Any per-turn figure means the split is already known — leave it alone.
    if (call.inputTokens > 0 || call.outputTokens > 0) return
  }
  const tail = calls[calls.length - 1]!
  tail.inputTokens = usage.inputTokens
  tail.outputTokens = usage.outputTokens
}
|
|
89
|
+
|
|
67
90
|
/**
|
|
68
91
|
* Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
|
|
69
92
|
* through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
|
|
@@ -184,25 +207,59 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
|
|
|
184
207
|
let summary = ''
|
|
185
208
|
let usage: { inputTokens: number; outputTokens: number } | undefined
|
|
186
209
|
|
|
210
|
+
// Reconstruct the full per-call request/response bodies for telemetry from the
|
|
211
|
+
// stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
|
|
212
|
+
// Anthropic Messages envelope, so `assistant` events carry the complete response
|
|
213
|
+
// (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
|
|
214
|
+
// fed back — together the growing prompt transcript. We seed it with the two inputs
|
|
215
|
+
// the harness supplies (they never appear in the stream): the system + first user
|
|
216
|
+
// message. Bodies are credential-scrubbed (they can echo the leased token).
|
|
217
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
|
|
218
|
+
const messages: Array<{ role: string; content: unknown }> = [
|
|
219
|
+
{ role: 'system', content: opts.systemPrompt },
|
|
220
|
+
{ role: 'user', content: opts.userPrompt },
|
|
221
|
+
]
|
|
222
|
+
const calls: HarnessCallMetric[] = []
|
|
223
|
+
|
|
187
224
|
const onEvent = (event: Record<string, unknown>): void => {
|
|
188
225
|
const type = event.type
|
|
189
226
|
if (type === 'assistant' && isObject(event.message)) {
|
|
190
|
-
const
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
227
|
+
const message = event.message as Record<string, unknown>
|
|
228
|
+
const content = Array.isArray(message.content) ? message.content : []
|
|
229
|
+
const { text, reasoning, toolUses } = claudeAssistantContent(content)
|
|
230
|
+
stats.assistantChars += text.length
|
|
231
|
+
stats.toolCalls += toolUses
|
|
232
|
+
for (const block of content) {
|
|
233
|
+
if (
|
|
234
|
+
isObject(block) &&
|
|
235
|
+
block.type === 'tool_use' &&
|
|
236
|
+
block.name === 'TodoWrite' &&
|
|
237
|
+
opts.onProgress
|
|
238
|
+
) {
|
|
239
|
+
const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
|
|
240
|
+
if (progress) opts.onProgress(progress)
|
|
204
241
|
}
|
|
205
242
|
}
|
|
243
|
+
// Record this call BEFORE appending its turn: the prompt is the history that
|
|
244
|
+
// produced this response. The append-only array keeps each call's prompt a strict
|
|
245
|
+
// prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
|
|
246
|
+
const u = claudeCallUsage(message.usage)
|
|
247
|
+
calls.push({
|
|
248
|
+
...(typeof message.model === 'string' ? { model: message.model } : {}),
|
|
249
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
250
|
+
messageCount: messages.length,
|
|
251
|
+
responseText: redactBody(text, secrets),
|
|
252
|
+
reasoningText: redactBody(reasoning, secrets),
|
|
253
|
+
inputTokens: u.inputTokens,
|
|
254
|
+
cachedInputTokens: u.cachedInputTokens,
|
|
255
|
+
outputTokens: u.outputTokens,
|
|
256
|
+
finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
|
|
257
|
+
})
|
|
258
|
+
messages.push({ role: 'assistant', content })
|
|
259
|
+
} else if (type === 'user' && isObject(event.message)) {
|
|
260
|
+
// tool_result blocks the harness fed back to the model — part of the next prompt.
|
|
261
|
+
const content = (event.message as Record<string, unknown>).content
|
|
262
|
+
if (Array.isArray(content)) messages.push({ role: 'tool', content })
|
|
206
263
|
} else if (type === 'result') {
|
|
207
264
|
if (typeof event.result === 'string') summary = event.result
|
|
208
265
|
usage = claudeUsage(event.usage) ?? usage
|
|
@@ -282,7 +339,14 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
|
|
|
282
339
|
onEvent,
|
|
283
340
|
)
|
|
284
341
|
|
|
285
|
-
|
|
342
|
+
attributeCumulativeUsage(calls, usage)
|
|
343
|
+
return {
|
|
344
|
+
summary,
|
|
345
|
+
stats,
|
|
346
|
+
stderrTail,
|
|
347
|
+
...(usage ? { usage } : {}),
|
|
348
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
349
|
+
}
|
|
286
350
|
} finally {
|
|
287
351
|
// Never leave the config dir (and any cached credential) on disk past the run.
|
|
288
352
|
if (configHome) await rm(configHome, { recursive: true, force: true }).catch(() => {})
|
|
@@ -322,6 +386,44 @@ function claudeUsage(raw: unknown): { inputTokens: number; outputTokens: number
|
|
|
322
386
|
return { inputTokens: input, outputTokens: output }
|
|
323
387
|
}
|
|
324
388
|
|
|
389
|
+
/**
 * Summarise a Claude `assistant` message's content blocks: concatenated `text`
 * blocks, concatenated `thinking` blocks, and the number of `tool_use` blocks.
 * Non-object blocks and blocks of any other type are ignored.
 */
function claudeAssistantContent(content: unknown[]): {
  text: string
  reasoning: string
  toolUses: number
} {
  const textParts: string[] = []
  const reasoningParts: string[] = []
  let toolUses = 0
  for (const block of content) {
    if (!isObject(block)) continue
    switch (block.type) {
      case 'text':
        if (typeof block.text === 'string') textParts.push(block.text)
        break
      case 'thinking':
        if (typeof block.thinking === 'string') reasoningParts.push(block.thinking)
        break
      case 'tool_use':
        toolUses += 1
        break
      default:
        break
    }
  }
  return { text: textParts.join(''), reasoning: reasoningParts.join(''), toolUses }
}
|
|
407
|
+
|
|
408
|
+
/**
 * Per-CALL token usage read off a Claude `assistant` message's `usage` object (this
 * turn only, not the cumulative `result` total). `cachedInputTokens` is the sum of
 * the cache-read and cache-creation counts; `inputTokens` is `input_tokens` plus that
 * cached sum. A non-object `usage` yields all zeros.
 */
function claudeCallUsage(raw: unknown): {
  inputTokens: number
  cachedInputTokens: number
  outputTokens: number
} {
  if (isObject(raw)) {
    const cacheRead = numberOf(raw.cache_read_input_tokens)
    const cacheCreation = numberOf(raw.cache_creation_input_tokens)
    const cachedInputTokens = cacheRead + cacheCreation
    return {
      inputTokens: numberOf(raw.input_tokens) + cachedInputTokens,
      cachedInputTokens,
      outputTokens: numberOf(raw.output_tokens),
    }
  }
  return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 }
}
|
|
426
|
+
|
|
325
427
|
// ---------------------------------------------------------------------------
|
|
326
428
|
// Codex
|
|
327
429
|
// ---------------------------------------------------------------------------
|
|
@@ -366,13 +468,33 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
366
468
|
await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8')
|
|
367
469
|
}
|
|
368
470
|
|
|
471
|
+
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
472
|
+
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
473
|
+
const prompt = opts.systemPrompt
|
|
474
|
+
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
475
|
+
: opts.userPrompt
|
|
476
|
+
|
|
477
|
+
// Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
|
|
478
|
+
// flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
|
|
479
|
+
// plus a cumulative total. It never exposes the request transcript or structured
|
|
480
|
+
// tool/command bodies, so the captured prompt is just the folded input — the response
|
|
481
|
+
// text + per-turn tokens are faithful; the request side is best-effort by design.
|
|
482
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
|
|
483
|
+
const messages: Array<{ role: string; content: unknown }> = [{ role: 'user', content: prompt }]
|
|
484
|
+
const calls: HarnessCallMetric[] = []
|
|
485
|
+
let pendingText = ''
|
|
486
|
+
|
|
369
487
|
const onEvent = (event: Record<string, unknown>): void => {
|
|
370
488
|
const type = typeof event.type === 'string' ? event.type : ''
|
|
371
|
-
if (
|
|
489
|
+
if (
|
|
490
|
+
type.includes('agent_message') ||
|
|
491
|
+
(type === 'item.completed' && isCodexMessageItem(event))
|
|
492
|
+
) {
|
|
372
493
|
const text = extractText(event)
|
|
373
494
|
if (text) {
|
|
374
495
|
stats.assistantChars += text.length
|
|
375
496
|
summary = text
|
|
497
|
+
pendingText = text
|
|
376
498
|
}
|
|
377
499
|
}
|
|
378
500
|
if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
|
|
@@ -382,14 +504,26 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
382
504
|
if (progress && opts.onProgress) opts.onProgress(progress)
|
|
383
505
|
const turnUsage = codexUsage(event)
|
|
384
506
|
if (turnUsage) usage = turnUsage
|
|
507
|
+
// A `token_count` event closes a model turn: pair its per-turn usage with the
|
|
508
|
+
// assistant text seen since the previous turn as one telemetry call.
|
|
509
|
+
const perTurn = codexLastTurnUsage(event)
|
|
510
|
+
if (perTurn) {
|
|
511
|
+
calls.push({
|
|
512
|
+
model: opts.model,
|
|
513
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
514
|
+
messageCount: messages.length,
|
|
515
|
+
responseText: redactBody(pendingText, secrets),
|
|
516
|
+
reasoningText: '',
|
|
517
|
+
inputTokens: perTurn.inputTokens,
|
|
518
|
+
cachedInputTokens: perTurn.cachedInputTokens,
|
|
519
|
+
outputTokens: perTurn.outputTokens,
|
|
520
|
+
finishReason: null,
|
|
521
|
+
})
|
|
522
|
+
if (pendingText) messages.push({ role: 'assistant', content: pendingText })
|
|
523
|
+
pendingText = ''
|
|
524
|
+
}
|
|
385
525
|
}
|
|
386
526
|
|
|
387
|
-
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
388
|
-
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
389
|
-
const prompt = opts.systemPrompt
|
|
390
|
-
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
391
|
-
: opts.userPrompt
|
|
392
|
-
|
|
393
527
|
try {
|
|
394
528
|
const { stderrTail } = await streamCli(
|
|
395
529
|
'codex',
|
|
@@ -411,13 +545,53 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
411
545
|
onEvent,
|
|
412
546
|
)
|
|
413
547
|
|
|
414
|
-
|
|
548
|
+
// Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
|
|
549
|
+
// single call from the cumulative total + final text so the run is still observable.
|
|
550
|
+
if (calls.length === 0 && (usage || summary)) {
|
|
551
|
+
calls.push({
|
|
552
|
+
model: opts.model,
|
|
553
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
554
|
+
messageCount: messages.length,
|
|
555
|
+
responseText: redactBody(summary, secrets),
|
|
556
|
+
reasoningText: '',
|
|
557
|
+
inputTokens: usage?.inputTokens ?? 0,
|
|
558
|
+
cachedInputTokens: 0,
|
|
559
|
+
outputTokens: usage?.outputTokens ?? 0,
|
|
560
|
+
finishReason: null,
|
|
561
|
+
})
|
|
562
|
+
}
|
|
563
|
+
return {
|
|
564
|
+
summary,
|
|
565
|
+
stats,
|
|
566
|
+
stderrTail,
|
|
567
|
+
...(usage ? { usage } : {}),
|
|
568
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
569
|
+
}
|
|
415
570
|
} finally {
|
|
416
571
|
// Never leave the decrypted credential on disk past the run.
|
|
417
572
|
if (codexHome) await rm(codexHome, { recursive: true, force: true }).catch(() => {})
|
|
418
573
|
}
|
|
419
574
|
}
|
|
420
575
|
|
|
576
|
+
/**
 * Decide whether a Codex `item.completed` event carries the model's ASSISTANT text
 * rather than a command/exec/tool/reasoning item (those also have a `text` field and
 * must not be captured as the turn's response).
 *
 * The item's kind is read from `item.item_type`, falling back to `item.type`. It
 * counts as a message when the kind contains `message` (case-insensitive), or when
 * no kind is present so older/simple shapes don't regress. An event without an
 * object `item` is never a message.
 */
function isCodexMessageItem(event: Record<string, unknown>): boolean {
  const { item } = event
  if (!isObject(item)) return false
  let kind = ''
  if (typeof item.item_type === 'string') {
    kind = item.item_type
  } else if (typeof item.type === 'string') {
    kind = item.type
  }
  if (kind === '') return true
  return /message/i.test(kind)
}
|
|
594
|
+
|
|
421
595
|
/** Best-effort: pull a textual message out of a Codex event. */
|
|
422
596
|
function extractText(event: Record<string, unknown>): string | undefined {
|
|
423
597
|
if (typeof event.message === 'string') return event.message
|
|
@@ -456,6 +630,8 @@ function codexPlanProgress(event: Record<string, unknown>): TodoProgress | undef
|
|
|
456
630
|
* other shapes put it on `usage` / `info.usage` directly. We read the cumulative
|
|
457
631
|
* total when present so the caller can simply overwrite (not sum) — summing
|
|
458
632
|
* cumulative totals across events would multiply-count. Checked most-likely first.
|
|
633
|
+
* `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
|
|
634
|
+
* is a subset already inside it), so it is NOT summed with the cached share.
|
|
459
635
|
*/
|
|
460
636
|
function codexUsage(
|
|
461
637
|
event: Record<string, unknown>,
|
|
@@ -467,12 +643,36 @@ function codexUsage(
|
|
|
467
643
|
(isObject(event.usage) ? event.usage : undefined) ??
|
|
468
644
|
(info && isObject(info.usage) ? info.usage : undefined)
|
|
469
645
|
if (!isObject(raw)) return undefined
|
|
470
|
-
const input = numberOf(raw.input_tokens)
|
|
646
|
+
const input = numberOf(raw.input_tokens)
|
|
471
647
|
const output = numberOf(raw.output_tokens)
|
|
472
648
|
if (input === 0 && output === 0) return undefined
|
|
473
649
|
return { inputTokens: input, outputTokens: output }
|
|
474
650
|
}
|
|
475
651
|
|
|
652
|
+
/**
 * Per-TURN Codex token usage taken from an event's `info.last_token_usage` (the
 * figures for the turn just completed, as opposed to `codexUsage`'s cumulative
 * total). `input_tokens` already INCLUDES the cached share (OpenAI semantics), so
 * `cachedInputTokens` is reported as the subset it is and never added on top.
 *
 * Returns `undefined` when the event has no such object, or when both the input and
 * output counts are zero.
 */
function codexLastTurnUsage(event: Record<string, unknown>):
  | {
      inputTokens: number
      cachedInputTokens: number
      outputTokens: number
    }
  | undefined {
  const info = event.info
  if (!isObject(info)) return undefined
  const lastTurn = info.last_token_usage
  if (!isObject(lastTurn)) return undefined
  const inputTokens = numberOf(lastTurn.input_tokens)
  const outputTokens = numberOf(lastTurn.output_tokens)
  if (inputTokens === 0 && outputTokens === 0) return undefined
  return {
    inputTokens,
    cachedInputTokens: numberOf(lastTurn.cached_input_tokens),
    outputTokens,
  }
}
|
|
675
|
+
|
|
476
676
|
/** Coerce an unknown value to a number: finite numbers pass through, anything else is 0. */
function numberOf(value: unknown): number {
  if (typeof value !== 'number') return 0
  return Number.isFinite(value) ? value : 0
}
|