@cat-factory/executor-harness 1.31.12 → 1.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-runner.js +206 -25
- package/dist/agent.js +81 -13
- package/dist/coding-agent.js +3 -1
- package/package.json +3 -3
- package/src/agent-runner.ts +225 -25
- package/src/agent.ts +81 -12
- package/src/coding-agent.ts +6 -2
- package/src/job.ts +7 -1
- package/src/pi.ts +40 -0
package/dist/agent-runner.js
CHANGED
|
@@ -7,6 +7,26 @@ import { redact, secretsToRedact } from './redact.js';
|
|
|
7
7
|
function isObject(value) {
|
|
8
8
|
return typeof value === 'object' && value !== null;
|
|
9
9
|
}
|
|
10
|
+
/** Scrub any leased-credential occurrences from a telemetry body (no-op when none). */
|
|
11
|
+
function redactBody(text, secrets) {
|
|
12
|
+
return secrets.length ? redact(text, secrets) : text;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Fallback token attribution: if a CLI reported a cumulative total but no per-turn
|
|
16
|
+
* usage (so every captured call has zero tokens), pin the whole total onto the LAST
|
|
17
|
+
* call rather than dropping it — the run's tokens are still accounted, just not split
|
|
18
|
+
* per turn. A no-op when the calls already carry per-turn tokens.
|
|
19
|
+
*/
|
|
20
|
+
function attributeCumulativeUsage(calls, usage) {
|
|
21
|
+
if (!usage || calls.length === 0)
|
|
22
|
+
return;
|
|
23
|
+
const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0);
|
|
24
|
+
if (anyTokens)
|
|
25
|
+
return;
|
|
26
|
+
const last = calls[calls.length - 1];
|
|
27
|
+
last.inputTokens = usage.inputTokens;
|
|
28
|
+
last.outputTokens = usage.outputTokens;
|
|
29
|
+
}
|
|
10
30
|
/**
|
|
11
31
|
* Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
|
|
12
32
|
* through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
|
|
@@ -114,27 +134,59 @@ export async function runClaudeCode(opts) {
|
|
|
114
134
|
const stats = { toolCalls: 0, assistantChars: 0 };
|
|
115
135
|
let summary = '';
|
|
116
136
|
let usage;
|
|
137
|
+
// Reconstruct the full per-call request/response bodies for telemetry from the
|
|
138
|
+
// stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
|
|
139
|
+
// Anthropic Messages envelope, so `assistant` events carry the complete response
|
|
140
|
+
// (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
|
|
141
|
+
// fed back — together the growing prompt transcript. We seed it with the two inputs
|
|
142
|
+
// the harness supplies (they never appear in the stream): the system + first user
|
|
143
|
+
// message. Bodies are credential-scrubbed (they can echo the leased token).
|
|
144
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
|
|
145
|
+
const messages = [
|
|
146
|
+
{ role: 'system', content: opts.systemPrompt },
|
|
147
|
+
{ role: 'user', content: opts.userPrompt },
|
|
148
|
+
];
|
|
149
|
+
const calls = [];
|
|
117
150
|
const onEvent = (event) => {
|
|
118
151
|
const type = event.type;
|
|
119
152
|
if (type === 'assistant' && isObject(event.message)) {
|
|
120
|
-
const
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
opts.onProgress(progress);
|
|
134
|
-
}
|
|
135
|
-
}
|
|
153
|
+
const message = event.message;
|
|
154
|
+
const content = Array.isArray(message.content) ? message.content : [];
|
|
155
|
+
const { text, reasoning, toolUses } = claudeAssistantContent(content);
|
|
156
|
+
stats.assistantChars += text.length;
|
|
157
|
+
stats.toolCalls += toolUses;
|
|
158
|
+
for (const block of content) {
|
|
159
|
+
if (isObject(block) &&
|
|
160
|
+
block.type === 'tool_use' &&
|
|
161
|
+
block.name === 'TodoWrite' &&
|
|
162
|
+
opts.onProgress) {
|
|
163
|
+
const progress = todosToProgress(block.input?.todos);
|
|
164
|
+
if (progress)
|
|
165
|
+
opts.onProgress(progress);
|
|
136
166
|
}
|
|
137
167
|
}
|
|
168
|
+
// Record this call BEFORE appending its turn: the prompt is the history that
|
|
169
|
+
// produced this response. The append-only array keeps each call's prompt a strict
|
|
170
|
+
// prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
|
|
171
|
+
const u = claudeCallUsage(message.usage);
|
|
172
|
+
calls.push({
|
|
173
|
+
...(typeof message.model === 'string' ? { model: message.model } : {}),
|
|
174
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
175
|
+
messageCount: messages.length,
|
|
176
|
+
responseText: redactBody(text, secrets),
|
|
177
|
+
reasoningText: redactBody(reasoning, secrets),
|
|
178
|
+
inputTokens: u.inputTokens,
|
|
179
|
+
cachedInputTokens: u.cachedInputTokens,
|
|
180
|
+
outputTokens: u.outputTokens,
|
|
181
|
+
finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
|
|
182
|
+
});
|
|
183
|
+
messages.push({ role: 'assistant', content });
|
|
184
|
+
}
|
|
185
|
+
else if (type === 'user' && isObject(event.message)) {
|
|
186
|
+
// tool_result blocks the harness fed back to the model — part of the next prompt.
|
|
187
|
+
const content = event.message.content;
|
|
188
|
+
if (Array.isArray(content))
|
|
189
|
+
messages.push({ role: 'tool', content });
|
|
138
190
|
}
|
|
139
191
|
else if (type === 'result') {
|
|
140
192
|
if (typeof event.result === 'string')
|
|
@@ -199,7 +251,14 @@ export async function runClaudeCode(opts) {
|
|
|
199
251
|
'--append-system-prompt',
|
|
200
252
|
opts.systemPrompt,
|
|
201
253
|
], opts.userPrompt, opts, env, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
|
|
202
|
-
|
|
254
|
+
attributeCumulativeUsage(calls, usage);
|
|
255
|
+
return {
|
|
256
|
+
summary,
|
|
257
|
+
stats,
|
|
258
|
+
stderrTail,
|
|
259
|
+
...(usage ? { usage } : {}),
|
|
260
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
261
|
+
};
|
|
203
262
|
}
|
|
204
263
|
finally {
|
|
205
264
|
// Never leave the config dir (and any cached credential) on disk past the run.
|
|
@@ -241,6 +300,38 @@ function claudeUsage(raw) {
|
|
|
241
300
|
return undefined;
|
|
242
301
|
return { inputTokens: input, outputTokens: output };
|
|
243
302
|
}
|
|
303
|
+
/** Pull the text + reasoning out of a Claude `assistant` message's content blocks. */
|
|
304
|
+
function claudeAssistantContent(content) {
|
|
305
|
+
let text = '';
|
|
306
|
+
let reasoning = '';
|
|
307
|
+
let toolUses = 0;
|
|
308
|
+
for (const block of content) {
|
|
309
|
+
if (!isObject(block))
|
|
310
|
+
continue;
|
|
311
|
+
if (block.type === 'text' && typeof block.text === 'string')
|
|
312
|
+
text += block.text;
|
|
313
|
+
else if (block.type === 'thinking' && typeof block.thinking === 'string')
|
|
314
|
+
reasoning += block.thinking;
|
|
315
|
+
else if (block.type === 'tool_use')
|
|
316
|
+
toolUses += 1;
|
|
317
|
+
}
|
|
318
|
+
return { text, reasoning, toolUses };
|
|
319
|
+
}
|
|
320
|
+
/**
|
|
321
|
+
* Per-CALL token usage off a Claude `assistant` message's `usage` (this turn only, not
|
|
322
|
+
* the cumulative `result` total). `inputTokens` counts every billed input bucket (fresh
|
|
323
|
+
* + both cache buckets); `cachedInputTokens` is the cache share, surfaced separately.
|
|
324
|
+
*/
|
|
325
|
+
function claudeCallUsage(raw) {
|
|
326
|
+
if (!isObject(raw))
|
|
327
|
+
return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 };
|
|
328
|
+
const cached = numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens);
|
|
329
|
+
return {
|
|
330
|
+
inputTokens: numberOf(raw.input_tokens) + cached,
|
|
331
|
+
cachedInputTokens: cached,
|
|
332
|
+
outputTokens: numberOf(raw.output_tokens),
|
|
333
|
+
};
|
|
334
|
+
}
|
|
244
335
|
// ---------------------------------------------------------------------------
|
|
245
336
|
// Codex
|
|
246
337
|
// ---------------------------------------------------------------------------
|
|
@@ -282,13 +373,29 @@ export async function runCodex(opts) {
|
|
|
282
373
|
await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
|
|
283
374
|
await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
|
|
284
375
|
}
|
|
376
|
+
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
377
|
+
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
378
|
+
const prompt = opts.systemPrompt
|
|
379
|
+
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
380
|
+
: opts.userPrompt;
|
|
381
|
+
// Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
|
|
382
|
+
// flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
|
|
383
|
+
// plus a cumulative total. It never exposes the request transcript or structured
|
|
384
|
+
// tool/command bodies, so the captured prompt is just the folded input — the response
|
|
385
|
+
// text + per-turn tokens are faithful; the request side is best-effort by design.
|
|
386
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
|
|
387
|
+
const messages = [{ role: 'user', content: prompt }];
|
|
388
|
+
const calls = [];
|
|
389
|
+
let pendingText = '';
|
|
285
390
|
const onEvent = (event) => {
|
|
286
391
|
const type = typeof event.type === 'string' ? event.type : '';
|
|
287
|
-
if (type.includes('agent_message') ||
|
|
392
|
+
if (type.includes('agent_message') ||
|
|
393
|
+
(type === 'item.completed' && isCodexMessageItem(event))) {
|
|
288
394
|
const text = extractText(event);
|
|
289
395
|
if (text) {
|
|
290
396
|
stats.assistantChars += text.length;
|
|
291
397
|
summary = text;
|
|
398
|
+
pendingText = text;
|
|
292
399
|
}
|
|
293
400
|
}
|
|
294
401
|
if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
|
|
@@ -300,12 +407,26 @@ export async function runCodex(opts) {
|
|
|
300
407
|
const turnUsage = codexUsage(event);
|
|
301
408
|
if (turnUsage)
|
|
302
409
|
usage = turnUsage;
|
|
410
|
+
// A `token_count` event closes a model turn: pair its per-turn usage with the
|
|
411
|
+
// assistant text seen since the previous turn as one telemetry call.
|
|
412
|
+
const perTurn = codexLastTurnUsage(event);
|
|
413
|
+
if (perTurn) {
|
|
414
|
+
calls.push({
|
|
415
|
+
model: opts.model,
|
|
416
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
417
|
+
messageCount: messages.length,
|
|
418
|
+
responseText: redactBody(pendingText, secrets),
|
|
419
|
+
reasoningText: '',
|
|
420
|
+
inputTokens: perTurn.inputTokens,
|
|
421
|
+
cachedInputTokens: perTurn.cachedInputTokens,
|
|
422
|
+
outputTokens: perTurn.outputTokens,
|
|
423
|
+
finishReason: null,
|
|
424
|
+
});
|
|
425
|
+
if (pendingText)
|
|
426
|
+
messages.push({ role: 'assistant', content: pendingText });
|
|
427
|
+
pendingText = '';
|
|
428
|
+
}
|
|
303
429
|
};
|
|
304
|
-
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
305
|
-
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
306
|
-
const prompt = opts.systemPrompt
|
|
307
|
-
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
308
|
-
: opts.userPrompt;
|
|
309
430
|
try {
|
|
310
431
|
const { stderrTail } = await streamCli('codex', [
|
|
311
432
|
'exec',
|
|
@@ -318,7 +439,28 @@ export async function runCodex(opts) {
|
|
|
318
439
|
opts.model,
|
|
319
440
|
'-',
|
|
320
441
|
], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
|
|
321
|
-
|
|
442
|
+
// Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
|
|
443
|
+
// single call from the cumulative total + final text so the run is still observable.
|
|
444
|
+
if (calls.length === 0 && (usage || summary)) {
|
|
445
|
+
calls.push({
|
|
446
|
+
model: opts.model,
|
|
447
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
448
|
+
messageCount: messages.length,
|
|
449
|
+
responseText: redactBody(summary, secrets),
|
|
450
|
+
reasoningText: '',
|
|
451
|
+
inputTokens: usage?.inputTokens ?? 0,
|
|
452
|
+
cachedInputTokens: 0,
|
|
453
|
+
outputTokens: usage?.outputTokens ?? 0,
|
|
454
|
+
finishReason: null,
|
|
455
|
+
});
|
|
456
|
+
}
|
|
457
|
+
return {
|
|
458
|
+
summary,
|
|
459
|
+
stats,
|
|
460
|
+
stderrTail,
|
|
461
|
+
...(usage ? { usage } : {}),
|
|
462
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
463
|
+
};
|
|
322
464
|
}
|
|
323
465
|
finally {
|
|
324
466
|
// Never leave the decrypted credential on disk past the run.
|
|
@@ -326,6 +468,24 @@ export async function runCodex(opts) {
|
|
|
326
468
|
await rm(codexHome, { recursive: true, force: true }).catch(() => { });
|
|
327
469
|
}
|
|
328
470
|
}
|
|
471
|
+
/**
|
|
472
|
+
* Whether a Codex `item.completed` event carries the model's ASSISTANT text (as
|
|
473
|
+
* opposed to a command/exec/tool/reasoning item, which also carry a `text` field —
|
|
474
|
+
* their command output or thinking — and must NOT be captured as the turn's response).
|
|
475
|
+
* A message item's kind contains `message` (`agent_message`/`assistant_message`); an
|
|
476
|
+
* item with no kind is treated as a message so older/simple shapes don't regress.
|
|
477
|
+
*/
|
|
478
|
+
function isCodexMessageItem(event) {
|
|
479
|
+
const item = isObject(event.item) ? event.item : undefined;
|
|
480
|
+
if (!item)
|
|
481
|
+
return false;
|
|
482
|
+
const kind = typeof item.item_type === 'string'
|
|
483
|
+
? item.item_type
|
|
484
|
+
: typeof item.type === 'string'
|
|
485
|
+
? item.type
|
|
486
|
+
: '';
|
|
487
|
+
return kind === '' || /message/i.test(kind);
|
|
488
|
+
}
|
|
329
489
|
/** Best-effort: pull a textual message out of a Codex event. */
|
|
330
490
|
function extractText(event) {
|
|
331
491
|
if (typeof event.message === 'string')
|
|
@@ -367,6 +527,8 @@ function codexPlanProgress(event) {
|
|
|
367
527
|
* other shapes put it on `usage` / `info.usage` directly. We read the cumulative
|
|
368
528
|
* total when present so the caller can simply overwrite (not sum) — summing
|
|
369
529
|
* cumulative totals across events would multiply-count. Checked most-likely first.
|
|
530
|
+
* `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
|
|
531
|
+
* is a subset already inside it), so it is NOT summed with the cached share.
|
|
370
532
|
*/
|
|
371
533
|
function codexUsage(event) {
|
|
372
534
|
const info = isObject(event.info) ? event.info : undefined;
|
|
@@ -376,12 +538,31 @@ function codexUsage(event) {
|
|
|
376
538
|
(info && isObject(info.usage) ? info.usage : undefined);
|
|
377
539
|
if (!isObject(raw))
|
|
378
540
|
return undefined;
|
|
379
|
-
const input = numberOf(raw.input_tokens)
|
|
541
|
+
const input = numberOf(raw.input_tokens);
|
|
380
542
|
const output = numberOf(raw.output_tokens);
|
|
381
543
|
if (input === 0 && output === 0)
|
|
382
544
|
return undefined;
|
|
383
545
|
return { inputTokens: input, outputTokens: output };
|
|
384
546
|
}
|
|
547
|
+
/**
|
|
548
|
+
* Per-TURN Codex token usage off a `token_count` event's `info.last_token_usage` (the
|
|
549
|
+
* delta for the turn just completed, as opposed to `codexUsage`'s cumulative total).
|
|
550
|
+
* `input_tokens` is the total prompt count for the turn and already INCLUDES the cached
|
|
551
|
+
* share (OpenAI semantics), so `cachedInputTokens` is surfaced as the subset it is —
|
|
552
|
+
* NOT added on top (adding it would double-count every cached token).
|
|
553
|
+
*/
|
|
554
|
+
function codexLastTurnUsage(event) {
|
|
555
|
+
const info = isObject(event.info) ? event.info : undefined;
|
|
556
|
+
const raw = info && isObject(info.last_token_usage) ? info.last_token_usage : undefined;
|
|
557
|
+
if (!isObject(raw))
|
|
558
|
+
return undefined;
|
|
559
|
+
const input = numberOf(raw.input_tokens);
|
|
560
|
+
const cached = numberOf(raw.cached_input_tokens);
|
|
561
|
+
const output = numberOf(raw.output_tokens);
|
|
562
|
+
if (input === 0 && output === 0)
|
|
563
|
+
return undefined;
|
|
564
|
+
return { inputTokens: input, cachedInputTokens: cached, outputTokens: output };
|
|
565
|
+
}
|
|
385
566
|
function numberOf(value) {
|
|
386
567
|
return typeof value === 'number' && Number.isFinite(value) ? value : 0;
|
|
387
568
|
}
|
package/dist/agent.js
CHANGED
|
@@ -341,7 +341,7 @@ async function runExploreMode(job, opts) {
|
|
|
341
341
|
try {
|
|
342
342
|
opts.onPhase?.('agent');
|
|
343
343
|
logger.info('agent(explore): running agent', { serviceDirectory });
|
|
344
|
-
const { summary, stats, stderrTail, usage, diagnostics: runDiag, } = await runAgentInWorkspace({
|
|
344
|
+
const { summary, stats, stderrTail, usage, callMetrics, diagnostics: runDiag, } = await runAgentInWorkspace({
|
|
345
345
|
dir: workDir,
|
|
346
346
|
systemPrompt: job.systemPrompt,
|
|
347
347
|
userPrompt,
|
|
@@ -368,6 +368,7 @@ async function runExploreMode(job, opts) {
|
|
|
368
368
|
error: noOutputReason(stats, stderrTail),
|
|
369
369
|
failureCause: 'no-usable-output',
|
|
370
370
|
...(usage ? { usage } : {}),
|
|
371
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
371
372
|
...infraSetupFields,
|
|
372
373
|
};
|
|
373
374
|
}
|
|
@@ -384,6 +385,7 @@ async function runExploreMode(job, opts) {
|
|
|
384
385
|
error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
|
|
385
386
|
failureCause: 'no-usable-output',
|
|
386
387
|
...(usage ? { usage } : {}),
|
|
388
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
387
389
|
...infraSetupFields,
|
|
388
390
|
};
|
|
389
391
|
}
|
|
@@ -391,7 +393,13 @@ async function runExploreMode(job, opts) {
|
|
|
391
393
|
// Prose: the summary IS the deliverable.
|
|
392
394
|
if (job.output?.kind !== 'structured') {
|
|
393
395
|
logger.info('agent(explore): done (prose)', { ...stats });
|
|
394
|
-
return {
|
|
396
|
+
return {
|
|
397
|
+
summary,
|
|
398
|
+
stats,
|
|
399
|
+
...(usage ? { usage } : {}),
|
|
400
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
401
|
+
...infraSetupFields,
|
|
402
|
+
};
|
|
395
403
|
}
|
|
396
404
|
// Structured: parse the agent's JSON. With repair enabled (default) a malformed
|
|
397
405
|
// reply gets ONE structured repair call before giving up; with `repair:false` we
|
|
@@ -432,6 +440,7 @@ async function runExploreMode(job, opts) {
|
|
|
432
440
|
error: noStructuredReason(stats, stderrTail, diagnostics),
|
|
433
441
|
failureCause: 'no-usable-output',
|
|
434
442
|
...(usage ? { usage } : {}),
|
|
443
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
435
444
|
...infraSetupFields,
|
|
436
445
|
};
|
|
437
446
|
}
|
|
@@ -451,7 +460,14 @@ async function runExploreMode(job, opts) {
|
|
|
451
460
|
custom.environment = reportedEnvironment;
|
|
452
461
|
}
|
|
453
462
|
logger.info('agent(explore): done (structured)', { ...stats });
|
|
454
|
-
return {
|
|
463
|
+
return {
|
|
464
|
+
summary,
|
|
465
|
+
custom,
|
|
466
|
+
stats,
|
|
467
|
+
...(usage ? { usage } : {}),
|
|
468
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
469
|
+
...infraSetupFields,
|
|
470
|
+
};
|
|
455
471
|
}
|
|
456
472
|
finally {
|
|
457
473
|
if (managed)
|
|
@@ -477,7 +493,7 @@ async function runCodingMode(job, opts) {
|
|
|
477
493
|
if (job.mergeBase)
|
|
478
494
|
return runConflictResolution(job, opts);
|
|
479
495
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
|
|
480
|
-
const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent({
|
|
496
|
+
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
|
|
481
497
|
kind: 'agent',
|
|
482
498
|
jobId: job.jobId,
|
|
483
499
|
repo: job.repo,
|
|
@@ -504,7 +520,14 @@ async function runCodingMode(job, opts) {
|
|
|
504
520
|
if (!pushed) {
|
|
505
521
|
// A no-op: a failure for the implementer, a clean non-event for the fixers.
|
|
506
522
|
if (job.noChangesIsError === false) {
|
|
507
|
-
return {
|
|
523
|
+
return {
|
|
524
|
+
pushed: false,
|
|
525
|
+
branch: pushBranch,
|
|
526
|
+
summary,
|
|
527
|
+
stats,
|
|
528
|
+
...(usage ? { usage } : {}),
|
|
529
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
530
|
+
};
|
|
508
531
|
}
|
|
509
532
|
return {
|
|
510
533
|
pushed: false,
|
|
@@ -514,6 +537,7 @@ async function runCodingMode(job, opts) {
|
|
|
514
537
|
error: noChangesReason('the agent produced no file changes', stats, stderrTail),
|
|
515
538
|
failureCause: 'no-changes',
|
|
516
539
|
...(usage ? { usage } : {}),
|
|
540
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
517
541
|
};
|
|
518
542
|
}
|
|
519
543
|
// Changes are on the branch. Open a PR only when the job asked for one.
|
|
@@ -539,7 +563,14 @@ async function runCodingMode(job, opts) {
|
|
|
539
563
|
// this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
|
|
540
564
|
if (prUrl === null) {
|
|
541
565
|
if (job.noChangesIsError === false) {
|
|
542
|
-
return {
|
|
566
|
+
return {
|
|
567
|
+
pushed: false,
|
|
568
|
+
branch: pushBranch,
|
|
569
|
+
summary,
|
|
570
|
+
stats,
|
|
571
|
+
...(usage ? { usage } : {}),
|
|
572
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
573
|
+
};
|
|
543
574
|
}
|
|
544
575
|
return {
|
|
545
576
|
pushed: false,
|
|
@@ -549,11 +580,27 @@ async function runCodingMode(job, opts) {
|
|
|
549
580
|
error: noChangesReason('the work branch has no commits ahead of its base (nothing to open a PR for)', stats, stderrTail),
|
|
550
581
|
failureCause: 'no-changes',
|
|
551
582
|
...(usage ? { usage } : {}),
|
|
583
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
552
584
|
};
|
|
553
585
|
}
|
|
554
|
-
return {
|
|
586
|
+
return {
|
|
587
|
+
pushed: true,
|
|
588
|
+
prUrl,
|
|
589
|
+
branch: pushBranch,
|
|
590
|
+
summary,
|
|
591
|
+
stats,
|
|
592
|
+
...(usage ? { usage } : {}),
|
|
593
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
594
|
+
};
|
|
555
595
|
}
|
|
556
|
-
return {
|
|
596
|
+
return {
|
|
597
|
+
pushed: true,
|
|
598
|
+
branch: pushBranch,
|
|
599
|
+
summary,
|
|
600
|
+
stats,
|
|
601
|
+
...(usage ? { usage } : {}),
|
|
602
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
603
|
+
};
|
|
557
604
|
}
|
|
558
605
|
/**
|
|
559
606
|
* Conflict-resolution coding flow (the conflict-resolver): clone the PR head `branch`
|
|
@@ -617,7 +664,7 @@ async function runConflictResolution(job, opts) {
|
|
|
617
664
|
logger.info('agent(conflict): resolving conflicts with agent', { conflicted });
|
|
618
665
|
const diff = await conflictDiff(dir, conflicted, signal);
|
|
619
666
|
const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt);
|
|
620
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
667
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
621
668
|
dir,
|
|
622
669
|
systemPrompt: job.systemPrompt,
|
|
623
670
|
userPrompt,
|
|
@@ -646,6 +693,7 @@ async function runConflictResolution(job, opts) {
|
|
|
646
693
|
error: unresolvedReason(unresolved, stats, stderrTail),
|
|
647
694
|
failureCause: 'agent',
|
|
648
695
|
...(usage ? { usage } : {}),
|
|
696
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
649
697
|
};
|
|
650
698
|
}
|
|
651
699
|
// Complete the merge commit with the agent's resolution staged, then push.
|
|
@@ -653,7 +701,14 @@ async function runConflictResolution(job, opts) {
|
|
|
653
701
|
opts.onPhase?.('push');
|
|
654
702
|
logger.info('agent(conflict): pushing resolved branch', { ...stats });
|
|
655
703
|
await pushBranch(dir, job.branch, job.ghToken, signal);
|
|
656
|
-
return {
|
|
704
|
+
return {
|
|
705
|
+
pushed: true,
|
|
706
|
+
branch: job.branch,
|
|
707
|
+
summary,
|
|
708
|
+
stats,
|
|
709
|
+
...(usage ? { usage } : {}),
|
|
710
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
711
|
+
};
|
|
657
712
|
});
|
|
658
713
|
}
|
|
659
714
|
/**
|
|
@@ -729,7 +784,7 @@ async function runBootstrap(job, opts) {
|
|
|
729
784
|
}
|
|
730
785
|
opts.onPhase?.('agent');
|
|
731
786
|
logger.info('agent(bootstrap): running agent');
|
|
732
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
787
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
733
788
|
dir,
|
|
734
789
|
systemPrompt: job.systemPrompt,
|
|
735
790
|
userPrompt: job.userPrompt,
|
|
@@ -749,7 +804,14 @@ async function runBootstrap(job, opts) {
|
|
|
749
804
|
if (!(await producedRepoContent(dir, !fromScratch, signal))) {
|
|
750
805
|
const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail);
|
|
751
806
|
logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats });
|
|
752
|
-
return {
|
|
807
|
+
return {
|
|
808
|
+
summary,
|
|
809
|
+
stats,
|
|
810
|
+
error,
|
|
811
|
+
failureCause: 'agent',
|
|
812
|
+
...(usage ? { usage } : {}),
|
|
813
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
814
|
+
};
|
|
753
815
|
}
|
|
754
816
|
opts.onPhase?.('push');
|
|
755
817
|
logger.info('agent(bootstrap): pushing bootstrapped contents', { ...stats });
|
|
@@ -764,7 +826,13 @@ async function runBootstrap(job, opts) {
|
|
|
764
826
|
: `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
|
|
765
827
|
});
|
|
766
828
|
logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch });
|
|
767
|
-
return {
|
|
829
|
+
return {
|
|
830
|
+
defaultBranch: boot.target.defaultBranch,
|
|
831
|
+
summary,
|
|
832
|
+
stats,
|
|
833
|
+
...(usage ? { usage } : {}),
|
|
834
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
835
|
+
};
|
|
768
836
|
});
|
|
769
837
|
}
|
|
770
838
|
/**
|
package/dist/coding-agent.js
CHANGED
|
@@ -195,7 +195,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
195
195
|
try {
|
|
196
196
|
opts.onPhase?.('agent');
|
|
197
197
|
logger.info('coding-agent: running agent', { serviceDirectory });
|
|
198
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
198
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
199
199
|
dir: workDir,
|
|
200
200
|
systemPrompt: spec.systemPrompt,
|
|
201
201
|
userPrompt: spec.userPrompt,
|
|
@@ -265,6 +265,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
265
265
|
stats,
|
|
266
266
|
...(stderrTail ? { stderrTail } : {}),
|
|
267
267
|
...(usage ? { usage } : {}),
|
|
268
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
268
269
|
};
|
|
269
270
|
}
|
|
270
271
|
else {
|
|
@@ -278,6 +279,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
278
279
|
stats,
|
|
279
280
|
...(stderrTail ? { stderrTail } : {}),
|
|
280
281
|
...(usage ? { usage } : {}),
|
|
282
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
281
283
|
};
|
|
282
284
|
}
|
|
283
285
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@cat-factory/executor-harness",
|
|
3
|
-
"version": "1.31.12",
|
|
3
|
+
"version": "1.32.0",
|
|
4
4
|
"description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -26,8 +26,8 @@
|
|
|
26
26
|
"hono": "^4.12.27",
|
|
27
27
|
"typescript": "^6.0.3",
|
|
28
28
|
"vitest": "^4.1.9",
|
|
29
|
-
"@cat-factory/
|
|
30
|
-
"@cat-factory/
|
|
29
|
+
"@cat-factory/server": "0.68.0",
|
|
30
|
+
"@cat-factory/spend": "0.10.74"
|
|
31
31
|
},
|
|
32
32
|
"scripts": {
|
|
33
33
|
"build": "tsc -p tsconfig.json",
|
package/src/agent-runner.ts
CHANGED
|
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process'
|
|
|
2
2
|
import { mkdtemp, rm, writeFile } from 'node:fs/promises'
|
|
3
3
|
import { tmpdir } from 'node:os'
|
|
4
4
|
import { join } from 'node:path'
|
|
5
|
-
import type { PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
|
|
5
|
+
import type { HarnessCallMetric, PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
|
|
6
6
|
import { killChildProcess, spawnDetached } from './process.js'
|
|
7
7
|
import { redact, secretsToRedact } from './redact.js'
|
|
8
8
|
|
|
@@ -64,6 +64,29 @@ function isObject(value: unknown): value is Record<string, unknown> {
|
|
|
64
64
|
return typeof value === 'object' && value !== null
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
/** Scrub any leased-credential occurrences from a telemetry body (no-op when none). */
|
|
68
|
+
function redactBody(text: string, secrets: string[]): string {
|
|
69
|
+
return secrets.length ? redact(text, secrets) : text
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Fallback token attribution: if a CLI reported a cumulative total but no per-turn
|
|
74
|
+
* usage (so every captured call has zero tokens), pin the whole total onto the LAST
|
|
75
|
+
* call rather than dropping it — the run's tokens are still accounted, just not split
|
|
76
|
+
* per turn. A no-op when the calls already carry per-turn tokens.
|
|
77
|
+
*/
|
|
78
|
+
function attributeCumulativeUsage(
|
|
79
|
+
calls: HarnessCallMetric[],
|
|
80
|
+
usage: { inputTokens: number; outputTokens: number } | undefined,
|
|
81
|
+
): void {
|
|
82
|
+
if (!usage || calls.length === 0) return
|
|
83
|
+
const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0)
|
|
84
|
+
if (anyTokens) return
|
|
85
|
+
const last = calls[calls.length - 1]!
|
|
86
|
+
last.inputTokens = usage.inputTokens
|
|
87
|
+
last.outputTokens = usage.outputTokens
|
|
88
|
+
}
|
|
89
|
+
|
|
67
90
|
/**
|
|
68
91
|
* Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
|
|
69
92
|
* through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
|
|
@@ -184,25 +207,59 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
|
|
|
184
207
|
let summary = ''
|
|
185
208
|
let usage: { inputTokens: number; outputTokens: number } | undefined
|
|
186
209
|
|
|
210
|
+
// Reconstruct the full per-call request/response bodies for telemetry from the
|
|
211
|
+
// stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
|
|
212
|
+
// Anthropic Messages envelope, so `assistant` events carry the complete response
|
|
213
|
+
// (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
|
|
214
|
+
// fed back — together the growing prompt transcript. We seed it with the two inputs
|
|
215
|
+
// the harness supplies (they never appear in the stream): the system + first user
|
|
216
|
+
// message. Bodies are credential-scrubbed (they can echo the leased token).
|
|
217
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
|
|
218
|
+
const messages: Array<{ role: string; content: unknown }> = [
|
|
219
|
+
{ role: 'system', content: opts.systemPrompt },
|
|
220
|
+
{ role: 'user', content: opts.userPrompt },
|
|
221
|
+
]
|
|
222
|
+
const calls: HarnessCallMetric[] = []
|
|
223
|
+
|
|
187
224
|
const onEvent = (event: Record<string, unknown>): void => {
|
|
188
225
|
const type = event.type
|
|
189
226
|
if (type === 'assistant' && isObject(event.message)) {
|
|
190
|
-
const
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
227
|
+
const message = event.message as Record<string, unknown>
|
|
228
|
+
const content = Array.isArray(message.content) ? message.content : []
|
|
229
|
+
const { text, reasoning, toolUses } = claudeAssistantContent(content)
|
|
230
|
+
stats.assistantChars += text.length
|
|
231
|
+
stats.toolCalls += toolUses
|
|
232
|
+
for (const block of content) {
|
|
233
|
+
if (
|
|
234
|
+
isObject(block) &&
|
|
235
|
+
block.type === 'tool_use' &&
|
|
236
|
+
block.name === 'TodoWrite' &&
|
|
237
|
+
opts.onProgress
|
|
238
|
+
) {
|
|
239
|
+
const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
|
|
240
|
+
if (progress) opts.onProgress(progress)
|
|
204
241
|
}
|
|
205
242
|
}
|
|
243
|
+
// Record this call BEFORE appending its turn: the prompt is the history that
|
|
244
|
+
// produced this response. The append-only array keeps each call's prompt a strict
|
|
245
|
+
// prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
|
|
246
|
+
const u = claudeCallUsage(message.usage)
|
|
247
|
+
calls.push({
|
|
248
|
+
...(typeof message.model === 'string' ? { model: message.model } : {}),
|
|
249
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
250
|
+
messageCount: messages.length,
|
|
251
|
+
responseText: redactBody(text, secrets),
|
|
252
|
+
reasoningText: redactBody(reasoning, secrets),
|
|
253
|
+
inputTokens: u.inputTokens,
|
|
254
|
+
cachedInputTokens: u.cachedInputTokens,
|
|
255
|
+
outputTokens: u.outputTokens,
|
|
256
|
+
finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
|
|
257
|
+
})
|
|
258
|
+
messages.push({ role: 'assistant', content })
|
|
259
|
+
} else if (type === 'user' && isObject(event.message)) {
|
|
260
|
+
// tool_result blocks the harness fed back to the model — part of the next prompt.
|
|
261
|
+
const content = (event.message as Record<string, unknown>).content
|
|
262
|
+
if (Array.isArray(content)) messages.push({ role: 'tool', content })
|
|
206
263
|
} else if (type === 'result') {
|
|
207
264
|
if (typeof event.result === 'string') summary = event.result
|
|
208
265
|
usage = claudeUsage(event.usage) ?? usage
|
|
@@ -282,7 +339,14 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
|
|
|
282
339
|
onEvent,
|
|
283
340
|
)
|
|
284
341
|
|
|
285
|
-
|
|
342
|
+
attributeCumulativeUsage(calls, usage)
|
|
343
|
+
return {
|
|
344
|
+
summary,
|
|
345
|
+
stats,
|
|
346
|
+
stderrTail,
|
|
347
|
+
...(usage ? { usage } : {}),
|
|
348
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
349
|
+
}
|
|
286
350
|
} finally {
|
|
287
351
|
// Never leave the config dir (and any cached credential) on disk past the run.
|
|
288
352
|
if (configHome) await rm(configHome, { recursive: true, force: true }).catch(() => {})
|
|
@@ -322,6 +386,44 @@ function claudeUsage(raw: unknown): { inputTokens: number; outputTokens: number
|
|
|
322
386
|
return { inputTokens: input, outputTokens: output }
|
|
323
387
|
}
|
|
324
388
|
|
|
389
|
+
/**
 * Summarise the content blocks of a Claude `assistant` message.
 *
 * Walks `content` once and collects three things:
 *  - `text`: the `text` of every `text` block, concatenated in order with no separator;
 *  - `reasoning`: the `thinking` of every `thinking` block, concatenated the same way;
 *  - `toolUses`: how many `tool_use` blocks were present.
 *
 * Entries that are not objects, blocks of any other type, and `text`/`thinking`
 * blocks whose payload is not a string are ignored.
 */
function claudeAssistantContent(content: unknown[]): {
  text: string
  reasoning: string
  toolUses: number
} {
  const textParts: string[] = []
  const reasoningParts: string[] = []
  let toolUses = 0
  for (const entry of content) {
    if (!isObject(entry)) continue
    switch (entry.type) {
      case 'text':
        if (typeof entry.text === 'string') textParts.push(entry.text)
        break
      case 'thinking':
        if (typeof entry.thinking === 'string') reasoningParts.push(entry.thinking)
        break
      case 'tool_use':
        toolUses += 1
        break
      default:
        // Any other block type contributes nothing.
        break
    }
  }
  return { text: textParts.join(''), reasoning: reasoningParts.join(''), toolUses }
}
|
|
407
|
+
|
|
408
|
+
/**
 * Per-CALL token usage read off one Claude `assistant` message's `usage` object
 * (this turn only — not the cumulative total carried by the `result` event).
 *
 * - `cachedInputTokens` is `cache_read_input_tokens + cache_creation_input_tokens`.
 * - `inputTokens` is `input_tokens` plus that cached share, i.e. every input bucket.
 * - `outputTokens` is `output_tokens`.
 *
 * Each field goes through `numberOf`, so a missing or non-numeric field counts as 0.
 * A `raw` value that is not an object yields all zeros.
 */
function claudeCallUsage(raw: unknown): {
  inputTokens: number
  cachedInputTokens: number
  outputTokens: number
} {
  if (!isObject(raw)) {
    return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 }
  }
  const cacheRead = numberOf(raw.cache_read_input_tokens)
  const cacheCreation = numberOf(raw.cache_creation_input_tokens)
  const cachedInputTokens = cacheRead + cacheCreation
  const inputTokens = numberOf(raw.input_tokens) + cachedInputTokens
  const outputTokens = numberOf(raw.output_tokens)
  return { inputTokens, cachedInputTokens, outputTokens }
}
|
|
426
|
+
|
|
325
427
|
// ---------------------------------------------------------------------------
|
|
326
428
|
// Codex
|
|
327
429
|
// ---------------------------------------------------------------------------
|
|
@@ -366,13 +468,33 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
366
468
|
await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8')
|
|
367
469
|
}
|
|
368
470
|
|
|
471
|
+
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
472
|
+
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
473
|
+
const prompt = opts.systemPrompt
|
|
474
|
+
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
475
|
+
: opts.userPrompt
|
|
476
|
+
|
|
477
|
+
// Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
|
|
478
|
+
// flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
|
|
479
|
+
// plus a cumulative total. It never exposes the request transcript or structured
|
|
480
|
+
// tool/command bodies, so the captured prompt is just the folded input — the response
|
|
481
|
+
// text + per-turn tokens are faithful; the request side is best-effort by design.
|
|
482
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
|
|
483
|
+
const messages: Array<{ role: string; content: unknown }> = [{ role: 'user', content: prompt }]
|
|
484
|
+
const calls: HarnessCallMetric[] = []
|
|
485
|
+
let pendingText = ''
|
|
486
|
+
|
|
369
487
|
const onEvent = (event: Record<string, unknown>): void => {
|
|
370
488
|
const type = typeof event.type === 'string' ? event.type : ''
|
|
371
|
-
if (
|
|
489
|
+
if (
|
|
490
|
+
type.includes('agent_message') ||
|
|
491
|
+
(type === 'item.completed' && isCodexMessageItem(event))
|
|
492
|
+
) {
|
|
372
493
|
const text = extractText(event)
|
|
373
494
|
if (text) {
|
|
374
495
|
stats.assistantChars += text.length
|
|
375
496
|
summary = text
|
|
497
|
+
pendingText = text
|
|
376
498
|
}
|
|
377
499
|
}
|
|
378
500
|
if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
|
|
@@ -382,14 +504,26 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
382
504
|
if (progress && opts.onProgress) opts.onProgress(progress)
|
|
383
505
|
const turnUsage = codexUsage(event)
|
|
384
506
|
if (turnUsage) usage = turnUsage
|
|
507
|
+
// A `token_count` event closes a model turn: pair its per-turn usage with the
|
|
508
|
+
// assistant text seen since the previous turn as one telemetry call.
|
|
509
|
+
const perTurn = codexLastTurnUsage(event)
|
|
510
|
+
if (perTurn) {
|
|
511
|
+
calls.push({
|
|
512
|
+
model: opts.model,
|
|
513
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
514
|
+
messageCount: messages.length,
|
|
515
|
+
responseText: redactBody(pendingText, secrets),
|
|
516
|
+
reasoningText: '',
|
|
517
|
+
inputTokens: perTurn.inputTokens,
|
|
518
|
+
cachedInputTokens: perTurn.cachedInputTokens,
|
|
519
|
+
outputTokens: perTurn.outputTokens,
|
|
520
|
+
finishReason: null,
|
|
521
|
+
})
|
|
522
|
+
if (pendingText) messages.push({ role: 'assistant', content: pendingText })
|
|
523
|
+
pendingText = ''
|
|
524
|
+
}
|
|
385
525
|
}
|
|
386
526
|
|
|
387
|
-
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
388
|
-
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
389
|
-
const prompt = opts.systemPrompt
|
|
390
|
-
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
391
|
-
: opts.userPrompt
|
|
392
|
-
|
|
393
527
|
try {
|
|
394
528
|
const { stderrTail } = await streamCli(
|
|
395
529
|
'codex',
|
|
@@ -411,13 +545,53 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
|
|
|
411
545
|
onEvent,
|
|
412
546
|
)
|
|
413
547
|
|
|
414
|
-
|
|
548
|
+
// Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
|
|
549
|
+
// single call from the cumulative total + final text so the run is still observable.
|
|
550
|
+
if (calls.length === 0 && (usage || summary)) {
|
|
551
|
+
calls.push({
|
|
552
|
+
model: opts.model,
|
|
553
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
554
|
+
messageCount: messages.length,
|
|
555
|
+
responseText: redactBody(summary, secrets),
|
|
556
|
+
reasoningText: '',
|
|
557
|
+
inputTokens: usage?.inputTokens ?? 0,
|
|
558
|
+
cachedInputTokens: 0,
|
|
559
|
+
outputTokens: usage?.outputTokens ?? 0,
|
|
560
|
+
finishReason: null,
|
|
561
|
+
})
|
|
562
|
+
}
|
|
563
|
+
return {
|
|
564
|
+
summary,
|
|
565
|
+
stats,
|
|
566
|
+
stderrTail,
|
|
567
|
+
...(usage ? { usage } : {}),
|
|
568
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
569
|
+
}
|
|
415
570
|
} finally {
|
|
416
571
|
// Never leave the decrypted credential on disk past the run.
|
|
417
572
|
if (codexHome) await rm(codexHome, { recursive: true, force: true }).catch(() => {})
|
|
418
573
|
}
|
|
419
574
|
}
|
|
420
575
|
|
|
576
|
+
/**
 * Decide whether a Codex `item.completed` event carries the model's ASSISTANT text.
 *
 * Command/exec/tool/reasoning items also carry a `text` field (their output or
 * thinking) and must NOT be captured as the turn's response, so the item's kind is
 * inspected: `item.item_type` when it is a string, otherwise `item.type` when that is
 * a string, otherwise the empty string.
 *
 * Returns `false` when `event.item` is not an object. Otherwise returns `true` when
 * the kind is empty (so older/simple shapes without a kind still count as messages)
 * or contains `message`, case-insensitively (`agent_message`, `assistant_message`, …).
 */
function isCodexMessageItem(event: Record<string, unknown>): boolean {
  const { item } = event
  if (!isObject(item)) return false

  let kind = ''
  if (typeof item.item_type === 'string') kind = item.item_type
  else if (typeof item.type === 'string') kind = item.type

  // No kind at all: treat as a message.
  if (kind === '') return true
  return /message/i.test(kind)
}
|
|
594
|
+
|
|
421
595
|
/** Best-effort: pull a textual message out of a Codex event. */
|
|
422
596
|
function extractText(event: Record<string, unknown>): string | undefined {
|
|
423
597
|
if (typeof event.message === 'string') return event.message
|
|
@@ -456,6 +630,8 @@ function codexPlanProgress(event: Record<string, unknown>): TodoProgress | undef
|
|
|
456
630
|
* other shapes put it on `usage` / `info.usage` directly. We read the cumulative
|
|
457
631
|
* total when present so the caller can simply overwrite (not sum) — summing
|
|
458
632
|
* cumulative totals across events would multiply-count. Checked most-likely first.
|
|
633
|
+
* `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
|
|
634
|
+
* is a subset already inside it), so it is NOT summed with the cached share.
|
|
459
635
|
*/
|
|
460
636
|
function codexUsage(
|
|
461
637
|
event: Record<string, unknown>,
|
|
@@ -467,12 +643,36 @@ function codexUsage(
|
|
|
467
643
|
(isObject(event.usage) ? event.usage : undefined) ??
|
|
468
644
|
(info && isObject(info.usage) ? info.usage : undefined)
|
|
469
645
|
if (!isObject(raw)) return undefined
|
|
470
|
-
const input = numberOf(raw.input_tokens)
|
|
646
|
+
const input = numberOf(raw.input_tokens)
|
|
471
647
|
const output = numberOf(raw.output_tokens)
|
|
472
648
|
if (input === 0 && output === 0) return undefined
|
|
473
649
|
return { inputTokens: input, outputTokens: output }
|
|
474
650
|
}
|
|
475
651
|
|
|
652
|
+
/**
 * Per-TURN Codex token usage, read from `event.info.last_token_usage` — the delta for
 * the turn that just completed, as opposed to the cumulative total `codexUsage` reads.
 *
 * `input_tokens` is the turn's total prompt count and already INCLUDES the cached
 * share (OpenAI semantics), so `cachedInputTokens` is reported as the subset it is and
 * is NOT added on top — doing so would double-count every cached token.
 *
 * Returns `undefined` when `info` or `info.last_token_usage` is not an object, or when
 * both the input and output counts are 0.
 */
function codexLastTurnUsage(event: Record<string, unknown>):
  | {
      inputTokens: number
      cachedInputTokens: number
      outputTokens: number
    }
  | undefined {
  const { info } = event
  if (!isObject(info)) return undefined

  const lastTurn = info.last_token_usage
  if (!isObject(lastTurn)) return undefined

  const inputTokens = numberOf(lastTurn.input_tokens)
  const cachedInputTokens = numberOf(lastTurn.cached_input_tokens)
  const outputTokens = numberOf(lastTurn.output_tokens)

  // A turn that reports neither input nor output tokens carries no usable usage.
  const isEmpty = inputTokens === 0 && outputTokens === 0
  return isEmpty ? undefined : { inputTokens, cachedInputTokens, outputTokens }
}
|
|
675
|
+
|
|
476
676
|
/**
 * Coerce an untrusted value to a usable number.
 *
 * Returns `value` unchanged when it is a finite number; returns 0 for anything else
 * (non-numbers such as strings or `null`, as well as `NaN` and ±`Infinity`).
 */
function numberOf(value: unknown): number {
  if (typeof value !== 'number') return 0
  if (!Number.isFinite(value)) return 0
  return value
}
|
package/src/agent.ts
CHANGED
|
@@ -421,6 +421,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
421
421
|
stats,
|
|
422
422
|
stderrTail,
|
|
423
423
|
usage,
|
|
424
|
+
callMetrics,
|
|
424
425
|
diagnostics: runDiag,
|
|
425
426
|
} = await runAgentInWorkspace(
|
|
426
427
|
{
|
|
@@ -453,6 +454,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
453
454
|
error: noOutputReason(stats, stderrTail),
|
|
454
455
|
failureCause: 'no-usable-output',
|
|
455
456
|
...(usage ? { usage } : {}),
|
|
457
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
456
458
|
...infraSetupFields,
|
|
457
459
|
}
|
|
458
460
|
}
|
|
@@ -470,6 +472,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
470
472
|
error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
|
|
471
473
|
failureCause: 'no-usable-output',
|
|
472
474
|
...(usage ? { usage } : {}),
|
|
475
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
473
476
|
...infraSetupFields,
|
|
474
477
|
}
|
|
475
478
|
}
|
|
@@ -478,7 +481,13 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
478
481
|
// Prose: the summary IS the deliverable.
|
|
479
482
|
if (job.output?.kind !== 'structured') {
|
|
480
483
|
logger.info('agent(explore): done (prose)', { ...stats })
|
|
481
|
-
return {
|
|
484
|
+
return {
|
|
485
|
+
summary,
|
|
486
|
+
stats,
|
|
487
|
+
...(usage ? { usage } : {}),
|
|
488
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
489
|
+
...infraSetupFields,
|
|
490
|
+
}
|
|
482
491
|
}
|
|
483
492
|
|
|
484
493
|
// Structured: parse the agent's JSON. With repair enabled (default) a malformed
|
|
@@ -522,6 +531,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
522
531
|
error: noStructuredReason(stats, stderrTail, diagnostics),
|
|
523
532
|
failureCause: 'no-usable-output',
|
|
524
533
|
...(usage ? { usage } : {}),
|
|
534
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
525
535
|
...infraSetupFields,
|
|
526
536
|
}
|
|
527
537
|
}
|
|
@@ -540,7 +550,14 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
|
|
|
540
550
|
;(custom as Record<string, unknown>).environment = reportedEnvironment
|
|
541
551
|
}
|
|
542
552
|
logger.info('agent(explore): done (structured)', { ...stats })
|
|
543
|
-
return {
|
|
553
|
+
return {
|
|
554
|
+
summary,
|
|
555
|
+
custom,
|
|
556
|
+
stats,
|
|
557
|
+
...(usage ? { usage } : {}),
|
|
558
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
559
|
+
...infraSetupFields,
|
|
560
|
+
}
|
|
544
561
|
} finally {
|
|
545
562
|
if (managed) await managed.cleanup()
|
|
546
563
|
}
|
|
@@ -565,7 +582,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
565
582
|
if (job.mergeBase) return runConflictResolution(job, opts)
|
|
566
583
|
|
|
567
584
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
|
|
568
|
-
const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent(
|
|
585
|
+
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
|
|
569
586
|
{
|
|
570
587
|
kind: 'agent',
|
|
571
588
|
jobId: job.jobId,
|
|
@@ -596,7 +613,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
596
613
|
if (!pushed) {
|
|
597
614
|
// A no-op: a failure for the implementer, a clean non-event for the fixers.
|
|
598
615
|
if (job.noChangesIsError === false) {
|
|
599
|
-
return {
|
|
616
|
+
return {
|
|
617
|
+
pushed: false,
|
|
618
|
+
branch: pushBranch,
|
|
619
|
+
summary,
|
|
620
|
+
stats,
|
|
621
|
+
...(usage ? { usage } : {}),
|
|
622
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
623
|
+
}
|
|
600
624
|
}
|
|
601
625
|
return {
|
|
602
626
|
pushed: false,
|
|
@@ -606,6 +630,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
606
630
|
error: noChangesReason('the agent produced no file changes', stats, stderrTail),
|
|
607
631
|
failureCause: 'no-changes',
|
|
608
632
|
...(usage ? { usage } : {}),
|
|
633
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
609
634
|
}
|
|
610
635
|
}
|
|
611
636
|
|
|
@@ -632,7 +657,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
632
657
|
// this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
|
|
633
658
|
if (prUrl === null) {
|
|
634
659
|
if (job.noChangesIsError === false) {
|
|
635
|
-
return {
|
|
660
|
+
return {
|
|
661
|
+
pushed: false,
|
|
662
|
+
branch: pushBranch,
|
|
663
|
+
summary,
|
|
664
|
+
stats,
|
|
665
|
+
...(usage ? { usage } : {}),
|
|
666
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
667
|
+
}
|
|
636
668
|
}
|
|
637
669
|
return {
|
|
638
670
|
pushed: false,
|
|
@@ -646,11 +678,27 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
|
|
|
646
678
|
),
|
|
647
679
|
failureCause: 'no-changes',
|
|
648
680
|
...(usage ? { usage } : {}),
|
|
681
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
649
682
|
}
|
|
650
683
|
}
|
|
651
|
-
return {
|
|
684
|
+
return {
|
|
685
|
+
pushed: true,
|
|
686
|
+
prUrl,
|
|
687
|
+
branch: pushBranch,
|
|
688
|
+
summary,
|
|
689
|
+
stats,
|
|
690
|
+
...(usage ? { usage } : {}),
|
|
691
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
692
|
+
}
|
|
693
|
+
}
|
|
694
|
+
return {
|
|
695
|
+
pushed: true,
|
|
696
|
+
branch: pushBranch,
|
|
697
|
+
summary,
|
|
698
|
+
stats,
|
|
699
|
+
...(usage ? { usage } : {}),
|
|
700
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
652
701
|
}
|
|
653
|
-
return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
|
|
654
702
|
}
|
|
655
703
|
|
|
656
704
|
/**
|
|
@@ -719,7 +767,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
|
|
|
719
767
|
const diff = await conflictDiff(dir, conflicted, signal)
|
|
720
768
|
const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt)
|
|
721
769
|
|
|
722
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
|
|
770
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
|
|
723
771
|
{
|
|
724
772
|
dir,
|
|
725
773
|
systemPrompt: job.systemPrompt,
|
|
@@ -752,6 +800,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
|
|
|
752
800
|
error: unresolvedReason(unresolved, stats, stderrTail),
|
|
753
801
|
failureCause: 'agent',
|
|
754
802
|
...(usage ? { usage } : {}),
|
|
803
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
755
804
|
}
|
|
756
805
|
}
|
|
757
806
|
// Complete the merge commit with the agent's resolution staged, then push.
|
|
@@ -759,7 +808,14 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
|
|
|
759
808
|
opts.onPhase?.('push')
|
|
760
809
|
logger.info('agent(conflict): pushing resolved branch', { ...stats })
|
|
761
810
|
await pushBranch(dir, job.branch, job.ghToken, signal)
|
|
762
|
-
return {
|
|
811
|
+
return {
|
|
812
|
+
pushed: true,
|
|
813
|
+
branch: job.branch,
|
|
814
|
+
summary,
|
|
815
|
+
stats,
|
|
816
|
+
...(usage ? { usage } : {}),
|
|
817
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
818
|
+
}
|
|
763
819
|
})
|
|
764
820
|
}
|
|
765
821
|
|
|
@@ -850,7 +906,7 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
|
|
|
850
906
|
|
|
851
907
|
opts.onPhase?.('agent')
|
|
852
908
|
logger.info('agent(bootstrap): running agent')
|
|
853
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
|
|
909
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
|
|
854
910
|
{
|
|
855
911
|
dir,
|
|
856
912
|
systemPrompt: job.systemPrompt,
|
|
@@ -874,7 +930,14 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
|
|
|
874
930
|
if (!(await producedRepoContent(dir, !fromScratch, signal))) {
|
|
875
931
|
const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail)
|
|
876
932
|
logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats })
|
|
877
|
-
return {
|
|
933
|
+
return {
|
|
934
|
+
summary,
|
|
935
|
+
stats,
|
|
936
|
+
error,
|
|
937
|
+
failureCause: 'agent',
|
|
938
|
+
...(usage ? { usage } : {}),
|
|
939
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
940
|
+
}
|
|
878
941
|
}
|
|
879
942
|
|
|
880
943
|
opts.onPhase?.('push')
|
|
@@ -890,7 +953,13 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
|
|
|
890
953
|
: `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
|
|
891
954
|
})
|
|
892
955
|
logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch })
|
|
893
|
-
return {
|
|
956
|
+
return {
|
|
957
|
+
defaultBranch: boot.target.defaultBranch,
|
|
958
|
+
summary,
|
|
959
|
+
stats,
|
|
960
|
+
...(usage ? { usage } : {}),
|
|
961
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
962
|
+
}
|
|
894
963
|
})
|
|
895
964
|
}
|
|
896
965
|
|
package/src/coding-agent.ts
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
remoteBranchExists,
|
|
18
18
|
} from './git.js'
|
|
19
19
|
import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js'
|
|
20
|
-
import type { PiRunStats } from './pi.js'
|
|
20
|
+
import type { HarnessCallMetric, PiRunStats } from './pi.js'
|
|
21
21
|
import {
|
|
22
22
|
acquireRepoCheckout,
|
|
23
23
|
agentNeverActed,
|
|
@@ -89,6 +89,8 @@ export interface CodingAgentOutcome {
|
|
|
89
89
|
stderrTail?: string
|
|
90
90
|
/** Token usage from a subscription harness's CLI stream (absent for Pi). */
|
|
91
91
|
usage?: { inputTokens: number; outputTokens: number }
|
|
92
|
+
/** Per-model-call telemetry from a subscription harness's CLI stream (absent for Pi). */
|
|
93
|
+
callMetrics?: HarnessCallMetric[]
|
|
92
94
|
}
|
|
93
95
|
|
|
94
96
|
/**
|
|
@@ -296,7 +298,7 @@ export async function runCodingAgent(
|
|
|
296
298
|
try {
|
|
297
299
|
opts.onPhase?.('agent')
|
|
298
300
|
logger.info('coding-agent: running agent', { serviceDirectory })
|
|
299
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
|
|
301
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
|
|
300
302
|
{
|
|
301
303
|
dir: workDir,
|
|
302
304
|
systemPrompt: spec.systemPrompt,
|
|
@@ -371,6 +373,7 @@ export async function runCodingAgent(
|
|
|
371
373
|
stats,
|
|
372
374
|
...(stderrTail ? { stderrTail } : {}),
|
|
373
375
|
...(usage ? { usage } : {}),
|
|
376
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
374
377
|
}
|
|
375
378
|
} else {
|
|
376
379
|
opts.onPhase?.('push')
|
|
@@ -383,6 +386,7 @@ export async function runCodingAgent(
|
|
|
383
386
|
stats,
|
|
384
387
|
...(stderrTail ? { stderrTail } : {}),
|
|
385
388
|
...(usage ? { usage } : {}),
|
|
389
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
386
390
|
}
|
|
387
391
|
}
|
|
388
392
|
} finally {
|
package/src/job.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { PiRunStats } from './pi.js'
|
|
1
|
+
import type { HarnessCallMetric, PiRunStats } from './pi.js'
|
|
2
2
|
import type { HarnessKind } from './pi-workspace.js'
|
|
3
3
|
import type { FailureCause } from './failure.js'
|
|
4
4
|
|
|
@@ -529,6 +529,12 @@ export interface AgentResult {
|
|
|
529
529
|
*/
|
|
530
530
|
failureCause?: FailureCause
|
|
531
531
|
usage?: { inputTokens: number; outputTokens: number }
|
|
532
|
+
/**
|
|
533
|
+
* Per-model-call telemetry from a subscription harness's CLI stream (absent for the
|
|
534
|
+
* proxy-metered Pi harness). The backend records these into `llm_call_metrics`. See
|
|
535
|
+
* {@link HarnessCallMetric}.
|
|
536
|
+
*/
|
|
537
|
+
callMetrics?: HarnessCallMetric[]
|
|
532
538
|
}
|
|
533
539
|
|
|
534
540
|
/** Parse the coding-mode bootstrap spec, or undefined when absent. Validates the target. */
|
package/src/pi.ts
CHANGED
|
@@ -414,6 +414,38 @@ export interface RunDiagnostics {
|
|
|
414
414
|
finalAnswerEmpty: boolean
|
|
415
415
|
}
|
|
416
416
|
|
|
417
|
+
/**
 * One model call captured from a subscription harness's CLI event stream, shaped so
 * the backend can record it into the same `llm_call_metrics` telemetry the LLM proxy
 * writes for the Pi harness. The subscription harnesses (Claude Code / Codex) talk
 * DIRECT to the vendor and never touch the proxy, so this is the only place their
 * per-call bodies are observable. Claude Code's `stream-json --verbose` is a near-
 * verbatim Anthropic Messages stream, so its calls carry full request/response
 * bodies; Codex's `exec --json` only surfaces flat assistant text + per-turn tokens,
 * so its rows are honestly thinner (no request transcript, no tool/command bodies).
 */
export interface HarnessCallMetric {
  /** The vendor model that served this call (from the CLI event), when reported. */
  model?: string
  /**
   * The full request as an OpenAI-style chat array (`[{role, content}, …]`),
   * JSON-stringified — the growing history as of this call. Matches the proxy's
   * `promptText` shape so the telemetry chain delta-compresses + renders identically.
   */
  promptText: string
  /** Number of messages encoded in {@link promptText} (the telemetry chain messageCount). */
  messageCount: number
  /** The assistant's response text, as a plain string (`''` for a tool-only turn). */
  responseText: string
  /** The reasoning/thinking trace, as a plain string (`''` when none). */
  reasoningText: string
  /**
   * Input tokens for this call, INCLUDING the cached share. The Claude Code runner
   * sums `input_tokens` with both cache buckets; the Codex runner passes through
   * `input_tokens`, which already contains the cached tokens.
   */
  inputTokens: number
  /**
   * The cached portion of {@link inputTokens} — a subset, not an addition. For Claude
   * Code this is cache-read plus cache-creation tokens; for Codex it is
   * `cached_input_tokens` (0 when only a cumulative total was available).
   */
  cachedInputTokens: number
  /** Output tokens the model produced for this call. */
  outputTokens: number
  /** The provider finish/stop reason when the CLI reports one (else null). */
  finishReason: string | null
}
|
|
448
|
+
|
|
417
449
|
/** Pi's assistant summary plus {@link PiRunStats} describing what it did. */
|
|
418
450
|
export interface PiRunOutcome {
|
|
419
451
|
summary: string
|
|
@@ -432,6 +464,14 @@ export interface PiRunOutcome {
|
|
|
432
464
|
* (usage-aware rotation) and telemetry. Absent for the proxy-metered Pi harness.
|
|
433
465
|
*/
|
|
434
466
|
usage?: { inputTokens: number; outputTokens: number }
|
|
467
|
+
/**
|
|
468
|
+
* Per-model-call telemetry lifted from a subscription harness's CLI event stream
|
|
469
|
+
* (Claude Code / Codex), which the backend records into `llm_call_metrics` — the
|
|
470
|
+
* proxy-bypassing analogue of the per-call rows the LLM proxy writes for Pi. Absent
|
|
471
|
+
* for the proxy-metered Pi harness (the proxy is its metering point). See
|
|
472
|
+
* {@link HarnessCallMetric}.
|
|
473
|
+
*/
|
|
474
|
+
callMetrics?: HarnessCallMetric[]
|
|
435
475
|
/** Output-quality signals (truncation / empty final answer); see {@link RunDiagnostics}. */
|
|
436
476
|
diagnostics?: RunDiagnostics
|
|
437
477
|
}
|