@cat-factory/executor-harness 1.31.12 → 1.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,26 @@ import { redact, secretsToRedact } from './redact.js';
7
7
  function isObject(value) {
8
8
  return typeof value === 'object' && value !== null;
9
9
  }
10
+ /** Scrub any leased-credential occurrences from a telemetry body (no-op when none). */
11
+ function redactBody(text, secrets) {
12
+ return secrets.length ? redact(text, secrets) : text;
13
+ }
14
+ /**
15
+ * Fallback token attribution: if a CLI reported a cumulative total but no per-turn
16
+ * usage (so every captured call has zero tokens), pin the whole total onto the LAST
17
+ * call rather than dropping it — the run's tokens are still accounted, just not split
18
+ * per turn. A no-op when the calls already carry per-turn tokens.
19
+ */
20
+ function attributeCumulativeUsage(calls, usage) {
21
+ if (!usage || calls.length === 0)
22
+ return;
23
+ const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0);
24
+ if (anyTokens)
25
+ return;
26
+ const last = calls[calls.length - 1];
27
+ last.inputTokens = usage.inputTokens;
28
+ last.outputTokens = usage.outputTokens;
29
+ }
10
30
  /**
11
31
  * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
12
32
  * through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
@@ -114,27 +134,59 @@ export async function runClaudeCode(opts) {
114
134
  const stats = { toolCalls: 0, assistantChars: 0 };
115
135
  let summary = '';
116
136
  let usage;
137
+ // Reconstruct the full per-call request/response bodies for telemetry from the
138
+ // stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
139
+ // Anthropic Messages envelope, so `assistant` events carry the complete response
140
+ // (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
141
+ // fed back — together the growing prompt transcript. We seed it with the two inputs
142
+ // the harness supplies (they never appear in the stream): the system + first user
143
+ // message. Bodies are credential-scrubbed (they can echo the leased token).
144
+ const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
145
+ const messages = [
146
+ { role: 'system', content: opts.systemPrompt },
147
+ { role: 'user', content: opts.userPrompt },
148
+ ];
149
+ const calls = [];
117
150
  const onEvent = (event) => {
118
151
  const type = event.type;
119
152
  if (type === 'assistant' && isObject(event.message)) {
120
- const content = event.message.content;
121
- if (Array.isArray(content)) {
122
- for (const block of content) {
123
- if (!isObject(block))
124
- continue;
125
- if (block.type === 'text' && typeof block.text === 'string') {
126
- stats.assistantChars += block.text.length;
127
- }
128
- if (block.type === 'tool_use') {
129
- stats.toolCalls += 1;
130
- if (block.name === 'TodoWrite' && opts.onProgress) {
131
- const progress = todosToProgress(block.input?.todos);
132
- if (progress)
133
- opts.onProgress(progress);
134
- }
135
- }
153
+ const message = event.message;
154
+ const content = Array.isArray(message.content) ? message.content : [];
155
+ const { text, reasoning, toolUses } = claudeAssistantContent(content);
156
+ stats.assistantChars += text.length;
157
+ stats.toolCalls += toolUses;
158
+ for (const block of content) {
159
+ if (isObject(block) &&
160
+ block.type === 'tool_use' &&
161
+ block.name === 'TodoWrite' &&
162
+ opts.onProgress) {
163
+ const progress = todosToProgress(block.input?.todos);
164
+ if (progress)
165
+ opts.onProgress(progress);
136
166
  }
137
167
  }
168
+ // Record this call BEFORE appending its turn: the prompt is the history that
169
+ // produced this response. The append-only array keeps each call's prompt a strict
170
+ // prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
171
+ const u = claudeCallUsage(message.usage);
172
+ calls.push({
173
+ ...(typeof message.model === 'string' ? { model: message.model } : {}),
174
+ promptText: redactBody(JSON.stringify(messages), secrets),
175
+ messageCount: messages.length,
176
+ responseText: redactBody(text, secrets),
177
+ reasoningText: redactBody(reasoning, secrets),
178
+ inputTokens: u.inputTokens,
179
+ cachedInputTokens: u.cachedInputTokens,
180
+ outputTokens: u.outputTokens,
181
+ finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
182
+ });
183
+ messages.push({ role: 'assistant', content });
184
+ }
185
+ else if (type === 'user' && isObject(event.message)) {
186
+ // tool_result blocks the harness fed back to the model — part of the next prompt.
187
+ const content = event.message.content;
188
+ if (Array.isArray(content))
189
+ messages.push({ role: 'tool', content });
138
190
  }
139
191
  else if (type === 'result') {
140
192
  if (typeof event.result === 'string')
@@ -199,7 +251,14 @@ export async function runClaudeCode(opts) {
199
251
  '--append-system-prompt',
200
252
  opts.systemPrompt,
201
253
  ], opts.userPrompt, opts, env, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
202
- return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
254
+ attributeCumulativeUsage(calls, usage);
255
+ return {
256
+ summary,
257
+ stats,
258
+ stderrTail,
259
+ ...(usage ? { usage } : {}),
260
+ ...(calls.length ? { callMetrics: calls } : {}),
261
+ };
203
262
  }
204
263
  finally {
205
264
  // Never leave the config dir (and any cached credential) on disk past the run.
@@ -241,6 +300,38 @@ function claudeUsage(raw) {
241
300
  return undefined;
242
301
  return { inputTokens: input, outputTokens: output };
243
302
  }
303
+ /** Pull the text + reasoning out of a Claude `assistant` message's content blocks. */
304
+ function claudeAssistantContent(content) {
305
+ let text = '';
306
+ let reasoning = '';
307
+ let toolUses = 0;
308
+ for (const block of content) {
309
+ if (!isObject(block))
310
+ continue;
311
+ if (block.type === 'text' && typeof block.text === 'string')
312
+ text += block.text;
313
+ else if (block.type === 'thinking' && typeof block.thinking === 'string')
314
+ reasoning += block.thinking;
315
+ else if (block.type === 'tool_use')
316
+ toolUses += 1;
317
+ }
318
+ return { text, reasoning, toolUses };
319
+ }
320
+ /**
321
+ * Per-CALL token usage off a Claude `assistant` message's `usage` (this turn only, not
322
+ * the cumulative `result` total). `inputTokens` counts every billed input bucket (fresh
323
+ * + both cache buckets); `cachedInputTokens` is the cache share, surfaced separately.
324
+ */
325
+ function claudeCallUsage(raw) {
326
+ if (!isObject(raw))
327
+ return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 };
328
+ const cached = numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens);
329
+ return {
330
+ inputTokens: numberOf(raw.input_tokens) + cached,
331
+ cachedInputTokens: cached,
332
+ outputTokens: numberOf(raw.output_tokens),
333
+ };
334
+ }
244
335
  // ---------------------------------------------------------------------------
245
336
  // Codex
246
337
  // ---------------------------------------------------------------------------
@@ -282,13 +373,29 @@ export async function runCodex(opts) {
282
373
  await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
283
374
  await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
284
375
  }
376
+ // Codex has no system-prompt flag, so fold the composed role + best-practice
377
+ // context into the prompt itself (Claude Code instead rides --append-system-prompt).
378
+ const prompt = opts.systemPrompt
379
+ ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
380
+ : opts.userPrompt;
381
+ // Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
382
+ // flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
383
+ // plus a cumulative total. It never exposes the request transcript or structured
384
+ // tool/command bodies, so the captured prompt is just the folded input — the response
385
+ // text + per-turn tokens are faithful; the request side is best-effort by design.
386
+ const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
387
+ const messages = [{ role: 'user', content: prompt }];
388
+ const calls = [];
389
+ let pendingText = '';
285
390
  const onEvent = (event) => {
286
391
  const type = typeof event.type === 'string' ? event.type : '';
287
- if (type.includes('agent_message') || type === 'item.completed') {
392
+ if (type.includes('agent_message') ||
393
+ (type === 'item.completed' && isCodexMessageItem(event))) {
288
394
  const text = extractText(event);
289
395
  if (text) {
290
396
  stats.assistantChars += text.length;
291
397
  summary = text;
398
+ pendingText = text;
292
399
  }
293
400
  }
294
401
  if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
@@ -300,12 +407,26 @@ export async function runCodex(opts) {
300
407
  const turnUsage = codexUsage(event);
301
408
  if (turnUsage)
302
409
  usage = turnUsage;
410
+ // A `token_count` event closes a model turn: pair its per-turn usage with the
411
+ // assistant text seen since the previous turn as one telemetry call.
412
+ const perTurn = codexLastTurnUsage(event);
413
+ if (perTurn) {
414
+ calls.push({
415
+ model: opts.model,
416
+ promptText: redactBody(JSON.stringify(messages), secrets),
417
+ messageCount: messages.length,
418
+ responseText: redactBody(pendingText, secrets),
419
+ reasoningText: '',
420
+ inputTokens: perTurn.inputTokens,
421
+ cachedInputTokens: perTurn.cachedInputTokens,
422
+ outputTokens: perTurn.outputTokens,
423
+ finishReason: null,
424
+ });
425
+ if (pendingText)
426
+ messages.push({ role: 'assistant', content: pendingText });
427
+ pendingText = '';
428
+ }
303
429
  };
304
- // Codex has no system-prompt flag, so fold the composed role + best-practice
305
- // context into the prompt itself (Claude Code instead rides --append-system-prompt).
306
- const prompt = opts.systemPrompt
307
- ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
308
- : opts.userPrompt;
309
430
  try {
310
431
  const { stderrTail } = await streamCli('codex', [
311
432
  'exec',
@@ -318,7 +439,28 @@ export async function runCodex(opts) {
318
439
  opts.model,
319
440
  '-',
320
441
  ], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
321
- return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
442
+ // Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
443
+ // single call from the cumulative total + final text so the run is still observable.
444
+ if (calls.length === 0 && (usage || summary)) {
445
+ calls.push({
446
+ model: opts.model,
447
+ promptText: redactBody(JSON.stringify(messages), secrets),
448
+ messageCount: messages.length,
449
+ responseText: redactBody(summary, secrets),
450
+ reasoningText: '',
451
+ inputTokens: usage?.inputTokens ?? 0,
452
+ cachedInputTokens: 0,
453
+ outputTokens: usage?.outputTokens ?? 0,
454
+ finishReason: null,
455
+ });
456
+ }
457
+ return {
458
+ summary,
459
+ stats,
460
+ stderrTail,
461
+ ...(usage ? { usage } : {}),
462
+ ...(calls.length ? { callMetrics: calls } : {}),
463
+ };
322
464
  }
323
465
  finally {
324
466
  // Never leave the decrypted credential on disk past the run.
@@ -326,6 +468,24 @@ export async function runCodex(opts) {
326
468
  await rm(codexHome, { recursive: true, force: true }).catch(() => { });
327
469
  }
328
470
  }
471
+ /**
472
+ * Whether a Codex `item.completed` event carries the model's ASSISTANT text (as
473
+ * opposed to a command/exec/tool/reasoning item, which also carry a `text` field —
474
+ * their command output or thinking — and must NOT be captured as the turn's response).
475
+ * A message item's kind contains `message` (`agent_message`/`assistant_message`); an
476
+ * item with no kind is treated as a message so older/simple shapes don't regress.
477
+ */
478
+ function isCodexMessageItem(event) {
479
+ const item = isObject(event.item) ? event.item : undefined;
480
+ if (!item)
481
+ return false;
482
+ const kind = typeof item.item_type === 'string'
483
+ ? item.item_type
484
+ : typeof item.type === 'string'
485
+ ? item.type
486
+ : '';
487
+ return kind === '' || /message/i.test(kind);
488
+ }
329
489
  /** Best-effort: pull a textual message out of a Codex event. */
330
490
  function extractText(event) {
331
491
  if (typeof event.message === 'string')
@@ -367,6 +527,8 @@ function codexPlanProgress(event) {
367
527
  * other shapes put it on `usage` / `info.usage` directly. We read the cumulative
368
528
  * total when present so the caller can simply overwrite (not sum) — summing
369
529
  * cumulative totals across events would multiply-count. Checked most-likely first.
530
+ * `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
531
+ * is a subset already inside it), so it is NOT summed with the cached share.
370
532
  */
371
533
  function codexUsage(event) {
372
534
  const info = isObject(event.info) ? event.info : undefined;
@@ -376,12 +538,31 @@ function codexUsage(event) {
376
538
  (info && isObject(info.usage) ? info.usage : undefined);
377
539
  if (!isObject(raw))
378
540
  return undefined;
379
- const input = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens);
541
+ const input = numberOf(raw.input_tokens);
380
542
  const output = numberOf(raw.output_tokens);
381
543
  if (input === 0 && output === 0)
382
544
  return undefined;
383
545
  return { inputTokens: input, outputTokens: output };
384
546
  }
547
+ /**
548
+ * Per-TURN Codex token usage off a `token_count` event's `info.last_token_usage` (the
549
+ * delta for the turn just completed, as opposed to `codexUsage`'s cumulative total).
550
+ * `input_tokens` is the total prompt count for the turn and already INCLUDES the cached
551
+ * share (OpenAI semantics), so `cachedInputTokens` is surfaced as the subset it is —
552
+ * NOT added on top (adding it would double-count every cached token).
553
+ */
554
+ function codexLastTurnUsage(event) {
555
+ const info = isObject(event.info) ? event.info : undefined;
556
+ const raw = info && isObject(info.last_token_usage) ? info.last_token_usage : undefined;
557
+ if (!isObject(raw))
558
+ return undefined;
559
+ const input = numberOf(raw.input_tokens);
560
+ const cached = numberOf(raw.cached_input_tokens);
561
+ const output = numberOf(raw.output_tokens);
562
+ if (input === 0 && output === 0)
563
+ return undefined;
564
+ return { inputTokens: input, cachedInputTokens: cached, outputTokens: output };
565
+ }
385
566
  function numberOf(value) {
386
567
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
387
568
  }
package/dist/agent.js CHANGED
@@ -341,7 +341,7 @@ async function runExploreMode(job, opts) {
341
341
  try {
342
342
  opts.onPhase?.('agent');
343
343
  logger.info('agent(explore): running agent', { serviceDirectory });
344
- const { summary, stats, stderrTail, usage, diagnostics: runDiag, } = await runAgentInWorkspace({
344
+ const { summary, stats, stderrTail, usage, callMetrics, diagnostics: runDiag, } = await runAgentInWorkspace({
345
345
  dir: workDir,
346
346
  systemPrompt: job.systemPrompt,
347
347
  userPrompt,
@@ -368,6 +368,7 @@ async function runExploreMode(job, opts) {
368
368
  error: noOutputReason(stats, stderrTail),
369
369
  failureCause: 'no-usable-output',
370
370
  ...(usage ? { usage } : {}),
371
+ ...(callMetrics ? { callMetrics } : {}),
371
372
  ...infraSetupFields,
372
373
  };
373
374
  }
@@ -384,6 +385,7 @@ async function runExploreMode(job, opts) {
384
385
  error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
385
386
  failureCause: 'no-usable-output',
386
387
  ...(usage ? { usage } : {}),
388
+ ...(callMetrics ? { callMetrics } : {}),
387
389
  ...infraSetupFields,
388
390
  };
389
391
  }
@@ -391,7 +393,13 @@ async function runExploreMode(job, opts) {
391
393
  // Prose: the summary IS the deliverable.
392
394
  if (job.output?.kind !== 'structured') {
393
395
  logger.info('agent(explore): done (prose)', { ...stats });
394
- return { summary, stats, ...(usage ? { usage } : {}), ...infraSetupFields };
396
+ return {
397
+ summary,
398
+ stats,
399
+ ...(usage ? { usage } : {}),
400
+ ...(callMetrics ? { callMetrics } : {}),
401
+ ...infraSetupFields,
402
+ };
395
403
  }
396
404
  // Structured: parse the agent's JSON. With repair enabled (default) a malformed
397
405
  // reply gets ONE structured repair call before giving up; with `repair:false` we
@@ -432,6 +440,7 @@ async function runExploreMode(job, opts) {
432
440
  error: noStructuredReason(stats, stderrTail, diagnostics),
433
441
  failureCause: 'no-usable-output',
434
442
  ...(usage ? { usage } : {}),
443
+ ...(callMetrics ? { callMetrics } : {}),
435
444
  ...infraSetupFields,
436
445
  };
437
446
  }
@@ -451,7 +460,14 @@ async function runExploreMode(job, opts) {
451
460
  custom.environment = reportedEnvironment;
452
461
  }
453
462
  logger.info('agent(explore): done (structured)', { ...stats });
454
- return { summary, custom, stats, ...(usage ? { usage } : {}), ...infraSetupFields };
463
+ return {
464
+ summary,
465
+ custom,
466
+ stats,
467
+ ...(usage ? { usage } : {}),
468
+ ...(callMetrics ? { callMetrics } : {}),
469
+ ...infraSetupFields,
470
+ };
455
471
  }
456
472
  finally {
457
473
  if (managed)
@@ -477,7 +493,7 @@ async function runCodingMode(job, opts) {
477
493
  if (job.mergeBase)
478
494
  return runConflictResolution(job, opts);
479
495
  const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
480
- const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent({
496
+ const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
481
497
  kind: 'agent',
482
498
  jobId: job.jobId,
483
499
  repo: job.repo,
@@ -504,7 +520,14 @@ async function runCodingMode(job, opts) {
504
520
  if (!pushed) {
505
521
  // A no-op: a failure for the implementer, a clean non-event for the fixers.
506
522
  if (job.noChangesIsError === false) {
507
- return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
523
+ return {
524
+ pushed: false,
525
+ branch: pushBranch,
526
+ summary,
527
+ stats,
528
+ ...(usage ? { usage } : {}),
529
+ ...(callMetrics ? { callMetrics } : {}),
530
+ };
508
531
  }
509
532
  return {
510
533
  pushed: false,
@@ -514,6 +537,7 @@ async function runCodingMode(job, opts) {
514
537
  error: noChangesReason('the agent produced no file changes', stats, stderrTail),
515
538
  failureCause: 'no-changes',
516
539
  ...(usage ? { usage } : {}),
540
+ ...(callMetrics ? { callMetrics } : {}),
517
541
  };
518
542
  }
519
543
  // Changes are on the branch. Open a PR only when the job asked for one.
@@ -539,7 +563,14 @@ async function runCodingMode(job, opts) {
539
563
  // this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
540
564
  if (prUrl === null) {
541
565
  if (job.noChangesIsError === false) {
542
- return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
566
+ return {
567
+ pushed: false,
568
+ branch: pushBranch,
569
+ summary,
570
+ stats,
571
+ ...(usage ? { usage } : {}),
572
+ ...(callMetrics ? { callMetrics } : {}),
573
+ };
543
574
  }
544
575
  return {
545
576
  pushed: false,
@@ -549,11 +580,27 @@ async function runCodingMode(job, opts) {
549
580
  error: noChangesReason('the work branch has no commits ahead of its base (nothing to open a PR for)', stats, stderrTail),
550
581
  failureCause: 'no-changes',
551
582
  ...(usage ? { usage } : {}),
583
+ ...(callMetrics ? { callMetrics } : {}),
552
584
  };
553
585
  }
554
- return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
586
+ return {
587
+ pushed: true,
588
+ prUrl,
589
+ branch: pushBranch,
590
+ summary,
591
+ stats,
592
+ ...(usage ? { usage } : {}),
593
+ ...(callMetrics ? { callMetrics } : {}),
594
+ };
555
595
  }
556
- return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
596
+ return {
597
+ pushed: true,
598
+ branch: pushBranch,
599
+ summary,
600
+ stats,
601
+ ...(usage ? { usage } : {}),
602
+ ...(callMetrics ? { callMetrics } : {}),
603
+ };
557
604
  }
558
605
  /**
559
606
  * Conflict-resolution coding flow (the conflict-resolver): clone the PR head `branch`
@@ -617,7 +664,7 @@ async function runConflictResolution(job, opts) {
617
664
  logger.info('agent(conflict): resolving conflicts with agent', { conflicted });
618
665
  const diff = await conflictDiff(dir, conflicted, signal);
619
666
  const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt);
620
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
667
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
621
668
  dir,
622
669
  systemPrompt: job.systemPrompt,
623
670
  userPrompt,
@@ -646,6 +693,7 @@ async function runConflictResolution(job, opts) {
646
693
  error: unresolvedReason(unresolved, stats, stderrTail),
647
694
  failureCause: 'agent',
648
695
  ...(usage ? { usage } : {}),
696
+ ...(callMetrics ? { callMetrics } : {}),
649
697
  };
650
698
  }
651
699
  // Complete the merge commit with the agent's resolution staged, then push.
@@ -653,7 +701,14 @@ async function runConflictResolution(job, opts) {
653
701
  opts.onPhase?.('push');
654
702
  logger.info('agent(conflict): pushing resolved branch', { ...stats });
655
703
  await pushBranch(dir, job.branch, job.ghToken, signal);
656
- return { pushed: true, branch: job.branch, summary, stats, ...(usage ? { usage } : {}) };
704
+ return {
705
+ pushed: true,
706
+ branch: job.branch,
707
+ summary,
708
+ stats,
709
+ ...(usage ? { usage } : {}),
710
+ ...(callMetrics ? { callMetrics } : {}),
711
+ };
657
712
  });
658
713
  }
659
714
  /**
@@ -729,7 +784,7 @@ async function runBootstrap(job, opts) {
729
784
  }
730
785
  opts.onPhase?.('agent');
731
786
  logger.info('agent(bootstrap): running agent');
732
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
787
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
733
788
  dir,
734
789
  systemPrompt: job.systemPrompt,
735
790
  userPrompt: job.userPrompt,
@@ -749,7 +804,14 @@ async function runBootstrap(job, opts) {
749
804
  if (!(await producedRepoContent(dir, !fromScratch, signal))) {
750
805
  const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail);
751
806
  logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats });
752
- return { summary, stats, error, failureCause: 'agent', ...(usage ? { usage } : {}) };
807
+ return {
808
+ summary,
809
+ stats,
810
+ error,
811
+ failureCause: 'agent',
812
+ ...(usage ? { usage } : {}),
813
+ ...(callMetrics ? { callMetrics } : {}),
814
+ };
753
815
  }
754
816
  opts.onPhase?.('push');
755
817
  logger.info('agent(bootstrap): pushing bootstrapped contents', { ...stats });
@@ -764,7 +826,13 @@ async function runBootstrap(job, opts) {
764
826
  : `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
765
827
  });
766
828
  logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch });
767
- return { defaultBranch: boot.target.defaultBranch, summary, stats, ...(usage ? { usage } : {}) };
829
+ return {
830
+ defaultBranch: boot.target.defaultBranch,
831
+ summary,
832
+ stats,
833
+ ...(usage ? { usage } : {}),
834
+ ...(callMetrics ? { callMetrics } : {}),
835
+ };
768
836
  });
769
837
  }
770
838
  /**
@@ -195,7 +195,7 @@ export async function runCodingAgent(spec, opts = {}) {
195
195
  try {
196
196
  opts.onPhase?.('agent');
197
197
  logger.info('coding-agent: running agent', { serviceDirectory });
198
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
198
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
199
199
  dir: workDir,
200
200
  systemPrompt: spec.systemPrompt,
201
201
  userPrompt: spec.userPrompt,
@@ -265,6 +265,7 @@ export async function runCodingAgent(spec, opts = {}) {
265
265
  stats,
266
266
  ...(stderrTail ? { stderrTail } : {}),
267
267
  ...(usage ? { usage } : {}),
268
+ ...(callMetrics ? { callMetrics } : {}),
268
269
  };
269
270
  }
270
271
  else {
@@ -278,6 +279,7 @@ export async function runCodingAgent(spec, opts = {}) {
278
279
  stats,
279
280
  ...(stderrTail ? { stderrTail } : {}),
280
281
  ...(usage ? { usage } : {}),
282
+ ...(callMetrics ? { callMetrics } : {}),
281
283
  };
282
284
  }
283
285
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cat-factory/executor-harness",
3
- "version": "1.31.12",
3
+ "version": "1.32.0",
4
4
  "description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
5
5
  "repository": {
6
6
  "type": "git",
@@ -26,8 +26,8 @@
26
26
  "hono": "^4.12.27",
27
27
  "typescript": "^6.0.3",
28
28
  "vitest": "^4.1.9",
29
- "@cat-factory/spend": "0.10.73",
30
- "@cat-factory/server": "0.67.0"
29
+ "@cat-factory/server": "0.68.0",
30
+ "@cat-factory/spend": "0.10.74"
31
31
  },
32
32
  "scripts": {
33
33
  "build": "tsc -p tsconfig.json",
@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process'
2
2
  import { mkdtemp, rm, writeFile } from 'node:fs/promises'
3
3
  import { tmpdir } from 'node:os'
4
4
  import { join } from 'node:path'
5
- import type { PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
5
+ import type { HarnessCallMetric, PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
6
6
  import { killChildProcess, spawnDetached } from './process.js'
7
7
  import { redact, secretsToRedact } from './redact.js'
8
8
 
@@ -64,6 +64,29 @@ function isObject(value: unknown): value is Record<string, unknown> {
64
64
  return typeof value === 'object' && value !== null
65
65
  }
66
66
 
67
+ /** Scrub any leased-credential occurrences from a telemetry body (no-op when none). */
68
+ function redactBody(text: string, secrets: string[]): string {
69
+ return secrets.length ? redact(text, secrets) : text
70
+ }
71
+
72
+ /**
73
+ * Fallback token attribution: if a CLI reported a cumulative total but no per-turn
74
+ * usage (so every captured call has zero tokens), pin the whole total onto the LAST
75
+ * call rather than dropping it — the run's tokens are still accounted, just not split
76
+ * per turn. A no-op when the calls already carry per-turn tokens.
77
+ */
78
+ function attributeCumulativeUsage(
79
+ calls: HarnessCallMetric[],
80
+ usage: { inputTokens: number; outputTokens: number } | undefined,
81
+ ): void {
82
+ if (!usage || calls.length === 0) return
83
+ const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0)
84
+ if (anyTokens) return
85
+ const last = calls[calls.length - 1]!
86
+ last.inputTokens = usage.inputTokens
87
+ last.outputTokens = usage.outputTokens
88
+ }
89
+
67
90
  /**
68
91
  * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
69
92
  * through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
@@ -184,25 +207,59 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
184
207
  let summary = ''
185
208
  let usage: { inputTokens: number; outputTokens: number } | undefined
186
209
 
210
+ // Reconstruct the full per-call request/response bodies for telemetry from the
211
+ // stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
212
+ // Anthropic Messages envelope, so `assistant` events carry the complete response
213
+ // (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
214
+ // fed back — together the growing prompt transcript. We seed it with the two inputs
215
+ // the harness supplies (they never appear in the stream): the system + first user
216
+ // message. Bodies are credential-scrubbed (they can echo the leased token).
217
+ const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
218
+ const messages: Array<{ role: string; content: unknown }> = [
219
+ { role: 'system', content: opts.systemPrompt },
220
+ { role: 'user', content: opts.userPrompt },
221
+ ]
222
+ const calls: HarnessCallMetric[] = []
223
+
187
224
  const onEvent = (event: Record<string, unknown>): void => {
188
225
  const type = event.type
189
226
  if (type === 'assistant' && isObject(event.message)) {
190
- const content = (event.message as Record<string, unknown>).content
191
- if (Array.isArray(content)) {
192
- for (const block of content) {
193
- if (!isObject(block)) continue
194
- if (block.type === 'text' && typeof block.text === 'string') {
195
- stats.assistantChars += block.text.length
196
- }
197
- if (block.type === 'tool_use') {
198
- stats.toolCalls += 1
199
- if (block.name === 'TodoWrite' && opts.onProgress) {
200
- const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
201
- if (progress) opts.onProgress(progress)
202
- }
203
- }
227
+ const message = event.message as Record<string, unknown>
228
+ const content = Array.isArray(message.content) ? message.content : []
229
+ const { text, reasoning, toolUses } = claudeAssistantContent(content)
230
+ stats.assistantChars += text.length
231
+ stats.toolCalls += toolUses
232
+ for (const block of content) {
233
+ if (
234
+ isObject(block) &&
235
+ block.type === 'tool_use' &&
236
+ block.name === 'TodoWrite' &&
237
+ opts.onProgress
238
+ ) {
239
+ const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
240
+ if (progress) opts.onProgress(progress)
204
241
  }
205
242
  }
243
+ // Record this call BEFORE appending its turn: the prompt is the history that
244
+ // produced this response. The append-only array keeps each call's prompt a strict
245
+ // prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
246
+ const u = claudeCallUsage(message.usage)
247
+ calls.push({
248
+ ...(typeof message.model === 'string' ? { model: message.model } : {}),
249
+ promptText: redactBody(JSON.stringify(messages), secrets),
250
+ messageCount: messages.length,
251
+ responseText: redactBody(text, secrets),
252
+ reasoningText: redactBody(reasoning, secrets),
253
+ inputTokens: u.inputTokens,
254
+ cachedInputTokens: u.cachedInputTokens,
255
+ outputTokens: u.outputTokens,
256
+ finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
257
+ })
258
+ messages.push({ role: 'assistant', content })
259
+ } else if (type === 'user' && isObject(event.message)) {
260
+ // tool_result blocks the harness fed back to the model — part of the next prompt.
261
+ const content = (event.message as Record<string, unknown>).content
262
+ if (Array.isArray(content)) messages.push({ role: 'tool', content })
206
263
  } else if (type === 'result') {
207
264
  if (typeof event.result === 'string') summary = event.result
208
265
  usage = claudeUsage(event.usage) ?? usage
@@ -282,7 +339,14 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
282
339
  onEvent,
283
340
  )
284
341
 
285
- return { summary, stats, stderrTail, ...(usage ? { usage } : {}) }
342
+ attributeCumulativeUsage(calls, usage)
343
+ return {
344
+ summary,
345
+ stats,
346
+ stderrTail,
347
+ ...(usage ? { usage } : {}),
348
+ ...(calls.length ? { callMetrics: calls } : {}),
349
+ }
286
350
  } finally {
287
351
  // Never leave the config dir (and any cached credential) on disk past the run.
288
352
  if (configHome) await rm(configHome, { recursive: true, force: true }).catch(() => {})
@@ -322,6 +386,44 @@ function claudeUsage(raw: unknown): { inputTokens: number; outputTokens: number
322
386
  return { inputTokens: input, outputTokens: output }
323
387
  }
324
388
 
389
+ /** Pull the text + reasoning out of a Claude `assistant` message's content blocks. */
390
+ function claudeAssistantContent(content: unknown[]): {
391
+ text: string
392
+ reasoning: string
393
+ toolUses: number
394
+ } {
395
+ let text = ''
396
+ let reasoning = ''
397
+ let toolUses = 0
398
+ for (const block of content) {
399
+ if (!isObject(block)) continue
400
+ if (block.type === 'text' && typeof block.text === 'string') text += block.text
401
+ else if (block.type === 'thinking' && typeof block.thinking === 'string')
402
+ reasoning += block.thinking
403
+ else if (block.type === 'tool_use') toolUses += 1
404
+ }
405
+ return { text, reasoning, toolUses }
406
+ }
407
+
408
+ /**
409
+ * Per-CALL token usage off a Claude `assistant` message's `usage` (this turn only, not
410
+ * the cumulative `result` total). `inputTokens` counts every billed input bucket (fresh
411
+ * + both cache buckets); `cachedInputTokens` is the cache share, surfaced separately.
412
+ */
413
+ function claudeCallUsage(raw: unknown): {
414
+ inputTokens: number
415
+ cachedInputTokens: number
416
+ outputTokens: number
417
+ } {
418
+ if (!isObject(raw)) return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 }
419
+ const cached = numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens)
420
+ return {
421
+ inputTokens: numberOf(raw.input_tokens) + cached,
422
+ cachedInputTokens: cached,
423
+ outputTokens: numberOf(raw.output_tokens),
424
+ }
425
+ }
426
+
325
427
  // ---------------------------------------------------------------------------
326
428
  // Codex
327
429
  // ---------------------------------------------------------------------------
@@ -366,13 +468,33 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
366
468
  await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8')
367
469
  }
368
470
 
471
+ // Codex has no system-prompt flag, so fold the composed role + best-practice
472
+ // context into the prompt itself (Claude Code instead rides --append-system-prompt).
473
+ const prompt = opts.systemPrompt
474
+ ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
475
+ : opts.userPrompt
476
+
477
+ // Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
478
+ // flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
479
+ // plus a cumulative total. It never exposes the request transcript or structured
480
+ // tool/command bodies, so the captured prompt is just the folded input — the response
481
+ // text + per-turn tokens are faithful; the request side is best-effort by design.
482
+ const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
483
+ const messages: Array<{ role: string; content: unknown }> = [{ role: 'user', content: prompt }]
484
+ const calls: HarnessCallMetric[] = []
485
+ let pendingText = ''
486
+
369
487
  const onEvent = (event: Record<string, unknown>): void => {
370
488
  const type = typeof event.type === 'string' ? event.type : ''
371
- if (type.includes('agent_message') || type === 'item.completed') {
489
+ if (
490
+ type.includes('agent_message') ||
491
+ (type === 'item.completed' && isCodexMessageItem(event))
492
+ ) {
372
493
  const text = extractText(event)
373
494
  if (text) {
374
495
  stats.assistantChars += text.length
375
496
  summary = text
497
+ pendingText = text
376
498
  }
377
499
  }
378
500
  if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
@@ -382,14 +504,26 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
382
504
  if (progress && opts.onProgress) opts.onProgress(progress)
383
505
  const turnUsage = codexUsage(event)
384
506
  if (turnUsage) usage = turnUsage
507
+ // A `token_count` event closes a model turn: pair its per-turn usage with the
508
+ // assistant text seen since the previous turn as one telemetry call.
509
+ const perTurn = codexLastTurnUsage(event)
510
+ if (perTurn) {
511
+ calls.push({
512
+ model: opts.model,
513
+ promptText: redactBody(JSON.stringify(messages), secrets),
514
+ messageCount: messages.length,
515
+ responseText: redactBody(pendingText, secrets),
516
+ reasoningText: '',
517
+ inputTokens: perTurn.inputTokens,
518
+ cachedInputTokens: perTurn.cachedInputTokens,
519
+ outputTokens: perTurn.outputTokens,
520
+ finishReason: null,
521
+ })
522
+ if (pendingText) messages.push({ role: 'assistant', content: pendingText })
523
+ pendingText = ''
524
+ }
385
525
  }
386
526
 
387
- // Codex has no system-prompt flag, so fold the composed role + best-practice
388
- // context into the prompt itself (Claude Code instead rides --append-system-prompt).
389
- const prompt = opts.systemPrompt
390
- ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
391
- : opts.userPrompt
392
-
393
527
  try {
394
528
  const { stderrTail } = await streamCli(
395
529
  'codex',
@@ -411,13 +545,53 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
411
545
  onEvent,
412
546
  )
413
547
 
414
- return { summary, stats, stderrTail, ...(usage ? { usage } : {}) }
548
+ // Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
549
+ // single call from the cumulative total + final text so the run is still observable.
550
+ if (calls.length === 0 && (usage || summary)) {
551
+ calls.push({
552
+ model: opts.model,
553
+ promptText: redactBody(JSON.stringify(messages), secrets),
554
+ messageCount: messages.length,
555
+ responseText: redactBody(summary, secrets),
556
+ reasoningText: '',
557
+ inputTokens: usage?.inputTokens ?? 0,
558
+ cachedInputTokens: 0,
559
+ outputTokens: usage?.outputTokens ?? 0,
560
+ finishReason: null,
561
+ })
562
+ }
563
+ return {
564
+ summary,
565
+ stats,
566
+ stderrTail,
567
+ ...(usage ? { usage } : {}),
568
+ ...(calls.length ? { callMetrics: calls } : {}),
569
+ }
415
570
  } finally {
416
571
  // Never leave the decrypted credential on disk past the run.
417
572
  if (codexHome) await rm(codexHome, { recursive: true, force: true }).catch(() => {})
418
573
  }
419
574
  }
420
575
 
576
+ /**
577
+ * Whether a Codex `item.completed` event carries the model's ASSISTANT text (as
578
+ * opposed to a command/exec/tool/reasoning item, which also carry a `text` field —
579
+ * their command output or thinking — and must NOT be captured as the turn's response).
580
+ * A message item's kind contains `message` (`agent_message`/`assistant_message`); an
581
+ * item with no kind is treated as a message so older/simple shapes don't regress.
582
+ */
583
+ function isCodexMessageItem(event: Record<string, unknown>): boolean {
584
+ const item = isObject(event.item) ? (event.item as Record<string, unknown>) : undefined
585
+ if (!item) return false
586
+ const kind =
587
+ typeof item.item_type === 'string'
588
+ ? item.item_type
589
+ : typeof item.type === 'string'
590
+ ? item.type
591
+ : ''
592
+ return kind === '' || /message/i.test(kind)
593
+ }
594
+
421
595
  /** Best-effort: pull a textual message out of a Codex event. */
422
596
  function extractText(event: Record<string, unknown>): string | undefined {
423
597
  if (typeof event.message === 'string') return event.message
@@ -456,6 +630,8 @@ function codexPlanProgress(event: Record<string, unknown>): TodoProgress | undef
456
630
  * other shapes put it on `usage` / `info.usage` directly. We read the cumulative
457
631
  * total when present so the caller can simply overwrite (not sum) — summing
458
632
  * cumulative totals across events would multiply-count. Checked most-likely first.
633
+ * `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
634
+ * is a subset already inside it), so it is NOT summed with the cached share.
459
635
  */
460
636
  function codexUsage(
461
637
  event: Record<string, unknown>,
@@ -467,12 +643,36 @@ function codexUsage(
467
643
  (isObject(event.usage) ? event.usage : undefined) ??
468
644
  (info && isObject(info.usage) ? info.usage : undefined)
469
645
  if (!isObject(raw)) return undefined
470
- const input = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens)
646
+ const input = numberOf(raw.input_tokens)
471
647
  const output = numberOf(raw.output_tokens)
472
648
  if (input === 0 && output === 0) return undefined
473
649
  return { inputTokens: input, outputTokens: output }
474
650
  }
475
651
 
652
+ /**
653
+ * Per-TURN Codex token usage off a `token_count` event's `info.last_token_usage` (the
654
+ * delta for the turn just completed, as opposed to `codexUsage`'s cumulative total).
655
+ * `input_tokens` is the total prompt count for the turn and already INCLUDES the cached
656
+ * share (OpenAI semantics), so `cachedInputTokens` is surfaced as the subset it is —
657
+ * NOT added on top (adding it would double-count every cached token).
658
+ */
659
+ function codexLastTurnUsage(event: Record<string, unknown>):
660
+ | {
661
+ inputTokens: number
662
+ cachedInputTokens: number
663
+ outputTokens: number
664
+ }
665
+ | undefined {
666
+ const info = isObject(event.info) ? (event.info as Record<string, unknown>) : undefined
667
+ const raw = info && isObject(info.last_token_usage) ? info.last_token_usage : undefined
668
+ if (!isObject(raw)) return undefined
669
+ const input = numberOf(raw.input_tokens)
670
+ const cached = numberOf(raw.cached_input_tokens)
671
+ const output = numberOf(raw.output_tokens)
672
+ if (input === 0 && output === 0) return undefined
673
+ return { inputTokens: input, cachedInputTokens: cached, outputTokens: output }
674
+ }
675
+
476
676
  function numberOf(value: unknown): number {
477
677
  return typeof value === 'number' && Number.isFinite(value) ? value : 0
478
678
  }
package/src/agent.ts CHANGED
@@ -421,6 +421,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
421
421
  stats,
422
422
  stderrTail,
423
423
  usage,
424
+ callMetrics,
424
425
  diagnostics: runDiag,
425
426
  } = await runAgentInWorkspace(
426
427
  {
@@ -453,6 +454,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
453
454
  error: noOutputReason(stats, stderrTail),
454
455
  failureCause: 'no-usable-output',
455
456
  ...(usage ? { usage } : {}),
457
+ ...(callMetrics ? { callMetrics } : {}),
456
458
  ...infraSetupFields,
457
459
  }
458
460
  }
@@ -470,6 +472,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
470
472
  error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
471
473
  failureCause: 'no-usable-output',
472
474
  ...(usage ? { usage } : {}),
475
+ ...(callMetrics ? { callMetrics } : {}),
473
476
  ...infraSetupFields,
474
477
  }
475
478
  }
@@ -478,7 +481,13 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
478
481
  // Prose: the summary IS the deliverable.
479
482
  if (job.output?.kind !== 'structured') {
480
483
  logger.info('agent(explore): done (prose)', { ...stats })
481
- return { summary, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
484
+ return {
485
+ summary,
486
+ stats,
487
+ ...(usage ? { usage } : {}),
488
+ ...(callMetrics ? { callMetrics } : {}),
489
+ ...infraSetupFields,
490
+ }
482
491
  }
483
492
 
484
493
  // Structured: parse the agent's JSON. With repair enabled (default) a malformed
@@ -522,6 +531,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
522
531
  error: noStructuredReason(stats, stderrTail, diagnostics),
523
532
  failureCause: 'no-usable-output',
524
533
  ...(usage ? { usage } : {}),
534
+ ...(callMetrics ? { callMetrics } : {}),
525
535
  ...infraSetupFields,
526
536
  }
527
537
  }
@@ -540,7 +550,14 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
540
550
  ;(custom as Record<string, unknown>).environment = reportedEnvironment
541
551
  }
542
552
  logger.info('agent(explore): done (structured)', { ...stats })
543
- return { summary, custom, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
553
+ return {
554
+ summary,
555
+ custom,
556
+ stats,
557
+ ...(usage ? { usage } : {}),
558
+ ...(callMetrics ? { callMetrics } : {}),
559
+ ...infraSetupFields,
560
+ }
544
561
  } finally {
545
562
  if (managed) await managed.cleanup()
546
563
  }
@@ -565,7 +582,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
565
582
  if (job.mergeBase) return runConflictResolution(job, opts)
566
583
 
567
584
  const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
568
- const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent(
585
+ const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
569
586
  {
570
587
  kind: 'agent',
571
588
  jobId: job.jobId,
@@ -596,7 +613,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
596
613
  if (!pushed) {
597
614
  // A no-op: a failure for the implementer, a clean non-event for the fixers.
598
615
  if (job.noChangesIsError === false) {
599
- return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
616
+ return {
617
+ pushed: false,
618
+ branch: pushBranch,
619
+ summary,
620
+ stats,
621
+ ...(usage ? { usage } : {}),
622
+ ...(callMetrics ? { callMetrics } : {}),
623
+ }
600
624
  }
601
625
  return {
602
626
  pushed: false,
@@ -606,6 +630,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
606
630
  error: noChangesReason('the agent produced no file changes', stats, stderrTail),
607
631
  failureCause: 'no-changes',
608
632
  ...(usage ? { usage } : {}),
633
+ ...(callMetrics ? { callMetrics } : {}),
609
634
  }
610
635
  }
611
636
 
@@ -632,7 +657,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
632
657
  // this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
633
658
  if (prUrl === null) {
634
659
  if (job.noChangesIsError === false) {
635
- return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
660
+ return {
661
+ pushed: false,
662
+ branch: pushBranch,
663
+ summary,
664
+ stats,
665
+ ...(usage ? { usage } : {}),
666
+ ...(callMetrics ? { callMetrics } : {}),
667
+ }
636
668
  }
637
669
  return {
638
670
  pushed: false,
@@ -646,11 +678,27 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
646
678
  ),
647
679
  failureCause: 'no-changes',
648
680
  ...(usage ? { usage } : {}),
681
+ ...(callMetrics ? { callMetrics } : {}),
649
682
  }
650
683
  }
651
- return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
684
+ return {
685
+ pushed: true,
686
+ prUrl,
687
+ branch: pushBranch,
688
+ summary,
689
+ stats,
690
+ ...(usage ? { usage } : {}),
691
+ ...(callMetrics ? { callMetrics } : {}),
692
+ }
693
+ }
694
+ return {
695
+ pushed: true,
696
+ branch: pushBranch,
697
+ summary,
698
+ stats,
699
+ ...(usage ? { usage } : {}),
700
+ ...(callMetrics ? { callMetrics } : {}),
652
701
  }
653
- return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
654
702
  }
655
703
 
656
704
  /**
@@ -719,7 +767,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
719
767
  const diff = await conflictDiff(dir, conflicted, signal)
720
768
  const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt)
721
769
 
722
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
770
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
723
771
  {
724
772
  dir,
725
773
  systemPrompt: job.systemPrompt,
@@ -752,6 +800,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
752
800
  error: unresolvedReason(unresolved, stats, stderrTail),
753
801
  failureCause: 'agent',
754
802
  ...(usage ? { usage } : {}),
803
+ ...(callMetrics ? { callMetrics } : {}),
755
804
  }
756
805
  }
757
806
  // Complete the merge commit with the agent's resolution staged, then push.
@@ -759,7 +808,14 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
759
808
  opts.onPhase?.('push')
760
809
  logger.info('agent(conflict): pushing resolved branch', { ...stats })
761
810
  await pushBranch(dir, job.branch, job.ghToken, signal)
762
- return { pushed: true, branch: job.branch, summary, stats, ...(usage ? { usage } : {}) }
811
+ return {
812
+ pushed: true,
813
+ branch: job.branch,
814
+ summary,
815
+ stats,
816
+ ...(usage ? { usage } : {}),
817
+ ...(callMetrics ? { callMetrics } : {}),
818
+ }
763
819
  })
764
820
  }
765
821
 
@@ -850,7 +906,7 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
850
906
 
851
907
  opts.onPhase?.('agent')
852
908
  logger.info('agent(bootstrap): running agent')
853
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
909
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
854
910
  {
855
911
  dir,
856
912
  systemPrompt: job.systemPrompt,
@@ -874,7 +930,14 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
874
930
  if (!(await producedRepoContent(dir, !fromScratch, signal))) {
875
931
  const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail)
876
932
  logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats })
877
- return { summary, stats, error, failureCause: 'agent', ...(usage ? { usage } : {}) }
933
+ return {
934
+ summary,
935
+ stats,
936
+ error,
937
+ failureCause: 'agent',
938
+ ...(usage ? { usage } : {}),
939
+ ...(callMetrics ? { callMetrics } : {}),
940
+ }
878
941
  }
879
942
 
880
943
  opts.onPhase?.('push')
@@ -890,7 +953,13 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
890
953
  : `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
891
954
  })
892
955
  logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch })
893
- return { defaultBranch: boot.target.defaultBranch, summary, stats, ...(usage ? { usage } : {}) }
956
+ return {
957
+ defaultBranch: boot.target.defaultBranch,
958
+ summary,
959
+ stats,
960
+ ...(usage ? { usage } : {}),
961
+ ...(callMetrics ? { callMetrics } : {}),
962
+ }
894
963
  })
895
964
  }
896
965
 
@@ -17,7 +17,7 @@ import {
17
17
  remoteBranchExists,
18
18
  } from './git.js'
19
19
  import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js'
20
- import type { PiRunStats } from './pi.js'
20
+ import type { HarnessCallMetric, PiRunStats } from './pi.js'
21
21
  import {
22
22
  acquireRepoCheckout,
23
23
  agentNeverActed,
@@ -89,6 +89,8 @@ export interface CodingAgentOutcome {
89
89
  stderrTail?: string
90
90
  /** Token usage from a subscription harness's CLI stream (absent for Pi). */
91
91
  usage?: { inputTokens: number; outputTokens: number }
92
+ /** Per-model-call telemetry from a subscription harness's CLI stream (absent for Pi). */
93
+ callMetrics?: HarnessCallMetric[]
92
94
  }
93
95
 
94
96
  /**
@@ -296,7 +298,7 @@ export async function runCodingAgent(
296
298
  try {
297
299
  opts.onPhase?.('agent')
298
300
  logger.info('coding-agent: running agent', { serviceDirectory })
299
- const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
301
+ const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
300
302
  {
301
303
  dir: workDir,
302
304
  systemPrompt: spec.systemPrompt,
@@ -371,6 +373,7 @@ export async function runCodingAgent(
371
373
  stats,
372
374
  ...(stderrTail ? { stderrTail } : {}),
373
375
  ...(usage ? { usage } : {}),
376
+ ...(callMetrics ? { callMetrics } : {}),
374
377
  }
375
378
  } else {
376
379
  opts.onPhase?.('push')
@@ -383,6 +386,7 @@ export async function runCodingAgent(
383
386
  stats,
384
387
  ...(stderrTail ? { stderrTail } : {}),
385
388
  ...(usage ? { usage } : {}),
389
+ ...(callMetrics ? { callMetrics } : {}),
386
390
  }
387
391
  }
388
392
  } finally {
package/src/job.ts CHANGED
@@ -1,4 +1,4 @@
1
- import type { PiRunStats } from './pi.js'
1
+ import type { HarnessCallMetric, PiRunStats } from './pi.js'
2
2
  import type { HarnessKind } from './pi-workspace.js'
3
3
  import type { FailureCause } from './failure.js'
4
4
 
@@ -529,6 +529,12 @@ export interface AgentResult {
529
529
  */
530
530
  failureCause?: FailureCause
531
531
  usage?: { inputTokens: number; outputTokens: number }
532
+ /**
533
+ * Per-model-call telemetry from a subscription harness's CLI stream (absent for the
534
+ * proxy-metered Pi harness). The backend records these into `llm_call_metrics`. See
535
+ * {@link HarnessCallMetric}.
536
+ */
537
+ callMetrics?: HarnessCallMetric[]
532
538
  }
533
539
 
534
540
  /** Parse the coding-mode bootstrap spec, or undefined when absent. Validates the target. */
package/src/pi.ts CHANGED
@@ -414,6 +414,38 @@ export interface RunDiagnostics {
414
414
  finalAnswerEmpty: boolean
415
415
  }
416
416
 
417
+ /**
418
+ * One model call captured from a subscription harness's CLI event stream, shaped so
419
+ * the backend can record it into the same `llm_call_metrics` telemetry the LLM proxy
420
+ * writes for the Pi harness. The subscription harnesses (Claude Code / Codex) talk
421
+ * DIRECT to the vendor and never touch the proxy, so this is the only place their
422
+ * per-call bodies are observable. Claude Code's `stream-json --verbose` is a near-
423
+ * verbatim Anthropic Messages stream, so its calls carry full request/response
424
+ * bodies; Codex's `exec --json` only surfaces flat assistant text + per-turn tokens,
425
+ * so its rows are honestly thinner (no request transcript, no tool/command bodies).
426
+ */
427
+ export interface HarnessCallMetric {
428
+ /** The vendor model that served this call (from the CLI event), when reported. */
429
+ model?: string
430
+ /**
431
+ * The full request as an OpenAI-style chat array (`[{role, content}, …]`),
432
+ * JSON-stringified — the growing history as of this call. Matches the proxy's
433
+ * `promptText` shape so the telemetry chain delta-compresses + renders identically.
434
+ */
435
+ promptText: string
436
+ /** Number of messages encoded in {@link promptText} (the telemetry chain messageCount). */
437
+ messageCount: number
438
+ /** The assistant's response text, as a plain string (`''` for a tool-only turn). */
439
+ responseText: string
440
+ /** The reasoning/thinking trace, as a plain string (`''` when none). */
441
+ reasoningText: string
442
+ inputTokens: number
443
+ cachedInputTokens: number
444
+ outputTokens: number
445
+ /** The provider finish/stop reason when the CLI reports one (else null). */
446
+ finishReason: string | null
447
+ }
448
+
417
449
  /** Pi's assistant summary plus {@link PiRunStats} describing what it did. */
418
450
  export interface PiRunOutcome {
419
451
  summary: string
@@ -432,6 +464,14 @@ export interface PiRunOutcome {
432
464
  * (usage-aware rotation) and telemetry. Absent for the proxy-metered Pi harness.
433
465
  */
434
466
  usage?: { inputTokens: number; outputTokens: number }
467
+ /**
468
+ * Per-model-call telemetry lifted from a subscription harness's CLI event stream
469
+ * (Claude Code / Codex), which the backend records into `llm_call_metrics` — the
470
+ * proxy-bypassing analogue of the per-call rows the LLM proxy writes for Pi. Absent
471
+ * for the proxy-metered Pi harness (the proxy is its metering point). See
472
+ * {@link HarnessCallMetric}.
473
+ */
474
+ callMetrics?: HarnessCallMetric[]
435
475
  /** Output-quality signals (truncation / empty final answer); see {@link RunDiagnostics}. */
436
476
  diagnostics?: RunDiagnostics
437
477
  }