npm - @cat-factory/executor-harness - Versions diffs - 1.31.12 → 1.32.0 - Mend

@cat-factory/executor-harness 1.31.12 → 1.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/agent-runner.js CHANGED Viewed

@@ -7,6 +7,26 @@ import { redact, secretsToRedact } from './redact.js';
 function isObject(value) { // true for any non-null value whose typeof is 'object' (arrays included)
     return value !== null && typeof value === 'object';
 }
+/** Scrub leased-credential occurrences from a telemetry body via `redact`; returns `text` unchanged when `secrets` is empty. */
+function redactBody(text, secrets) {
+    return secrets.length ? redact(text, secrets) : text; // skip the `redact` call entirely when there is nothing to scrub
+}
+/**
+ * Fallback token attribution: if a CLI reported a cumulative total but no per-turn
+ * usage (so every captured call has zero tokens), pin the whole total onto the LAST
+ * call rather than dropping it — the run's tokens are still accounted, just not split
+ * per turn. Mutates that last entry of `calls` in place and returns nothing; a no-op when `usage` is missing, `calls` is empty, or any call already carries tokens.
+ */
+function attributeCumulativeUsage(calls, usage) {
+    if (!usage || calls.length === 0) // no total to attribute, or no call to attribute it to
+        return;
+    const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0);
+    if (anyTokens) // at least one call has its own numbers — leave every call as captured
+        return;
+    const last = calls[calls.length - 1];
+    last.inputTokens = usage.inputTokens;
+    last.outputTokens = usage.outputTokens; // only these two fields are copied; `cachedInputTokens` stays as captured
+}
 /**
  * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
  * through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
@@ -114,27 +134,59 @@ export async function runClaudeCode(opts) {
     const stats = { toolCalls: 0, assistantChars: 0 };
     let summary = '';
     let usage;
+    // Reconstruct the full per-call request/response bodies for telemetry from the
+    // stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
+    // Anthropic Messages envelope, so `assistant` events carry the complete response
+    // (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
+    // fed back — together the growing prompt transcript. We seed it with the two inputs
+    // the harness supplies (they never appear in the stream): the system + first user
+    // message. Bodies are credential-scrubbed (they can echo the leased token).
+    const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
+    const messages = [
+        { role: 'system', content: opts.systemPrompt },
+        { role: 'user', content: opts.userPrompt },
+    ];
+    const calls = [];
     const onEvent = (event) => {
         const type = event.type;
         if (type === 'assistant' && isObject(event.message)) {
-            const content = event.message.content;
-            if (Array.isArray(content)) {
-                for (const block of content) {
-                    if (!isObject(block))
-                        continue;
-                    if (block.type === 'text' && typeof block.text === 'string') {
-                        stats.assistantChars += block.text.length;
-                    }
-                    if (block.type === 'tool_use') {
-                        stats.toolCalls += 1;
-                        if (block.name === 'TodoWrite' && opts.onProgress) {
-                            const progress = todosToProgress(block.input?.todos);
-                            if (progress)
-                                opts.onProgress(progress);
-                        }
-                    }
+            const message = event.message;
+            const content = Array.isArray(message.content) ? message.content : [];
+            const { text, reasoning, toolUses } = claudeAssistantContent(content);
+            stats.assistantChars += text.length;
+            stats.toolCalls += toolUses;
+            for (const block of content) {
+                if (isObject(block) &&
+                    block.type === 'tool_use' &&
+                    block.name === 'TodoWrite' &&
+                    opts.onProgress) {
+                    const progress = todosToProgress(block.input?.todos);
+                    if (progress)
+                        opts.onProgress(progress);
                 }
             }
+            // Record this call BEFORE appending its turn: the prompt is the history that
+            // produced this response. The append-only array keeps each call's prompt a strict
+            // prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
+            const u = claudeCallUsage(message.usage);
+            calls.push({
+                ...(typeof message.model === 'string' ? { model: message.model } : {}),
+                promptText: redactBody(JSON.stringify(messages), secrets),
+                messageCount: messages.length,
+                responseText: redactBody(text, secrets),
+                reasoningText: redactBody(reasoning, secrets),
+                inputTokens: u.inputTokens,
+                cachedInputTokens: u.cachedInputTokens,
+                outputTokens: u.outputTokens,
+                finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
+            });
+            messages.push({ role: 'assistant', content });
+        }
+        else if (type === 'user' && isObject(event.message)) {
+            // tool_result blocks the harness fed back to the model — part of the next prompt.
+            const content = event.message.content;
+            if (Array.isArray(content))
+                messages.push({ role: 'tool', content });
         }
         else if (type === 'result') {
             if (typeof event.result === 'string')
@@ -199,7 +251,14 @@ export async function runClaudeCode(opts) {
             '--append-system-prompt',
             opts.systemPrompt,
         ], opts.userPrompt, opts, env, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
-        return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
+        attributeCumulativeUsage(calls, usage);
+        return {
+            summary,
+            stats,
+            stderrTail,
+            ...(usage ? { usage } : {}),
+            ...(calls.length ? { callMetrics: calls } : {}),
+        };
     }
     finally {
         // Never leave the config dir (and any cached credential) on disk past the run.
@@ -241,6 +300,38 @@ function claudeUsage(raw) {
         return undefined;
     return { inputTokens: input, outputTokens: output };
 }
+/** Walk a Claude `assistant` message's content blocks: concatenate `text` and `thinking` block strings (no separator) and count `tool_use` blocks; returns `{ text, reasoning, toolUses }`. */
+function claudeAssistantContent(content) {
+    let text = '';
+    let reasoning = '';
+    let toolUses = 0;
+    for (const block of content) {
+        if (!isObject(block)) // ignore primitives / null entries
+            continue;
+        if (block.type === 'text' && typeof block.text === 'string')
+            text += block.text;
+        else if (block.type === 'thinking' && typeof block.thinking === 'string')
+            reasoning += block.thinking;
+        else if (block.type === 'tool_use') // counted only; the tool input is not inspected here
+            toolUses += 1;
+    }
+    return { text, reasoning, toolUses };
+}
+/**
+ * Per-CALL token usage off a Claude `assistant` message's `usage` (this turn only, not
+ * the cumulative `result` total). `inputTokens` counts every billed input bucket (fresh
+ * + both cache buckets); `cachedInputTokens` is the cache share, surfaced separately. A non-object `raw` yields all zeros; each field is read through `numberOf`.
+ */
+function claudeCallUsage(raw) {
+    if (!isObject(raw)) // no usage envelope on this message
+        return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 };
+    const cached = numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens); // cache reads + cache writes
+    return {
+        inputTokens: numberOf(raw.input_tokens) + cached, // `input_tokens` plus the two cache buckets summed above
+        cachedInputTokens: cached,
+        outputTokens: numberOf(raw.output_tokens),
+    };
+}
 // ---------------------------------------------------------------------------
 // Codex
 // ---------------------------------------------------------------------------
@@ -282,13 +373,29 @@ export async function runCodex(opts) {
         await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
         await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
     }
+    // Codex has no system-prompt flag, so fold the composed role + best-practice
+    // context into the prompt itself (Claude Code instead rides --append-system-prompt).
+    const prompt = opts.systemPrompt
+        ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
+        : opts.userPrompt;
+    // Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
+    // flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
+    // plus a cumulative total. It never exposes the request transcript or structured
+    // tool/command bodies, so the captured prompt is just the folded input — the response
+    // text + per-turn tokens are faithful; the request side is best-effort by design.
+    const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
+    const messages = [{ role: 'user', content: prompt }];
+    const calls = [];
+    let pendingText = '';
     const onEvent = (event) => {
         const type = typeof event.type === 'string' ? event.type : '';
-        if (type.includes('agent_message') || type === 'item.completed') {
+        if (type.includes('agent_message') ||
+            (type === 'item.completed' && isCodexMessageItem(event))) {
             const text = extractText(event);
             if (text) {
                 stats.assistantChars += text.length;
                 summary = text;
+                pendingText = text;
             }
         }
         if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
@@ -300,12 +407,26 @@ export async function runCodex(opts) {
         const turnUsage = codexUsage(event);
         if (turnUsage)
             usage = turnUsage;
+        // A `token_count` event closes a model turn: pair its per-turn usage with the
+        // assistant text seen since the previous turn as one telemetry call.
+        const perTurn = codexLastTurnUsage(event);
+        if (perTurn) {
+            calls.push({
+                model: opts.model,
+                promptText: redactBody(JSON.stringify(messages), secrets),
+                messageCount: messages.length,
+                responseText: redactBody(pendingText, secrets),
+                reasoningText: '',
+                inputTokens: perTurn.inputTokens,
+                cachedInputTokens: perTurn.cachedInputTokens,
+                outputTokens: perTurn.outputTokens,
+                finishReason: null,
+            });
+            if (pendingText)
+                messages.push({ role: 'assistant', content: pendingText });
+            pendingText = '';
+        }
     };
-    // Codex has no system-prompt flag, so fold the composed role + best-practice
-    // context into the prompt itself (Claude Code instead rides --append-system-prompt).
-    const prompt = opts.systemPrompt
-        ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
-        : opts.userPrompt;
     try {
         const { stderrTail } = await streamCli('codex', [
             'exec',
@@ -318,7 +439,28 @@ export async function runCodex(opts) {
             opts.model,
             '-',
         ], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
-        return { summary, stats, stderrTail, ...(usage ? { usage } : {}) };
+        // Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
+        // single call from the cumulative total + final text so the run is still observable.
+        if (calls.length === 0 && (usage || summary)) {
+            calls.push({
+                model: opts.model,
+                promptText: redactBody(JSON.stringify(messages), secrets),
+                messageCount: messages.length,
+                responseText: redactBody(summary, secrets),
+                reasoningText: '',
+                inputTokens: usage?.inputTokens ?? 0,
+                cachedInputTokens: 0,
+                outputTokens: usage?.outputTokens ?? 0,
+                finishReason: null,
+            });
+        }
+        return {
+            summary,
+            stats,
+            stderrTail,
+            ...(usage ? { usage } : {}),
+            ...(calls.length ? { callMetrics: calls } : {}),
+        };
     }
     finally {
         // Never leave the decrypted credential on disk past the run.
@@ -326,6 +468,24 @@ export async function runCodex(opts) {
             await rm(codexHome, { recursive: true, force: true }).catch(() => { });
     }
 }
+/**
+ * Whether a Codex `item.completed` event carries the model's ASSISTANT text (as
+ * opposed to a command/exec/tool/reasoning item, which also carry a `text` field —
+ * their command output or thinking — and must NOT be captured as the turn's response).
+ * A message item's kind contains `message` (`agent_message`/`assistant_message`); an
+ * item with no kind is treated as a message so older/simple shapes don't regress. An event whose `item` is not an object returns false.
+ */
+function isCodexMessageItem(event) {
+    const item = isObject(event.item) ? event.item : undefined;
+    if (!item) // no item payload at all — never counted as a message
+        return false;
+    const kind = typeof item.item_type === 'string' // `item_type` wins over `type` when both are strings
+        ? item.item_type
+        : typeof item.type === 'string'
+            ? item.type
+            : ''; // neither field is a string: "no kind"
+    return kind === '' || /message/i.test(kind); // case-insensitive substring match on "message"
+}
 /** Best-effort: pull a textual message out of a Codex event. */
 function extractText(event) {
     if (typeof event.message === 'string')
@@ -367,6 +527,8 @@ function codexPlanProgress(event) {
  * other shapes put it on `usage` / `info.usage` directly. We read the cumulative
  * total when present so the caller can simply overwrite (not sum) — summing
  * cumulative totals across events would multiply-count. Checked most-likely first.
+ * `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
+ * is a subset already inside it), so it is NOT summed with the cached share.
  */
 function codexUsage(event) {
     const info = isObject(event.info) ? event.info : undefined;
@@ -376,12 +538,31 @@ function codexUsage(event) {
         (info && isObject(info.usage) ? info.usage : undefined);
     if (!isObject(raw))
         return undefined;
-    const input = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens);
+    const input = numberOf(raw.input_tokens);
     const output = numberOf(raw.output_tokens);
     if (input === 0 && output === 0)
         return undefined;
     return { inputTokens: input, outputTokens: output };
 }
+/**
+ * Per-TURN Codex token usage off a `token_count` event's `info.last_token_usage` (the
+ * delta for the turn just completed, as opposed to `codexUsage`'s cumulative total).
+ * `input_tokens` is the total prompt count for the turn and already INCLUDES the cached
+ * share (OpenAI semantics), so `cachedInputTokens` is surfaced as the subset it is —
+ * NOT added on top (adding it would double-count every cached token). Returns undefined when `info.last_token_usage` is absent or when input and output are both 0.
+ */
+function codexLastTurnUsage(event) {
+    const info = isObject(event.info) ? event.info : undefined;
+    const raw = info && isObject(info.last_token_usage) ? info.last_token_usage : undefined;
+    if (!isObject(raw)) // event carries no per-turn usage
+        return undefined;
+    const input = numberOf(raw.input_tokens);
+    const cached = numberOf(raw.cached_input_tokens);
+    const output = numberOf(raw.output_tokens);
+    if (input === 0 && output === 0) // an all-zero reading is treated the same as no reading
+        return undefined;
+    return { inputTokens: input, cachedInputTokens: cached, outputTokens: output };
+}
 function numberOf(value) { // finite numbers pass through; anything else (non-number, NaN, ±Infinity) becomes 0
     return Number.isFinite(value) ? value : 0;
 }

package/dist/agent.js CHANGED Viewed

@@ -341,7 +341,7 @@ async function runExploreMode(job, opts) {
         try {
             opts.onPhase?.('agent');
             logger.info('agent(explore): running agent', { serviceDirectory });
-            const { summary, stats, stderrTail, usage, diagnostics: runDiag, } = await runAgentInWorkspace({
+            const { summary, stats, stderrTail, usage, callMetrics, diagnostics: runDiag, } = await runAgentInWorkspace({
                 dir: workDir,
                 systemPrompt: job.systemPrompt,
                 userPrompt,
@@ -368,6 +368,7 @@ async function runExploreMode(job, opts) {
                     error: noOutputReason(stats, stderrTail),
                     failureCause: 'no-usable-output',
                     ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
                     ...infraSetupFields,
                 };
             }
@@ -384,6 +385,7 @@ async function runExploreMode(job, opts) {
                         error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
                         failureCause: 'no-usable-output',
                         ...(usage ? { usage } : {}),
+                        ...(callMetrics ? { callMetrics } : {}),
                         ...infraSetupFields,
                     };
                 }
@@ -391,7 +393,13 @@ async function runExploreMode(job, opts) {
             // Prose: the summary IS the deliverable.
             if (job.output?.kind !== 'structured') {
                 logger.info('agent(explore): done (prose)', { ...stats });
-                return { summary, stats, ...(usage ? { usage } : {}), ...infraSetupFields };
+                return {
+                    summary,
+                    stats,
+                    ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
+                    ...infraSetupFields,
+                };
             }
             // Structured: parse the agent's JSON. With repair enabled (default) a malformed
             // reply gets ONE structured repair call before giving up; with `repair:false` we
@@ -432,6 +440,7 @@ async function runExploreMode(job, opts) {
                     error: noStructuredReason(stats, stderrTail, diagnostics),
                     failureCause: 'no-usable-output',
                     ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
                     ...infraSetupFields,
                 };
             }
@@ -451,7 +460,14 @@ async function runExploreMode(job, opts) {
                 custom.environment = reportedEnvironment;
             }
             logger.info('agent(explore): done (structured)', { ...stats });
-            return { summary, custom, stats, ...(usage ? { usage } : {}), ...infraSetupFields };
+            return {
+                summary,
+                custom,
+                stats,
+                ...(usage ? { usage } : {}),
+                ...(callMetrics ? { callMetrics } : {}),
+                ...infraSetupFields,
+            };
         }
         finally {
             if (managed)
@@ -477,7 +493,7 @@ async function runCodingMode(job, opts) {
     if (job.mergeBase)
         return runConflictResolution(job, opts);
     const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
-    const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent({
+    const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
         kind: 'agent',
         jobId: job.jobId,
         repo: job.repo,
@@ -504,7 +520,14 @@ async function runCodingMode(job, opts) {
     if (!pushed) {
         // A no-op: a failure for the implementer, a clean non-event for the fixers.
         if (job.noChangesIsError === false) {
-            return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
+            return {
+                pushed: false,
+                branch: pushBranch,
+                summary,
+                stats,
+                ...(usage ? { usage } : {}),
+                ...(callMetrics ? { callMetrics } : {}),
+            };
         }
         return {
             pushed: false,
@@ -514,6 +537,7 @@ async function runCodingMode(job, opts) {
             error: noChangesReason('the agent produced no file changes', stats, stderrTail),
             failureCause: 'no-changes',
             ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
         };
     }
     // Changes are on the branch. Open a PR only when the job asked for one.
@@ -539,7 +563,14 @@ async function runCodingMode(job, opts) {
         // this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
         if (prUrl === null) {
             if (job.noChangesIsError === false) {
-                return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
+                return {
+                    pushed: false,
+                    branch: pushBranch,
+                    summary,
+                    stats,
+                    ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
+                };
             }
             return {
                 pushed: false,
@@ -549,11 +580,27 @@ async function runCodingMode(job, opts) {
                 error: noChangesReason('the work branch has no commits ahead of its base (nothing to open a PR for)', stats, stderrTail),
                 failureCause: 'no-changes',
                 ...(usage ? { usage } : {}),
+                ...(callMetrics ? { callMetrics } : {}),
             };
         }
-        return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
+        return {
+            pushed: true,
+            prUrl,
+            branch: pushBranch,
+            summary,
+            stats,
+            ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
+        };
     }
-    return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) };
+    return {
+        pushed: true,
+        branch: pushBranch,
+        summary,
+        stats,
+        ...(usage ? { usage } : {}),
+        ...(callMetrics ? { callMetrics } : {}),
+    };
 }
 /**
  * Conflict-resolution coding flow (the conflict-resolver): clone the PR head `branch`
@@ -617,7 +664,7 @@ async function runConflictResolution(job, opts) {
         logger.info('agent(conflict): resolving conflicts with agent', { conflicted });
         const diff = await conflictDiff(dir, conflicted, signal);
         const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt);
-        const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
+        const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
             dir,
             systemPrompt: job.systemPrompt,
             userPrompt,
@@ -646,6 +693,7 @@ async function runConflictResolution(job, opts) {
                 error: unresolvedReason(unresolved, stats, stderrTail),
                 failureCause: 'agent',
                 ...(usage ? { usage } : {}),
+                ...(callMetrics ? { callMetrics } : {}),
             };
         }
         // Complete the merge commit with the agent's resolution staged, then push.
@@ -653,7 +701,14 @@ async function runConflictResolution(job, opts) {
         opts.onPhase?.('push');
         logger.info('agent(conflict): pushing resolved branch', { ...stats });
         await pushBranch(dir, job.branch, job.ghToken, signal);
-        return { pushed: true, branch: job.branch, summary, stats, ...(usage ? { usage } : {}) };
+        return {
+            pushed: true,
+            branch: job.branch,
+            summary,
+            stats,
+            ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
+        };
     });
 }
 /**
@@ -729,7 +784,7 @@ async function runBootstrap(job, opts) {
         }
         opts.onPhase?.('agent');
         logger.info('agent(bootstrap): running agent');
-        const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
+        const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
             dir,
             systemPrompt: job.systemPrompt,
             userPrompt: job.userPrompt,
@@ -749,7 +804,14 @@ async function runBootstrap(job, opts) {
         if (!(await producedRepoContent(dir, !fromScratch, signal))) {
             const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail);
             logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats });
-            return { summary, stats, error, failureCause: 'agent', ...(usage ? { usage } : {}) };
+            return {
+                summary,
+                stats,
+                error,
+                failureCause: 'agent',
+                ...(usage ? { usage } : {}),
+                ...(callMetrics ? { callMetrics } : {}),
+            };
         }
         opts.onPhase?.('push');
         logger.info('agent(bootstrap): pushing bootstrapped contents', { ...stats });
@@ -764,7 +826,13 @@ async function runBootstrap(job, opts) {
                 : `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
         });
         logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch });
-        return { defaultBranch: boot.target.defaultBranch, summary, stats, ...(usage ? { usage } : {}) };
+        return {
+            defaultBranch: boot.target.defaultBranch,
+            summary,
+            stats,
+            ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
+        };
     });
 }
 /**

package/dist/coding-agent.js CHANGED Viewed

@@ -195,7 +195,7 @@ export async function runCodingAgent(spec, opts = {}) {
         try {
             opts.onPhase?.('agent');
             logger.info('coding-agent: running agent', { serviceDirectory });
-            const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
+            const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
                 dir: workDir,
                 systemPrompt: spec.systemPrompt,
                 userPrompt: spec.userPrompt,
@@ -265,6 +265,7 @@ export async function runCodingAgent(spec, opts = {}) {
                     stats,
                     ...(stderrTail ? { stderrTail } : {}),
                     ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
                 };
             }
             else {
@@ -278,6 +279,7 @@ export async function runCodingAgent(spec, opts = {}) {
                     stats,
                     ...(stderrTail ? { stderrTail } : {}),
                     ...(usage ? { usage } : {}),
+                    ...(callMetrics ? { callMetrics } : {}),
                 };
             }
         }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@cat-factory/executor-harness",
-  "version": "1.31.12",
+  "version": "1.32.0",
   "description": "Container payload: a thin TypeScript wrapper that runs the Pi coding agent against a cloned repo and opens a PR. Runs in the Cloudflare Container (and, in local native mode, as a host process); carries no secrets.",
   "repository": {
     "type": "git",
@@ -26,8 +26,8 @@
     "hono": "^4.12.27",
     "typescript": "^6.0.3",
     "vitest": "^4.1.9",
-    "@cat-factory/spend": "0.10.73",
-    "@cat-factory/server": "0.67.0"
+    "@cat-factory/server": "0.68.0",
+    "@cat-factory/spend": "0.10.74"
   },
   "scripts": {
     "build": "tsc -p tsconfig.json",

package/src/agent-runner.ts CHANGED Viewed

@@ -2,7 +2,7 @@ import { spawn } from 'node:child_process'
 import { mkdtemp, rm, writeFile } from 'node:fs/promises'
 import { tmpdir } from 'node:os'
 import { join } from 'node:path'
-import type { PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
+import type { HarnessCallMetric, PiRunOutcome, PiRunStats, TodoProgress } from './pi.js'
 import { killChildProcess, spawnDetached } from './process.js'
 import { redact, secretsToRedact } from './redact.js'
@@ -64,6 +64,29 @@ function isObject(value: unknown): value is Record<string, unknown> {
   return typeof value === 'object' && value !== null
 }
+/** Scrub leased-credential occurrences from a telemetry body via `redact`; returns `text` unchanged when `secrets` is empty. */
+function redactBody(text: string, secrets: string[]): string {
+  return secrets.length ? redact(text, secrets) : text // skip the `redact` call entirely when there is nothing to scrub
+}
+/**
+ * Fallback token attribution: if a CLI reported a cumulative total but no per-turn
+ * usage (so every captured call has zero tokens), pin the whole total onto the LAST
+ * call rather than dropping it — the run's tokens are still accounted, just not split
+ * per turn. Mutates that last entry of `calls` in place; a no-op when `usage` is missing, `calls` is empty, or any call already carries tokens.
+ */
+function attributeCumulativeUsage(
+  calls: HarnessCallMetric[],
+  usage: { inputTokens: number; outputTokens: number } | undefined,
+): void {
+  if (!usage || calls.length === 0) return // no total to attribute, or no call to attribute it to
+  const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0)
+  if (anyTokens) return // at least one call has its own numbers — leave every call as captured
+  const last = calls[calls.length - 1]! // non-null assertion is safe: the empty case returned above
+  last.inputTokens = usage.inputTokens
+  last.outputTokens = usage.outputTokens // only these two fields are copied; `cachedInputTokens` stays as captured
+}
 /**
  * Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
  * through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
@@ -184,25 +207,59 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
   let summary = ''
   let usage: { inputTokens: number; outputTokens: number } | undefined
+  // Reconstruct the full per-call request/response bodies for telemetry from the
+  // stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
+  // Anthropic Messages envelope, so `assistant` events carry the complete response
+  // (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
+  // fed back — together the growing prompt transcript. We seed it with the two inputs
+  // the harness supplies (they never appear in the stream): the system + first user
+  // message. Bodies are credential-scrubbed (they can echo the leased token).
+  const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
+  const messages: Array<{ role: string; content: unknown }> = [
+    { role: 'system', content: opts.systemPrompt },
+    { role: 'user', content: opts.userPrompt },
+  ]
+  const calls: HarnessCallMetric[] = []
   const onEvent = (event: Record<string, unknown>): void => {
     const type = event.type
     if (type === 'assistant' && isObject(event.message)) {
-      const content = (event.message as Record<string, unknown>).content
-      if (Array.isArray(content)) {
-        for (const block of content) {
-          if (!isObject(block)) continue
-          if (block.type === 'text' && typeof block.text === 'string') {
-            stats.assistantChars += block.text.length
-          }
-          if (block.type === 'tool_use') {
-            stats.toolCalls += 1
-            if (block.name === 'TodoWrite' && opts.onProgress) {
-              const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
-              if (progress) opts.onProgress(progress)
-            }
-          }
+      const message = event.message as Record<string, unknown>
+      const content = Array.isArray(message.content) ? message.content : []
+      const { text, reasoning, toolUses } = claudeAssistantContent(content)
+      stats.assistantChars += text.length
+      stats.toolCalls += toolUses
+      for (const block of content) {
+        if (
+          isObject(block) &&
+          block.type === 'tool_use' &&
+          block.name === 'TodoWrite' &&
+          opts.onProgress
+        ) {
+          const progress = todosToProgress((block.input as Record<string, unknown>)?.todos)
+          if (progress) opts.onProgress(progress)
         }
       }
+      // Record this call BEFORE appending its turn: the prompt is the history that
+      // produced this response. The append-only array keeps each call's prompt a strict
+      // prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
+      const u = claudeCallUsage(message.usage)
+      calls.push({
+        ...(typeof message.model === 'string' ? { model: message.model } : {}),
+        promptText: redactBody(JSON.stringify(messages), secrets),
+        messageCount: messages.length,
+        responseText: redactBody(text, secrets),
+        reasoningText: redactBody(reasoning, secrets),
+        inputTokens: u.inputTokens,
+        cachedInputTokens: u.cachedInputTokens,
+        outputTokens: u.outputTokens,
+        finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
+      })
+      messages.push({ role: 'assistant', content })
+    } else if (type === 'user' && isObject(event.message)) {
+      // tool_result blocks the harness fed back to the model — part of the next prompt.
+      const content = (event.message as Record<string, unknown>).content
+      if (Array.isArray(content)) messages.push({ role: 'tool', content })
     } else if (type === 'result') {
       if (typeof event.result === 'string') summary = event.result
       usage = claudeUsage(event.usage) ?? usage
@@ -282,7 +339,14 @@ export async function runClaudeCode(opts: SubscriptionRunOptions): Promise<PiRun
       onEvent,
     )
-    return { summary, stats, stderrTail, ...(usage ? { usage } : {}) }
+    attributeCumulativeUsage(calls, usage)
+    return {
+      summary,
+      stats,
+      stderrTail,
+      ...(usage ? { usage } : {}),
+      ...(calls.length ? { callMetrics: calls } : {}),
+    }
   } finally {
     // Never leave the config dir (and any cached credential) on disk past the run.
     if (configHome) await rm(configHome, { recursive: true, force: true }).catch(() => {})
@@ -322,6 +386,44 @@ function claudeUsage(raw: unknown): { inputTokens: number; outputTokens: number
   return { inputTokens: input, outputTokens: output }
 }
+/**
+ * Walk a Claude `assistant` message's content blocks and split them by kind:
+ * every string `text` block is concatenated into `text`, every string `thinking`
+ * block into `reasoning`, and each `tool_use` block bumps `toolUses`. Non-object
+ * entries and any other block type are ignored.
+ */
+function claudeAssistantContent(content: unknown[]): {
+  text: string
+  reasoning: string
+  toolUses: number
+} {
+  const textParts: string[] = []
+  const reasoningParts: string[] = []
+  let toolUses = 0
+  for (const block of content) {
+    if (!isObject(block)) continue
+    switch (block.type) {
+      case 'text':
+        if (typeof block.text === 'string') textParts.push(block.text)
+        break
+      case 'thinking':
+        if (typeof block.thinking === 'string') reasoningParts.push(block.thinking)
+        break
+      case 'tool_use':
+        toolUses += 1
+        break
+    }
+  }
+  // Joining with '' reproduces in-order concatenation of the collected pieces.
+  return { text: textParts.join(''), reasoning: reasoningParts.join(''), toolUses }
+}
+/**
+ * Token usage for ONE call, read from the `usage` object of a Claude `assistant`
+ * message (the per-turn figures, as opposed to the cumulative total carried by the
+ * `result` event). `cachedInputTokens` is the sum of the cache-read and
+ * cache-creation buckets; `inputTokens` is the fresh input count plus that cached
+ * sum, so the cached share is reported both inside the total and on its own.
+ * A non-object `raw` yields all zeros.
+ */
+function claudeCallUsage(raw: unknown): {
+  inputTokens: number
+  cachedInputTokens: number
+  outputTokens: number
+} {
+  let freshInput = 0
+  let cachedInputTokens = 0
+  let outputTokens = 0
+  if (isObject(raw)) {
+    cachedInputTokens =
+      numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens)
+    freshInput = numberOf(raw.input_tokens)
+    outputTokens = numberOf(raw.output_tokens)
+  }
+  return { inputTokens: freshInput + cachedInputTokens, cachedInputTokens, outputTokens }
+}
 // ---------------------------------------------------------------------------
 // Codex
 // ---------------------------------------------------------------------------
@@ -366,13 +468,33 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
     await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8')
   }
+  // Codex has no system-prompt flag, so fold the composed role + best-practice
+  // context into the prompt itself (Claude Code instead rides --append-system-prompt).
+  const prompt = opts.systemPrompt
+    ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
+    : opts.userPrompt
+  // Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
+  // flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
+  // plus a cumulative total. It never exposes the request transcript or structured
+  // tool/command bodies, so the captured prompt is just the folded input — the response
+  // text + per-turn tokens are faithful; the request side is best-effort by design.
+  const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : []
+  const messages: Array<{ role: string; content: unknown }> = [{ role: 'user', content: prompt }]
+  const calls: HarnessCallMetric[] = []
+  let pendingText = ''
   const onEvent = (event: Record<string, unknown>): void => {
     const type = typeof event.type === 'string' ? event.type : ''
-    if (type.includes('agent_message') || type === 'item.completed') {
+    if (
+      type.includes('agent_message') ||
+      (type === 'item.completed' && isCodexMessageItem(event))
+    ) {
       const text = extractText(event)
       if (text) {
         stats.assistantChars += text.length
         summary = text
+        pendingText = text
       }
     }
     if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
@@ -382,14 +504,26 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
     if (progress && opts.onProgress) opts.onProgress(progress)
     const turnUsage = codexUsage(event)
     if (turnUsage) usage = turnUsage
+    // A `token_count` event closes a model turn: pair its per-turn usage with the
+    // assistant text seen since the previous turn as one telemetry call.
+    const perTurn = codexLastTurnUsage(event)
+    if (perTurn) {
+      calls.push({
+        model: opts.model,
+        promptText: redactBody(JSON.stringify(messages), secrets),
+        messageCount: messages.length,
+        responseText: redactBody(pendingText, secrets),
+        reasoningText: '',
+        inputTokens: perTurn.inputTokens,
+        cachedInputTokens: perTurn.cachedInputTokens,
+        outputTokens: perTurn.outputTokens,
+        finishReason: null,
+      })
+      if (pendingText) messages.push({ role: 'assistant', content: pendingText })
+      pendingText = ''
+    }
   }
-  // Codex has no system-prompt flag, so fold the composed role + best-practice
-  // context into the prompt itself (Claude Code instead rides --append-system-prompt).
-  const prompt = opts.systemPrompt
-    ? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
-    : opts.userPrompt
   try {
     const { stderrTail } = await streamCli(
       'codex',
@@ -411,13 +545,53 @@ export async function runCodex(opts: SubscriptionRunOptions): Promise<PiRunOutco
       onEvent,
     )
-    return { summary, stats, stderrTail, ...(usage ? { usage } : {}) }
+    // Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
+    // single call from the cumulative total + final text so the run is still observable.
+    if (calls.length === 0 && (usage || summary)) {
+      calls.push({
+        model: opts.model,
+        promptText: redactBody(JSON.stringify(messages), secrets),
+        messageCount: messages.length,
+        responseText: redactBody(summary, secrets),
+        reasoningText: '',
+        inputTokens: usage?.inputTokens ?? 0,
+        cachedInputTokens: 0,
+        outputTokens: usage?.outputTokens ?? 0,
+        finishReason: null,
+      })
+    }
+    return {
+      summary,
+      stats,
+      stderrTail,
+      ...(usage ? { usage } : {}),
+      ...(calls.length ? { callMetrics: calls } : {}),
+    }
   } finally {
     // Never leave the decrypted credential on disk past the run.
     if (codexHome) await rm(codexHome, { recursive: true, force: true }).catch(() => {})
   }
 }
+/**
+ * Decide whether a Codex `item.completed` event holds the model's ASSISTANT text.
+ * Command/exec/tool/reasoning items also expose a `text` field (their output or
+ * thinking) and must not be mistaken for the turn's response, so the item's kind —
+ * `item.item_type`, falling back to `item.type` — has to contain `message`
+ * (case-insensitive, e.g. `agent_message` / `assistant_message`). An item that
+ * declares no string kind at all is accepted as a message so older/simpler event
+ * shapes keep working. Returns false when `event.item` is not an object.
+ */
+function isCodexMessageItem(event: Record<string, unknown>): boolean {
+  if (!isObject(event.item)) return false
+  const item = event.item as Record<string, unknown>
+  let kind = ''
+  if (typeof item.item_type === 'string') {
+    kind = item.item_type
+  } else if (typeof item.type === 'string') {
+    kind = item.type
+  }
+  if (kind.length === 0) return true
+  return /message/i.test(kind)
+}
 /** Best-effort: pull a textual message out of a Codex event. */
 function extractText(event: Record<string, unknown>): string | undefined {
   if (typeof event.message === 'string') return event.message
@@ -456,6 +630,8 @@ function codexPlanProgress(event: Record<string, unknown>): TodoProgress | undef
  * other shapes put it on `usage` / `info.usage` directly. We read the cumulative
  * total when present so the caller can simply overwrite (not sum) — summing
  * cumulative totals across events would multiply-count. Checked most-likely first.
+ * `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
+ * is a subset already inside it), so it is NOT summed with the cached share.
  */
 function codexUsage(
   event: Record<string, unknown>,
@@ -467,12 +643,36 @@ function codexUsage(
     (isObject(event.usage) ? event.usage : undefined) ??
     (info && isObject(info.usage) ? info.usage : undefined)
   if (!isObject(raw)) return undefined
-  const input = numberOf(raw.input_tokens) + numberOf(raw.cached_input_tokens)
+  const input = numberOf(raw.input_tokens)
   const output = numberOf(raw.output_tokens)
   if (input === 0 && output === 0) return undefined
   return { inputTokens: input, outputTokens: output }
 }
+/**
+ * Per-TURN Codex usage, read from `event.info.last_token_usage` (the figures for the
+ * turn that just finished, unlike the cumulative total `codexUsage` reads).
+ * `input_tokens` is taken as the turn's whole prompt count, which already contains
+ * the cached share (OpenAI semantics), so `cachedInputTokens` is reported as that
+ * subset and is never added on top — doing so would count cached tokens twice.
+ * Returns undefined when the event has no such object, or when both the input and
+ * output counts are zero.
+ */
+function codexLastTurnUsage(event: Record<string, unknown>):
+  | {
+      inputTokens: number
+      cachedInputTokens: number
+      outputTokens: number
+    }
+  | undefined {
+  if (!isObject(event.info)) return undefined
+  const lastTurn = (event.info as Record<string, unknown>).last_token_usage
+  if (!isObject(lastTurn)) return undefined
+  const inputTokens = numberOf(lastTurn.input_tokens)
+  const cachedInputTokens = numberOf(lastTurn.cached_input_tokens)
+  const outputTokens = numberOf(lastTurn.output_tokens)
+  // An all-zero reading carries no information; treat it as "no per-turn usage".
+  const isEmpty = inputTokens === 0 && outputTokens === 0
+  return isEmpty ? undefined : { inputTokens, cachedInputTokens, outputTokens }
+}
 /** Coerce an untrusted value to a count: finite numbers pass through, anything else is 0. */
 function numberOf(value: unknown): number {
   if (typeof value !== 'number') return 0
   return Number.isFinite(value) ? value : 0
 }

package/src/agent.ts CHANGED Viewed

@@ -421,6 +421,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
           stats,
           stderrTail,
           usage,
+          callMetrics,
           diagnostics: runDiag,
         } = await runAgentInWorkspace(
           {
@@ -453,6 +454,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
             error: noOutputReason(stats, stderrTail),
             failureCause: 'no-usable-output',
             ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
             ...infraSetupFields,
           }
         }
@@ -470,6 +472,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
               error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
               failureCause: 'no-usable-output',
               ...(usage ? { usage } : {}),
+              ...(callMetrics ? { callMetrics } : {}),
               ...infraSetupFields,
             }
           }
@@ -478,7 +481,13 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
         // Prose: the summary IS the deliverable.
         if (job.output?.kind !== 'structured') {
           logger.info('agent(explore): done (prose)', { ...stats })
-          return { summary, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
+          return {
+            summary,
+            stats,
+            ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
+            ...infraSetupFields,
+          }
         }
         // Structured: parse the agent's JSON. With repair enabled (default) a malformed
@@ -522,6 +531,7 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
             error: noStructuredReason(stats, stderrTail, diagnostics),
             failureCause: 'no-usable-output',
             ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
             ...infraSetupFields,
           }
         }
@@ -540,7 +550,14 @@ async function runExploreMode(job: AgentJob, opts: RunOptions): Promise<AgentRes
           ;(custom as Record<string, unknown>).environment = reportedEnvironment
         }
         logger.info('agent(explore): done (structured)', { ...stats })
-        return { summary, custom, stats, ...(usage ? { usage } : {}), ...infraSetupFields }
+        return {
+          summary,
+          custom,
+          stats,
+          ...(usage ? { usage } : {}),
+          ...(callMetrics ? { callMetrics } : {}),
+          ...infraSetupFields,
+        }
       } finally {
         if (managed) await managed.cleanup()
       }
@@ -565,7 +582,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
   if (job.mergeBase) return runConflictResolution(job, opts)
   const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch
-  const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent(
+  const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent(
     {
       kind: 'agent',
       jobId: job.jobId,
@@ -596,7 +613,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
   if (!pushed) {
     // A no-op: a failure for the implementer, a clean non-event for the fixers.
     if (job.noChangesIsError === false) {
-      return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
+      return {
+        pushed: false,
+        branch: pushBranch,
+        summary,
+        stats,
+        ...(usage ? { usage } : {}),
+        ...(callMetrics ? { callMetrics } : {}),
+      }
     }
     return {
       pushed: false,
@@ -606,6 +630,7 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
       error: noChangesReason('the agent produced no file changes', stats, stderrTail),
       failureCause: 'no-changes',
       ...(usage ? { usage } : {}),
+      ...(callMetrics ? { callMetrics } : {}),
     }
   }
@@ -632,7 +657,14 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
     // this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
     if (prUrl === null) {
       if (job.noChangesIsError === false) {
-        return { pushed: false, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
+        return {
+          pushed: false,
+          branch: pushBranch,
+          summary,
+          stats,
+          ...(usage ? { usage } : {}),
+          ...(callMetrics ? { callMetrics } : {}),
+        }
       }
       return {
         pushed: false,
@@ -646,11 +678,27 @@ async function runCodingMode(job: AgentJob, opts: RunOptions): Promise<AgentResu
         ),
         failureCause: 'no-changes',
         ...(usage ? { usage } : {}),
+        ...(callMetrics ? { callMetrics } : {}),
       }
     }
-    return { pushed: true, prUrl, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
+    return {
+      pushed: true,
+      prUrl,
+      branch: pushBranch,
+      summary,
+      stats,
+      ...(usage ? { usage } : {}),
+      ...(callMetrics ? { callMetrics } : {}),
+    }
+  }
+  return {
+    pushed: true,
+    branch: pushBranch,
+    summary,
+    stats,
+    ...(usage ? { usage } : {}),
+    ...(callMetrics ? { callMetrics } : {}),
   }
-  return { pushed: true, branch: pushBranch, summary, stats, ...(usage ? { usage } : {}) }
 }
 /**
@@ -719,7 +767,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
     const diff = await conflictDiff(dir, conflicted, signal)
     const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt)
-    const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
+    const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
       {
         dir,
         systemPrompt: job.systemPrompt,
@@ -752,6 +800,7 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
         error: unresolvedReason(unresolved, stats, stderrTail),
         failureCause: 'agent',
         ...(usage ? { usage } : {}),
+        ...(callMetrics ? { callMetrics } : {}),
       }
     }
     // Complete the merge commit with the agent's resolution staged, then push.
@@ -759,7 +808,14 @@ async function runConflictResolution(job: AgentJob, opts: RunOptions): Promise<A
     opts.onPhase?.('push')
     logger.info('agent(conflict): pushing resolved branch', { ...stats })
     await pushBranch(dir, job.branch, job.ghToken, signal)
-    return { pushed: true, branch: job.branch, summary, stats, ...(usage ? { usage } : {}) }
+    return {
+      pushed: true,
+      branch: job.branch,
+      summary,
+      stats,
+      ...(usage ? { usage } : {}),
+      ...(callMetrics ? { callMetrics } : {}),
+    }
   })
 }
@@ -850,7 +906,7 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
     opts.onPhase?.('agent')
     logger.info('agent(bootstrap): running agent')
-    const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
+    const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
       {
         dir,
         systemPrompt: job.systemPrompt,
@@ -874,7 +930,14 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
     if (!(await producedRepoContent(dir, !fromScratch, signal))) {
       const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail)
       logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats })
-      return { summary, stats, error, failureCause: 'agent', ...(usage ? { usage } : {}) }
+      return {
+        summary,
+        stats,
+        error,
+        failureCause: 'agent',
+        ...(usage ? { usage } : {}),
+        ...(callMetrics ? { callMetrics } : {}),
+      }
     }
     opts.onPhase?.('push')
@@ -890,7 +953,13 @@ async function runBootstrap(job: AgentJob, opts: RunOptions): Promise<AgentResul
         : `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
     })
     logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch })
-    return { defaultBranch: boot.target.defaultBranch, summary, stats, ...(usage ? { usage } : {}) }
+    return {
+      defaultBranch: boot.target.defaultBranch,
+      summary,
+      stats,
+      ...(usage ? { usage } : {}),
+      ...(callMetrics ? { callMetrics } : {}),
+    }
   })
 }

package/src/coding-agent.ts CHANGED Viewed

@@ -17,7 +17,7 @@ import {
   remoteBranchExists,
 } from './git.js'
 import { FOLLOW_UPS_FILENAME, FollowUpTailer } from './follow-ups.js'
-import type { PiRunStats } from './pi.js'
+import type { HarnessCallMetric, PiRunStats } from './pi.js'
 import {
   acquireRepoCheckout,
   agentNeverActed,
@@ -89,6 +89,8 @@ export interface CodingAgentOutcome {
   stderrTail?: string
   /** Token usage from a subscription harness's CLI stream (absent for Pi). */
   usage?: { inputTokens: number; outputTokens: number }
+  /** Per-model-call telemetry from a subscription harness's CLI stream (absent for Pi). */
+  callMetrics?: HarnessCallMetric[]
 }
 /**
@@ -296,7 +298,7 @@ export async function runCodingAgent(
       try {
         opts.onPhase?.('agent')
         logger.info('coding-agent: running agent', { serviceDirectory })
-        const { summary, stats, stderrTail, usage } = await runAgentInWorkspace(
+        const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace(
           {
             dir: workDir,
             systemPrompt: spec.systemPrompt,
@@ -371,6 +373,7 @@ export async function runCodingAgent(
             stats,
             ...(stderrTail ? { stderrTail } : {}),
             ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
           }
         } else {
           opts.onPhase?.('push')
@@ -383,6 +386,7 @@ export async function runCodingAgent(
             stats,
             ...(stderrTail ? { stderrTail } : {}),
             ...(usage ? { usage } : {}),
+            ...(callMetrics ? { callMetrics } : {}),
           }
         }
       } finally {

package/src/job.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import type { PiRunStats } from './pi.js'
+import type { HarnessCallMetric, PiRunStats } from './pi.js'
 import type { HarnessKind } from './pi-workspace.js'
 import type { FailureCause } from './failure.js'
@@ -529,6 +529,12 @@ export interface AgentResult {
    */
   failureCause?: FailureCause
   usage?: { inputTokens: number; outputTokens: number }
+  /**
+   * Per-model-call telemetry from a subscription harness's CLI stream (absent for the
+   * proxy-metered Pi harness). The backend records these into `llm_call_metrics`. See
+   * {@link HarnessCallMetric}.
+   */
+  callMetrics?: HarnessCallMetric[]
 }
 /** Parse the coding-mode bootstrap spec, or undefined when absent. Validates the target. */

package/src/pi.ts CHANGED Viewed

@@ -414,6 +414,38 @@ export interface RunDiagnostics {
   finalAnswerEmpty: boolean
 }
+/**
+ * One model call captured from a subscription harness's CLI event stream, shaped so
+ * the backend can record it into the same `llm_call_metrics` telemetry the LLM proxy
+ * writes for the Pi harness. The subscription harnesses (Claude Code / Codex) talk
+ * DIRECT to the vendor and never touch the proxy, so this is the only place their
+ * per-call bodies are observable. Claude Code's `stream-json --verbose` is a near-
+ * verbatim Anthropic Messages stream, so its calls carry full request/response
+ * bodies; Codex's `exec --json` only surfaces flat assistant text + per-turn tokens,
+ * so its rows are honestly thinner (no request transcript, no tool/command bodies).
+ */
+export interface HarnessCallMetric {
+  /**
+   * The vendor model that served this call, when known. Claude Code rows take it
+   * from the CLI `assistant` event's `message.model`; Codex rows carry the run's
+   * configured `opts.model` (its stream is not consulted for a model name).
+   */
+  model?: string
+  /**
+   * The full request as an OpenAI-style chat array (`[{role, content}, …]`),
+   * JSON-stringified — the growing history as of this call. Matches the proxy's
+   * `promptText` shape so the telemetry chain delta-compresses + renders identically.
+   */
+  promptText: string
+  /** Number of messages encoded in {@link promptText} (the telemetry chain messageCount). */
+  messageCount: number
+  /** The assistant's response text, as a plain string (`''` for a tool-only turn). */
+  responseText: string
+  /** The reasoning/thinking trace, as a plain string (`''` when none). */
+  reasoningText: string
+  /**
+   * Total input (prompt) tokens for this call, cached share INCLUDED: for Claude Code
+   * the fresh input count plus both cache buckets; for Codex the reported
+   * `input_tokens`, which already contains the cached tokens.
+   */
+  inputTokens: number
+  /**
+   * The cached portion of {@link inputTokens} — a subset of it, never an addition.
+   * `0` when no per-call cache figure is available.
+   */
+  cachedInputTokens: number
+  /** Output tokens reported for this call. */
+  outputTokens: number
+  /** The provider finish/stop reason when the CLI reports one (else null). */
+  finishReason: string | null
+}
 /** Pi's assistant summary plus {@link PiRunStats} describing what it did. */
 export interface PiRunOutcome {
   summary: string
@@ -432,6 +464,14 @@ export interface PiRunOutcome {
    * (usage-aware rotation) and telemetry. Absent for the proxy-metered Pi harness.
    */
   usage?: { inputTokens: number; outputTokens: number }
+  /**
+   * Per-model-call telemetry lifted from a subscription harness's CLI event stream
+   * (Claude Code / Codex), which the backend records into `llm_call_metrics` — the
+   * proxy-bypassing analogue of the per-call rows the LLM proxy writes for Pi. Absent
+   * for the proxy-metered Pi harness (the proxy is its metering point). See
+   * {@link HarnessCallMetric}.
+   */
+  callMetrics?: HarnessCallMetric[]
   /** Output-quality signals (truncation / empty final answer); see {@link RunDiagnostics}. */
   diagnostics?: RunDiagnostics
 }