@cat-factory/executor-harness 1.31.12 → 1.34.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-runner.js +206 -25
- package/dist/agent.js +87 -13
- package/dist/coding-agent.js +3 -1
- package/dist/job.js +59 -0
- package/dist/package-registries.js +51 -0
- package/dist/redact.js +17 -4
- package/package.json +3 -3
- package/src/agent-runner.ts +225 -25
- package/src/agent.ts +87 -12
- package/src/coding-agent.ts +6 -2
- package/src/job.ts +100 -1
- package/src/package-registries.ts +58 -0
- package/src/pi.ts +40 -0
- package/src/redact.ts +18 -4
package/dist/agent-runner.js
CHANGED
|
@@ -7,6 +7,26 @@ import { redact, secretsToRedact } from './redact.js';
|
|
|
7
7
|
/**
 * Guard for object-like values.
 *
 * @param {unknown} value - Any value.
 * @returns {boolean} `true` when `typeof value` is `'object'` and the value is not
 *   `null` (so arrays pass as well); `false` for primitives, functions and `null`.
 */
function isObject(value) {
    // `typeof null` is also 'object', hence the explicit null exclusion.
    return typeof value === 'object' && value !== null;
}
|
|
10
|
+
/**
 * Scrub any leased-credential occurrences from a telemetry body.
 *
 * @param {string} text - The body text to scrub.
 * @param {string[]} secrets - Secret strings to remove; may be empty.
 * @returns {string} `text` unchanged when `secrets` is empty, otherwise the result
 *   of `redact(text, secrets)`.
 */
function redactBody(text, secrets) {
    // No secrets leased for this run: skip the redaction pass entirely.
    return secrets.length ? redact(text, secrets) : text;
}
|
|
14
|
+
/**
 * Fallback token attribution: if a CLI reported a cumulative total but no per-turn
 * usage (so every captured call has zero tokens), pin the whole total onto the LAST
 * call rather than dropping it — the run's tokens are still accounted, just not split
 * per turn. A no-op when the calls already carry per-turn tokens.
 *
 * @param {Array<{inputTokens: number, outputTokens: number}>} calls - Captured calls;
 *   the last element is mutated in place when the fallback applies.
 * @param {{inputTokens: number, outputTokens: number} | undefined} usage - Cumulative
 *   total for the run, if one was reported.
 * @returns {void}
 */
function attributeCumulativeUsage(calls, usage) {
    // Nothing to attribute, or nowhere to attribute it.
    if (!usage || calls.length === 0)
        return;
    // Any call with non-zero tokens means per-turn usage was captured; leave it alone.
    const anyTokens = calls.some((c) => c.inputTokens > 0 || c.outputTokens > 0);
    if (anyTokens)
        return;
    const last = calls[calls.length - 1];
    last.inputTokens = usage.inputTokens;
    last.outputTokens = usage.outputTokens;
}
|
|
10
30
|
/**
|
|
11
31
|
* Drive one CLI subprocess to completion, streaming LF-framed JSONL from stdout
|
|
12
32
|
* through `onEvent`. Mirrors `runPi`'s lifecycle: prompt over stdin (out-of-band,
|
|
@@ -114,27 +134,59 @@ export async function runClaudeCode(opts) {
|
|
|
114
134
|
const stats = { toolCalls: 0, assistantChars: 0 };
|
|
115
135
|
let summary = '';
|
|
116
136
|
let usage;
|
|
137
|
+
// Reconstruct the full per-call request/response bodies for telemetry from the
|
|
138
|
+
// stream. `--output-format stream-json --verbose` emits each turn as a near-verbatim
|
|
139
|
+
// Anthropic Messages envelope, so `assistant` events carry the complete response
|
|
140
|
+
// (text + tool_use blocks + usage), and `user` events carry the tool_result blocks
|
|
141
|
+
// fed back — together the growing prompt transcript. We seed it with the two inputs
|
|
142
|
+
// the harness supplies (they never appear in the stream): the system + first user
|
|
143
|
+
// message. Bodies are credential-scrubbed (they can echo the leased token).
|
|
144
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
|
|
145
|
+
const messages = [
|
|
146
|
+
{ role: 'system', content: opts.systemPrompt },
|
|
147
|
+
{ role: 'user', content: opts.userPrompt },
|
|
148
|
+
];
|
|
149
|
+
const calls = [];
|
|
117
150
|
const onEvent = (event) => {
|
|
118
151
|
const type = event.type;
|
|
119
152
|
if (type === 'assistant' && isObject(event.message)) {
|
|
120
|
-
const
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
opts.onProgress(progress);
|
|
134
|
-
}
|
|
135
|
-
}
|
|
153
|
+
const message = event.message;
|
|
154
|
+
const content = Array.isArray(message.content) ? message.content : [];
|
|
155
|
+
const { text, reasoning, toolUses } = claudeAssistantContent(content);
|
|
156
|
+
stats.assistantChars += text.length;
|
|
157
|
+
stats.toolCalls += toolUses;
|
|
158
|
+
for (const block of content) {
|
|
159
|
+
if (isObject(block) &&
|
|
160
|
+
block.type === 'tool_use' &&
|
|
161
|
+
block.name === 'TodoWrite' &&
|
|
162
|
+
opts.onProgress) {
|
|
163
|
+
const progress = todosToProgress(block.input?.todos);
|
|
164
|
+
if (progress)
|
|
165
|
+
opts.onProgress(progress);
|
|
136
166
|
}
|
|
137
167
|
}
|
|
168
|
+
// Record this call BEFORE appending its turn: the prompt is the history that
|
|
169
|
+
// produced this response. The append-only array keeps each call's prompt a strict
|
|
170
|
+
// prefix of the next, so the backend's telemetry chain delta-compresses cleanly.
|
|
171
|
+
const u = claudeCallUsage(message.usage);
|
|
172
|
+
calls.push({
|
|
173
|
+
...(typeof message.model === 'string' ? { model: message.model } : {}),
|
|
174
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
175
|
+
messageCount: messages.length,
|
|
176
|
+
responseText: redactBody(text, secrets),
|
|
177
|
+
reasoningText: redactBody(reasoning, secrets),
|
|
178
|
+
inputTokens: u.inputTokens,
|
|
179
|
+
cachedInputTokens: u.cachedInputTokens,
|
|
180
|
+
outputTokens: u.outputTokens,
|
|
181
|
+
finishReason: typeof message.stop_reason === 'string' ? message.stop_reason : null,
|
|
182
|
+
});
|
|
183
|
+
messages.push({ role: 'assistant', content });
|
|
184
|
+
}
|
|
185
|
+
else if (type === 'user' && isObject(event.message)) {
|
|
186
|
+
// tool_result blocks the harness fed back to the model — part of the next prompt.
|
|
187
|
+
const content = event.message.content;
|
|
188
|
+
if (Array.isArray(content))
|
|
189
|
+
messages.push({ role: 'tool', content });
|
|
138
190
|
}
|
|
139
191
|
else if (type === 'result') {
|
|
140
192
|
if (typeof event.result === 'string')
|
|
@@ -199,7 +251,14 @@ export async function runClaudeCode(opts) {
|
|
|
199
251
|
'--append-system-prompt',
|
|
200
252
|
opts.systemPrompt,
|
|
201
253
|
], opts.userPrompt, opts, env, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
|
|
202
|
-
|
|
254
|
+
attributeCumulativeUsage(calls, usage);
|
|
255
|
+
return {
|
|
256
|
+
summary,
|
|
257
|
+
stats,
|
|
258
|
+
stderrTail,
|
|
259
|
+
...(usage ? { usage } : {}),
|
|
260
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
261
|
+
};
|
|
203
262
|
}
|
|
204
263
|
finally {
|
|
205
264
|
// Never leave the config dir (and any cached credential) on disk past the run.
|
|
@@ -241,6 +300,38 @@ function claudeUsage(raw) {
|
|
|
241
300
|
return undefined;
|
|
242
301
|
return { inputTokens: input, outputTokens: output };
|
|
243
302
|
}
|
|
303
|
+
/**
 * Pull the text + reasoning out of a Claude `assistant` message's content blocks and
 * count its tool-use blocks.
 *
 * @param {unknown[]} content - The message's content blocks. Entries that are not
 *   objects are skipped.
 * @returns {{text: string, reasoning: string, toolUses: number}} `text` is the
 *   concatenation of every `text` block's string `text`; `reasoning` is the
 *   concatenation of every `thinking` block's string `thinking`; `toolUses` is the
 *   number of `tool_use` blocks.
 */
function claudeAssistantContent(content) {
    let text = '';
    let reasoning = '';
    let toolUses = 0;
    for (const block of content) {
        if (!isObject(block))
            continue;
        if (block.type === 'text' && typeof block.text === 'string')
            text += block.text;
        else if (block.type === 'thinking' && typeof block.thinking === 'string')
            reasoning += block.thinking;
        else if (block.type === 'tool_use')
            toolUses += 1;
    }
    return { text, reasoning, toolUses };
}
|
|
320
|
+
/**
 * Per-CALL token usage off a Claude `assistant` message's `usage` (this turn only, not
 * the cumulative `result` total). `inputTokens` counts every billed input bucket (fresh
 * + both cache buckets); `cachedInputTokens` is the cache share, surfaced separately.
 *
 * @param {unknown} raw - The message's `usage` value; anything that is not an object
 *   yields all-zero counts.
 * @returns {{inputTokens: number, cachedInputTokens: number, outputTokens: number}}
 *   Counts read through `numberOf`.
 */
function claudeCallUsage(raw) {
    if (!isObject(raw))
        return { inputTokens: 0, cachedInputTokens: 0, outputTokens: 0 };
    // Cache share = cache reads + cache writes.
    const cached = numberOf(raw.cache_read_input_tokens) + numberOf(raw.cache_creation_input_tokens);
    return {
        // `input_tokens` is added to the cache share here, so the total covers all three buckets.
        inputTokens: numberOf(raw.input_tokens) + cached,
        cachedInputTokens: cached,
        outputTokens: numberOf(raw.output_tokens),
    };
}
|
|
244
335
|
// ---------------------------------------------------------------------------
|
|
245
336
|
// Codex
|
|
246
337
|
// ---------------------------------------------------------------------------
|
|
@@ -282,13 +373,29 @@ export async function runCodex(opts) {
|
|
|
282
373
|
await writeFile(join(codexHome, 'auth.json'), opts.subscriptionToken, { mode: 0o600 });
|
|
283
374
|
await writeFile(join(codexHome, 'config.toml'), 'cli_auth_credentials_store = "file"\n', 'utf8');
|
|
284
375
|
}
|
|
376
|
+
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
377
|
+
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
378
|
+
const prompt = opts.systemPrompt
|
|
379
|
+
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
380
|
+
: opts.userPrompt;
|
|
381
|
+
// Codex's `exec --json` is far thinner than Claude Code's stream: it surfaces only
|
|
382
|
+
// flat assistant text and (on `token_count` events) the per-turn `last_token_usage`
|
|
383
|
+
// plus a cumulative total. It never exposes the request transcript or structured
|
|
384
|
+
// tool/command bodies, so the captured prompt is just the folded input — the response
|
|
385
|
+
// text + per-turn tokens are faithful; the request side is best-effort by design.
|
|
386
|
+
const secrets = opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [];
|
|
387
|
+
const messages = [{ role: 'user', content: prompt }];
|
|
388
|
+
const calls = [];
|
|
389
|
+
let pendingText = '';
|
|
285
390
|
const onEvent = (event) => {
|
|
286
391
|
const type = typeof event.type === 'string' ? event.type : '';
|
|
287
|
-
if (type.includes('agent_message') ||
|
|
392
|
+
if (type.includes('agent_message') ||
|
|
393
|
+
(type === 'item.completed' && isCodexMessageItem(event))) {
|
|
288
394
|
const text = extractText(event);
|
|
289
395
|
if (text) {
|
|
290
396
|
stats.assistantChars += text.length;
|
|
291
397
|
summary = text;
|
|
398
|
+
pendingText = text;
|
|
292
399
|
}
|
|
293
400
|
}
|
|
294
401
|
if (type.includes('tool') || type.includes('command') || type.includes('exec')) {
|
|
@@ -300,12 +407,26 @@ export async function runCodex(opts) {
|
|
|
300
407
|
const turnUsage = codexUsage(event);
|
|
301
408
|
if (turnUsage)
|
|
302
409
|
usage = turnUsage;
|
|
410
|
+
// A `token_count` event closes a model turn: pair its per-turn usage with the
|
|
411
|
+
// assistant text seen since the previous turn as one telemetry call.
|
|
412
|
+
const perTurn = codexLastTurnUsage(event);
|
|
413
|
+
if (perTurn) {
|
|
414
|
+
calls.push({
|
|
415
|
+
model: opts.model,
|
|
416
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
417
|
+
messageCount: messages.length,
|
|
418
|
+
responseText: redactBody(pendingText, secrets),
|
|
419
|
+
reasoningText: '',
|
|
420
|
+
inputTokens: perTurn.inputTokens,
|
|
421
|
+
cachedInputTokens: perTurn.cachedInputTokens,
|
|
422
|
+
outputTokens: perTurn.outputTokens,
|
|
423
|
+
finishReason: null,
|
|
424
|
+
});
|
|
425
|
+
if (pendingText)
|
|
426
|
+
messages.push({ role: 'assistant', content: pendingText });
|
|
427
|
+
pendingText = '';
|
|
428
|
+
}
|
|
303
429
|
};
|
|
304
|
-
// Codex has no system-prompt flag, so fold the composed role + best-practice
|
|
305
|
-
// context into the prompt itself (Claude Code instead rides --append-system-prompt).
|
|
306
|
-
const prompt = opts.systemPrompt
|
|
307
|
-
? `${opts.systemPrompt}\n\n---\n\n${opts.userPrompt}`
|
|
308
|
-
: opts.userPrompt;
|
|
309
430
|
try {
|
|
310
431
|
const { stderrTail } = await streamCli('codex', [
|
|
311
432
|
'exec',
|
|
@@ -318,7 +439,28 @@ export async function runCodex(opts) {
|
|
|
318
439
|
opts.model,
|
|
319
440
|
'-',
|
|
320
441
|
], prompt, opts, codexHome ? { CODEX_HOME: codexHome } : {}, opts.subscriptionToken ? secretsToRedact(opts.subscriptionToken) : [], onEvent);
|
|
321
|
-
|
|
442
|
+
// Fallback for a CLI/version that never emits per-turn `last_token_usage`: record a
|
|
443
|
+
// single call from the cumulative total + final text so the run is still observable.
|
|
444
|
+
if (calls.length === 0 && (usage || summary)) {
|
|
445
|
+
calls.push({
|
|
446
|
+
model: opts.model,
|
|
447
|
+
promptText: redactBody(JSON.stringify(messages), secrets),
|
|
448
|
+
messageCount: messages.length,
|
|
449
|
+
responseText: redactBody(summary, secrets),
|
|
450
|
+
reasoningText: '',
|
|
451
|
+
inputTokens: usage?.inputTokens ?? 0,
|
|
452
|
+
cachedInputTokens: 0,
|
|
453
|
+
outputTokens: usage?.outputTokens ?? 0,
|
|
454
|
+
finishReason: null,
|
|
455
|
+
});
|
|
456
|
+
}
|
|
457
|
+
return {
|
|
458
|
+
summary,
|
|
459
|
+
stats,
|
|
460
|
+
stderrTail,
|
|
461
|
+
...(usage ? { usage } : {}),
|
|
462
|
+
...(calls.length ? { callMetrics: calls } : {}),
|
|
463
|
+
};
|
|
322
464
|
}
|
|
323
465
|
finally {
|
|
324
466
|
// Never leave the decrypted credential on disk past the run.
|
|
@@ -326,6 +468,24 @@ export async function runCodex(opts) {
|
|
|
326
468
|
await rm(codexHome, { recursive: true, force: true }).catch(() => { });
|
|
327
469
|
}
|
|
328
470
|
}
|
|
471
|
+
/**
 * Whether a Codex `item.completed` event carries the model's ASSISTANT text (as
 * opposed to a command/exec/tool/reasoning item, which also carry a `text` field —
 * their command output or thinking — and must NOT be captured as the turn's response).
 * A message item's kind contains `message` (`agent_message`/`assistant_message`); an
 * item with no kind is treated as a message so older/simple shapes don't regress.
 *
 * @param {{item?: unknown}} event - A parsed Codex stream event.
 * @returns {boolean} `false` when `event.item` is not an object; otherwise `true`
 *   when the item's kind is empty or matches `/message/i`.
 */
function isCodexMessageItem(event) {
    const item = isObject(event.item) ? event.item : undefined;
    if (!item)
        return false;
    // The kind is read from `item_type` first, then `type`; a missing kind becomes ''.
    const kind = typeof item.item_type === 'string'
        ? item.item_type
        : typeof item.type === 'string'
            ? item.type
            : '';
    return kind === '' || /message/i.test(kind);
}
|
|
329
489
|
/** Best-effort: pull a textual message out of a Codex event. */
|
|
330
490
|
function extractText(event) {
|
|
331
491
|
if (typeof event.message === 'string')
|
|
@@ -367,6 +527,8 @@ function codexPlanProgress(event) {
|
|
|
367
527
|
* other shapes put it on `usage` / `info.usage` directly. We read the cumulative
|
|
368
528
|
* total when present so the caller can simply overwrite (not sum) — summing
|
|
369
529
|
* cumulative totals across events would multiply-count. Checked most-likely first.
|
|
530
|
+
* `input_tokens` is the TOTAL prompt count (OpenAI semantics: `cached_input_tokens`
|
|
531
|
+
* is a subset already inside it), so it is NOT summed with the cached share.
|
|
370
532
|
*/
|
|
371
533
|
function codexUsage(event) {
|
|
372
534
|
const info = isObject(event.info) ? event.info : undefined;
|
|
@@ -376,12 +538,31 @@ function codexUsage(event) {
|
|
|
376
538
|
(info && isObject(info.usage) ? info.usage : undefined);
|
|
377
539
|
if (!isObject(raw))
|
|
378
540
|
return undefined;
|
|
379
|
-
const input = numberOf(raw.input_tokens)
|
|
541
|
+
const input = numberOf(raw.input_tokens);
|
|
380
542
|
const output = numberOf(raw.output_tokens);
|
|
381
543
|
if (input === 0 && output === 0)
|
|
382
544
|
return undefined;
|
|
383
545
|
return { inputTokens: input, outputTokens: output };
|
|
384
546
|
}
|
|
547
|
+
/**
 * Per-TURN Codex token usage off a `token_count` event's `info.last_token_usage` (the
 * delta for the turn just completed, as opposed to `codexUsage`'s cumulative total).
 * `input_tokens` is the total prompt count for the turn and already INCLUDES the cached
 * share (OpenAI semantics), so `cachedInputTokens` is surfaced as the subset it is —
 * NOT added on top (adding it would double-count every cached token).
 *
 * @param {{info?: unknown}} event - A parsed Codex stream event.
 * @returns {{inputTokens: number, cachedInputTokens: number, outputTokens: number} | undefined}
 *   The turn's counts, or `undefined` when `info.last_token_usage` is absent or when
 *   both the input and output counts are zero.
 */
function codexLastTurnUsage(event) {
    const info = isObject(event.info) ? event.info : undefined;
    const raw = info && isObject(info.last_token_usage) ? info.last_token_usage : undefined;
    if (!isObject(raw))
        return undefined;
    const input = numberOf(raw.input_tokens);
    const cached = numberOf(raw.cached_input_tokens);
    const output = numberOf(raw.output_tokens);
    // An all-zero turn is treated as "no usage reported".
    if (input === 0 && output === 0)
        return undefined;
    return { inputTokens: input, cachedInputTokens: cached, outputTokens: output };
}
|
|
385
566
|
/**
 * Coerce an untrusted value to a usable count.
 *
 * @param {unknown} value - Any value.
 * @returns {number} `value` itself when it is a finite number; `0` for everything
 *   else (non-numbers, `NaN`, `Infinity`, `-Infinity`).
 */
function numberOf(value) {
    return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}
|
package/dist/agent.js
CHANGED
|
@@ -4,6 +4,7 @@ import { mkdir, mkdtemp, opendir, rm } from 'node:fs/promises';
|
|
|
4
4
|
import { execFile } from 'node:child_process';
|
|
5
5
|
import { promisify } from 'node:util';
|
|
6
6
|
import { standUpFrontend, tearDownFrontend } from './frontend-infra.js';
|
|
7
|
+
import { configurePackageRegistries } from './package-registries.js';
|
|
7
8
|
import { captureRedactedOutput, redactSecrets } from './redact.js';
|
|
8
9
|
import { cloneRepo, commitAll, conflictDiff, hasAgentChanges, headCommit, mergeBranch, openPullRequest, prepareExistingCheckout, pushBranch, reinitAndPush, unmergedPaths, } from './git.js';
|
|
9
10
|
import { noChangesReason, runCodingAgent } from './coding-agent.js';
|
|
@@ -196,6 +197,11 @@ async function cloneServiceCheckout(dir, job, signal) {
|
|
|
196
197
|
}
|
|
197
198
|
/** Run one generic agent job end to end, dispatching on `mode`. */
|
|
198
199
|
export async function handleAgent(job, opts = {}) {
|
|
200
|
+
// Private-registry auth first, before any mode runs: every mode with a checkout may
|
|
201
|
+
// install dependencies (the agent's own shell and the frontend-infra stand-up both
|
|
202
|
+
// inherit `HOME`, so they all read the written ~/.npmrc). A job with no entries
|
|
203
|
+
// clears any stale ~/.npmrc from a prior job on a reused (warm-pool) container.
|
|
204
|
+
await configurePackageRegistries(job.packageRegistries);
|
|
199
205
|
if (job.mode === 'preview')
|
|
200
206
|
return runPreviewMode(job, opts);
|
|
201
207
|
return job.mode === 'coding' ? runCodingMode(job, opts) : runExploreMode(job, opts);
|
|
@@ -341,7 +347,7 @@ async function runExploreMode(job, opts) {
|
|
|
341
347
|
try {
|
|
342
348
|
opts.onPhase?.('agent');
|
|
343
349
|
logger.info('agent(explore): running agent', { serviceDirectory });
|
|
344
|
-
const { summary, stats, stderrTail, usage, diagnostics: runDiag, } = await runAgentInWorkspace({
|
|
350
|
+
const { summary, stats, stderrTail, usage, callMetrics, diagnostics: runDiag, } = await runAgentInWorkspace({
|
|
345
351
|
dir: workDir,
|
|
346
352
|
systemPrompt: job.systemPrompt,
|
|
347
353
|
userPrompt,
|
|
@@ -368,6 +374,7 @@ async function runExploreMode(job, opts) {
|
|
|
368
374
|
error: noOutputReason(stats, stderrTail),
|
|
369
375
|
failureCause: 'no-usable-output',
|
|
370
376
|
...(usage ? { usage } : {}),
|
|
377
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
371
378
|
...infraSetupFields,
|
|
372
379
|
};
|
|
373
380
|
}
|
|
@@ -384,6 +391,7 @@ async function runExploreMode(job, opts) {
|
|
|
384
391
|
error: `the agent did not return a usable result: ${unusable}.${agentOutputTail(stderrTail, summary)}`,
|
|
385
392
|
failureCause: 'no-usable-output',
|
|
386
393
|
...(usage ? { usage } : {}),
|
|
394
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
387
395
|
...infraSetupFields,
|
|
388
396
|
};
|
|
389
397
|
}
|
|
@@ -391,7 +399,13 @@ async function runExploreMode(job, opts) {
|
|
|
391
399
|
// Prose: the summary IS the deliverable.
|
|
392
400
|
if (job.output?.kind !== 'structured') {
|
|
393
401
|
logger.info('agent(explore): done (prose)', { ...stats });
|
|
394
|
-
return {
|
|
402
|
+
return {
|
|
403
|
+
summary,
|
|
404
|
+
stats,
|
|
405
|
+
...(usage ? { usage } : {}),
|
|
406
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
407
|
+
...infraSetupFields,
|
|
408
|
+
};
|
|
395
409
|
}
|
|
396
410
|
// Structured: parse the agent's JSON. With repair enabled (default) a malformed
|
|
397
411
|
// reply gets ONE structured repair call before giving up; with `repair:false` we
|
|
@@ -432,6 +446,7 @@ async function runExploreMode(job, opts) {
|
|
|
432
446
|
error: noStructuredReason(stats, stderrTail, diagnostics),
|
|
433
447
|
failureCause: 'no-usable-output',
|
|
434
448
|
...(usage ? { usage } : {}),
|
|
449
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
435
450
|
...infraSetupFields,
|
|
436
451
|
};
|
|
437
452
|
}
|
|
@@ -451,7 +466,14 @@ async function runExploreMode(job, opts) {
|
|
|
451
466
|
custom.environment = reportedEnvironment;
|
|
452
467
|
}
|
|
453
468
|
logger.info('agent(explore): done (structured)', { ...stats });
|
|
454
|
-
return {
|
|
469
|
+
return {
|
|
470
|
+
summary,
|
|
471
|
+
custom,
|
|
472
|
+
stats,
|
|
473
|
+
...(usage ? { usage } : {}),
|
|
474
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
475
|
+
...infraSetupFields,
|
|
476
|
+
};
|
|
455
477
|
}
|
|
456
478
|
finally {
|
|
457
479
|
if (managed)
|
|
@@ -477,7 +499,7 @@ async function runCodingMode(job, opts) {
|
|
|
477
499
|
if (job.mergeBase)
|
|
478
500
|
return runConflictResolution(job, opts);
|
|
479
501
|
const pushBranch = job.pushBranch ?? job.newBranch ?? job.branch;
|
|
480
|
-
const { summary, stats, stderrTail, pushed, usage } = await runCodingAgent({
|
|
502
|
+
const { summary, stats, stderrTail, pushed, usage, callMetrics } = await runCodingAgent({
|
|
481
503
|
kind: 'agent',
|
|
482
504
|
jobId: job.jobId,
|
|
483
505
|
repo: job.repo,
|
|
@@ -504,7 +526,14 @@ async function runCodingMode(job, opts) {
|
|
|
504
526
|
if (!pushed) {
|
|
505
527
|
// A no-op: a failure for the implementer, a clean non-event for the fixers.
|
|
506
528
|
if (job.noChangesIsError === false) {
|
|
507
|
-
return {
|
|
529
|
+
return {
|
|
530
|
+
pushed: false,
|
|
531
|
+
branch: pushBranch,
|
|
532
|
+
summary,
|
|
533
|
+
stats,
|
|
534
|
+
...(usage ? { usage } : {}),
|
|
535
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
536
|
+
};
|
|
508
537
|
}
|
|
509
538
|
return {
|
|
510
539
|
pushed: false,
|
|
@@ -514,6 +543,7 @@ async function runCodingMode(job, opts) {
|
|
|
514
543
|
error: noChangesReason('the agent produced no file changes', stats, stderrTail),
|
|
515
544
|
failureCause: 'no-changes',
|
|
516
545
|
...(usage ? { usage } : {}),
|
|
546
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
517
547
|
};
|
|
518
548
|
}
|
|
519
549
|
// Changes are on the branch. Open a PR only when the job asked for one.
|
|
@@ -539,7 +569,14 @@ async function runCodingMode(job, opts) {
|
|
|
539
569
|
// this is the belt-and-suspenders path when the ahead-of-base check couldn't determine it.
|
|
540
570
|
if (prUrl === null) {
|
|
541
571
|
if (job.noChangesIsError === false) {
|
|
542
|
-
return {
|
|
572
|
+
return {
|
|
573
|
+
pushed: false,
|
|
574
|
+
branch: pushBranch,
|
|
575
|
+
summary,
|
|
576
|
+
stats,
|
|
577
|
+
...(usage ? { usage } : {}),
|
|
578
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
579
|
+
};
|
|
543
580
|
}
|
|
544
581
|
return {
|
|
545
582
|
pushed: false,
|
|
@@ -549,11 +586,27 @@ async function runCodingMode(job, opts) {
|
|
|
549
586
|
error: noChangesReason('the work branch has no commits ahead of its base (nothing to open a PR for)', stats, stderrTail),
|
|
550
587
|
failureCause: 'no-changes',
|
|
551
588
|
...(usage ? { usage } : {}),
|
|
589
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
552
590
|
};
|
|
553
591
|
}
|
|
554
|
-
return {
|
|
592
|
+
return {
|
|
593
|
+
pushed: true,
|
|
594
|
+
prUrl,
|
|
595
|
+
branch: pushBranch,
|
|
596
|
+
summary,
|
|
597
|
+
stats,
|
|
598
|
+
...(usage ? { usage } : {}),
|
|
599
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
600
|
+
};
|
|
555
601
|
}
|
|
556
|
-
return {
|
|
602
|
+
return {
|
|
603
|
+
pushed: true,
|
|
604
|
+
branch: pushBranch,
|
|
605
|
+
summary,
|
|
606
|
+
stats,
|
|
607
|
+
...(usage ? { usage } : {}),
|
|
608
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
609
|
+
};
|
|
557
610
|
}
|
|
558
611
|
/**
|
|
559
612
|
* Conflict-resolution coding flow (the conflict-resolver): clone the PR head `branch`
|
|
@@ -617,7 +670,7 @@ async function runConflictResolution(job, opts) {
|
|
|
617
670
|
logger.info('agent(conflict): resolving conflicts with agent', { conflicted });
|
|
618
671
|
const diff = await conflictDiff(dir, conflicted, signal);
|
|
619
672
|
const userPrompt = buildConflictPrompt(mergeBase, job.branch, conflicted, diff, job.userPrompt);
|
|
620
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
673
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
621
674
|
dir,
|
|
622
675
|
systemPrompt: job.systemPrompt,
|
|
623
676
|
userPrompt,
|
|
@@ -646,6 +699,7 @@ async function runConflictResolution(job, opts) {
|
|
|
646
699
|
error: unresolvedReason(unresolved, stats, stderrTail),
|
|
647
700
|
failureCause: 'agent',
|
|
648
701
|
...(usage ? { usage } : {}),
|
|
702
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
649
703
|
};
|
|
650
704
|
}
|
|
651
705
|
// Complete the merge commit with the agent's resolution staged, then push.
|
|
@@ -653,7 +707,14 @@ async function runConflictResolution(job, opts) {
|
|
|
653
707
|
opts.onPhase?.('push');
|
|
654
708
|
logger.info('agent(conflict): pushing resolved branch', { ...stats });
|
|
655
709
|
await pushBranch(dir, job.branch, job.ghToken, signal);
|
|
656
|
-
return {
|
|
710
|
+
return {
|
|
711
|
+
pushed: true,
|
|
712
|
+
branch: job.branch,
|
|
713
|
+
summary,
|
|
714
|
+
stats,
|
|
715
|
+
...(usage ? { usage } : {}),
|
|
716
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
717
|
+
};
|
|
657
718
|
});
|
|
658
719
|
}
|
|
659
720
|
/**
|
|
@@ -729,7 +790,7 @@ async function runBootstrap(job, opts) {
|
|
|
729
790
|
}
|
|
730
791
|
opts.onPhase?.('agent');
|
|
731
792
|
logger.info('agent(bootstrap): running agent');
|
|
732
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
793
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
733
794
|
dir,
|
|
734
795
|
systemPrompt: job.systemPrompt,
|
|
735
796
|
userPrompt: job.userPrompt,
|
|
@@ -749,7 +810,14 @@ async function runBootstrap(job, opts) {
|
|
|
749
810
|
if (!(await producedRepoContent(dir, !fromScratch, signal))) {
|
|
750
811
|
const error = bootstrapNoOpReason(!fromScratch, stats, summary, stderrTail);
|
|
751
812
|
logger.error('agent(bootstrap): agent produced no content, refusing to push', { ...stats });
|
|
752
|
-
return {
|
|
813
|
+
return {
|
|
814
|
+
summary,
|
|
815
|
+
stats,
|
|
816
|
+
error,
|
|
817
|
+
failureCause: 'agent',
|
|
818
|
+
...(usage ? { usage } : {}),
|
|
819
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
820
|
+
};
|
|
753
821
|
}
|
|
754
822
|
opts.onPhase?.('push');
|
|
755
823
|
logger.info('agent(bootstrap): pushing bootstrapped contents', { ...stats });
|
|
@@ -764,7 +832,13 @@ async function runBootstrap(job, opts) {
|
|
|
764
832
|
: `Bootstrap from ${job.repo.owner}/${job.repo.name}`,
|
|
765
833
|
});
|
|
766
834
|
logger.info('agent(bootstrap): complete', { defaultBranch: boot.target.defaultBranch });
|
|
767
|
-
return {
|
|
835
|
+
return {
|
|
836
|
+
defaultBranch: boot.target.defaultBranch,
|
|
837
|
+
summary,
|
|
838
|
+
stats,
|
|
839
|
+
...(usage ? { usage } : {}),
|
|
840
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
841
|
+
};
|
|
768
842
|
});
|
|
769
843
|
}
|
|
770
844
|
/**
|
package/dist/coding-agent.js
CHANGED
|
@@ -195,7 +195,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
195
195
|
try {
|
|
196
196
|
opts.onPhase?.('agent');
|
|
197
197
|
logger.info('coding-agent: running agent', { serviceDirectory });
|
|
198
|
-
const { summary, stats, stderrTail, usage } = await runAgentInWorkspace({
|
|
198
|
+
const { summary, stats, stderrTail, usage, callMetrics } = await runAgentInWorkspace({
|
|
199
199
|
dir: workDir,
|
|
200
200
|
systemPrompt: spec.systemPrompt,
|
|
201
201
|
userPrompt: spec.userPrompt,
|
|
@@ -265,6 +265,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
265
265
|
stats,
|
|
266
266
|
...(stderrTail ? { stderrTail } : {}),
|
|
267
267
|
...(usage ? { usage } : {}),
|
|
268
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
268
269
|
};
|
|
269
270
|
}
|
|
270
271
|
else {
|
|
@@ -278,6 +279,7 @@ export async function runCodingAgent(spec, opts = {}) {
|
|
|
278
279
|
stats,
|
|
279
280
|
...(stderrTail ? { stderrTail } : {}),
|
|
280
281
|
...(usage ? { usage } : {}),
|
|
282
|
+
...(callMetrics ? { callMetrics } : {}),
|
|
281
283
|
};
|
|
282
284
|
}
|
|
283
285
|
}
|
package/dist/job.js
CHANGED
|
@@ -165,6 +165,63 @@ function assertAllowedHost(rawUrl, path, env = process.env) {
|
|
|
165
165
|
throw new Error(`Invalid job: '${path}' host '${host}' is not an allowed GitHub host`);
|
|
166
166
|
}
|
|
167
167
|
}
|
|
168
|
+
/**
 * npm registry hosts the harness is willing to send a registry token to.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read
 *   `NPM_ALLOWED_REGISTRY_HOSTS` from.
 * @returns {Set<string>} The two built-in hosts (`registry.npmjs.org`,
 *   `npm.pkg.github.com`) plus every non-empty, trimmed, lower-cased entry of the
 *   comma-separated `NPM_ALLOWED_REGISTRY_HOSTS` value.
 */
export function allowedNpmRegistryHosts(env = process.env) {
    const hosts = new Set(['registry.npmjs.org', 'npm.pkg.github.com']);
    // Optional extra allowlist (comma-separated) for tests / bespoke deployments.
    for (const h of (env.NPM_ALLOWED_REGISTRY_HOSTS ?? '').split(',')) {
        const t = h.trim().toLowerCase();
        // Blank entries (unset variable, stray commas) are ignored.
        if (t)
            hosts.add(t);
    }
    return hosts;
}
|
|
179
|
+
/** An npm scope (`@org`) — same shape the backend validates at the write boundary. */
const NPM_SCOPE_PATTERN = /^@[a-z0-9~-][a-z0-9._~-]*$/i;
// A registry token is a single opaque string. Reject any whitespace / control
// character: a newline in the token would inject arbitrary lines into the rendered
// `~/.npmrc` (a second, forged registry/_authToken line). Mirrors the backend's
// write-boundary constraint so a drifted body can't slip a multiline token past.
// The class below admits only printable ASCII 0x21–0x7E, one or more characters.
const NPM_TOKEN_PATTERN = /^[\x21-\x7e]+$/;
/**
 * Validate the optional `packageRegistries` list (see {@link PackageRegistrySpec}).
 *
 * @param {unknown} value - The raw `packageRegistries` field of a job.
 * @param {Record<string, string | undefined>} [env=process.env] - Environment passed
 *   through to `allowedNpmRegistryHosts` to build the host allowlist.
 * @returns {Array<{ecosystem: 'npm', host: string, scopes: string[], token: string}>}
 *   The validated npm entries in input order; `[]` when `value` is `undefined` or
 *   `null`. Entries whose `ecosystem` is not `'npm'` are skipped.
 * @throws {Error} When `value` is not an array, an entry is not an object, a host is
 *   not on the allowlist, `scopes` is not a non-empty array, a scope does not match
 *   `NPM_SCOPE_PATTERN`, or the token does not match `NPM_TOKEN_PATTERN`. `str`
 *   (defined elsewhere in this file) presumably also throws for non-string fields —
 *   confirm against its definition.
 */
export function parsePackageRegistries(value, env = process.env) {
    if (value === undefined || value === null)
        return [];
    if (!Array.isArray(value))
        throw new Error("Invalid job: 'packageRegistries' must be an array");
    const allowed = allowedNpmRegistryHosts(env);
    const entries = [];
    for (const [i, raw] of value.entries()) {
        if (typeof raw !== 'object' || raw === null) {
            throw new Error(`Invalid job: 'packageRegistries[${i}]' must be an object`);
        }
        const entry = raw;
        // Unknown ecosystems are additive: a newer backend may send pip/maven entries an
        // older image doesn't understand yet — skip them rather than failing the job.
        if (entry.ecosystem !== 'npm')
            continue;
        // Hosts are compared trimmed and lower-cased, matching how the allowlist is built.
        const host = str(entry.host, `packageRegistries[${i}].host`).trim().toLowerCase();
        if (!allowed.has(host)) {
            throw new Error(`Invalid job: 'packageRegistries[${i}].host' '${host}' is not an allowed npm registry host`);
        }
        if (!Array.isArray(entry.scopes) || entry.scopes.length === 0) {
            throw new Error(`Invalid job: 'packageRegistries[${i}].scopes' must be a non-empty array`);
        }
        const scopes = entry.scopes.map((scope, j) => {
            const s = str(scope, `packageRegistries[${i}].scopes[${j}]`).trim();
            if (!NPM_SCOPE_PATTERN.test(s)) {
                throw new Error(`Invalid job: 'packageRegistries[${i}].scopes[${j}]' must look like @org`);
            }
            return s;
        });
        // The token is deliberately NOT trimmed: surrounding whitespace is rejected below.
        const token = str(entry.token, `packageRegistries[${i}].token`);
        if (!NPM_TOKEN_PATTERN.test(token)) {
            throw new Error(`Invalid job: 'packageRegistries[${i}].token' must not contain spaces or control characters`);
        }
        entries.push({ ecosystem: 'npm', host, scopes, token });
    }
    return entries;
}
|
|
168
225
|
/** Parse the coding-mode bootstrap spec, or undefined when absent. Validates the target. */
|
|
169
226
|
function parseAgentBootstrapSpec(value) {
|
|
170
227
|
if (typeof value !== 'object' || value === null)
|
|
@@ -372,6 +429,7 @@ export function parseAgentJob(input) {
|
|
|
372
429
|
const infra = parseAgentInfraSpec(o.infra);
|
|
373
430
|
const bootstrap = parseAgentBootstrapSpec(o.bootstrap);
|
|
374
431
|
const contextFiles = parseContextFiles(o.contextFiles);
|
|
432
|
+
const packageRegistries = parsePackageRegistries(o.packageRegistries);
|
|
375
433
|
const guardLimits = parseGuardLimits(o.guardLimits);
|
|
376
434
|
const job = {
|
|
377
435
|
jobId: str(o.jobId, 'jobId'),
|
|
@@ -391,6 +449,7 @@ export function parseAgentJob(input) {
|
|
|
391
449
|
...(bootstrap ? { bootstrap } : {}),
|
|
392
450
|
...(output ? { output } : {}),
|
|
393
451
|
...(contextFiles.length ? { contextFiles } : {}),
|
|
452
|
+
...(packageRegistries.length ? { packageRegistries } : {}),
|
|
394
453
|
...(infra ? { infra } : {}),
|
|
395
454
|
...(typeof o.newBranch === 'string' && o.newBranch ? { newBranch: o.newBranch } : {}),
|
|
396
455
|
...(typeof o.pushBranch === 'string' && o.pushBranch ? { pushBranch: o.pushBranch } : {}),
|