@cat-factory/executor-harness 1.31.10 → 1.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-runner.js +206 -25
- package/dist/agent.js +81 -13
- package/dist/coding-agent.js +3 -1
- package/dist/frontend-infra.js +8 -5
- package/dist/job.js +7 -2
- package/package.json +3 -3
- package/src/agent-runner.ts +225 -25
- package/src/agent.ts +81 -12
- package/src/coding-agent.ts +6 -2
- package/src/frontend-infra.ts +8 -5
- package/src/job.ts +23 -3
- package/src/pi.ts +40 -0
package/src/job.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { PiRunStats } from './pi.js'
|
|
1
|
+
import type { HarnessCallMetric, PiRunStats } from './pi.js'
|
|
2
2
|
import type { HarnessKind } from './pi-workspace.js'
|
|
3
3
|
import type { FailureCause } from './failure.js'
|
|
4
4
|
|
|
@@ -146,7 +146,10 @@ function parseHarnessAuth(o: Record<string, unknown>): HarnessAuthFields {
|
|
|
146
146
|
* `..` segment) — the agent's cwd is built from this, so a hostile value must never
|
|
147
147
|
* point outside the cloned repo.
|
|
148
148
|
*/
|
|
149
|
-
function sanitizeServiceDirectory(
|
|
149
|
+
function sanitizeServiceDirectory(
|
|
150
|
+
value: unknown,
|
|
151
|
+
field = 'repo.serviceDirectory',
|
|
152
|
+
): string | undefined {
|
|
150
153
|
if (typeof value !== 'string') return undefined
|
|
151
154
|
const normalized = value
|
|
152
155
|
.trim()
|
|
@@ -156,7 +159,7 @@ function sanitizeServiceDirectory(value: unknown): string | undefined {
|
|
|
156
159
|
const segments = normalized.split('/').filter((s) => s !== '' && s !== '.')
|
|
157
160
|
if (segments.length === 0) return undefined
|
|
158
161
|
if (segments.some((s) => s === '..')) {
|
|
159
|
-
throw new Error(
|
|
162
|
+
throw new Error(`Invalid job: '${field}' must be a path inside the repo`)
|
|
160
163
|
}
|
|
161
164
|
return segments.join('/')
|
|
162
165
|
}
|
|
@@ -290,6 +293,12 @@ export interface ServiceInfraSpec {
|
|
|
290
293
|
*/
|
|
291
294
|
export interface FrontendInfraSpec {
|
|
292
295
|
kind: 'frontend'
|
|
296
|
+
/**
|
|
297
|
+
* The frontend app's subdirectory within the checkout (a monorepo frontend). Absent ⇒ the
|
|
298
|
+
* checkout root. When set, install/build/serve run there and `outputDir`/`wiremockMappingsPath`
|
|
299
|
+
* are resolved relative to it.
|
|
300
|
+
*/
|
|
301
|
+
directory?: string
|
|
293
302
|
/** Package manager for install/build. Default `pnpm`. */
|
|
294
303
|
packageManager?: 'pnpm' | 'npm' | 'yarn'
|
|
295
304
|
/** Explicit install command, overriding the one derived from `packageManager`. */
|
|
@@ -520,6 +529,12 @@ export interface AgentResult {
|
|
|
520
529
|
*/
|
|
521
530
|
failureCause?: FailureCause
|
|
522
531
|
usage?: { inputTokens: number; outputTokens: number }
|
|
532
|
+
/**
|
|
533
|
+
* Per-model-call telemetry from a subscription harness's CLI stream (absent for the
|
|
534
|
+
* proxy-metered Pi harness). The backend records these into `llm_call_metrics`. See
|
|
535
|
+
* {@link HarnessCallMetric}.
|
|
536
|
+
*/
|
|
537
|
+
callMetrics?: HarnessCallMetric[]
|
|
523
538
|
}
|
|
524
539
|
|
|
525
540
|
/** Parse the coding-mode bootstrap spec, or undefined when absent. Validates the target. */
|
|
@@ -659,8 +674,13 @@ function parseFrontendInfraSpec(o: Record<string, unknown>): FrontendInfraSpec {
|
|
|
659
674
|
}
|
|
660
675
|
const servePort = port(o.servePort)
|
|
661
676
|
const wiremockPort = port(o.wiremockPort)
|
|
677
|
+
// The app's monorepo subdirectory becomes the install/build/serve cwd, so it goes through the
|
|
678
|
+
// same escape-guard as `repo.serviceDirectory` — strip slashes and reject any `..` segment so a
|
|
679
|
+
// hostile value can't point the stand-up outside the cloned repo.
|
|
680
|
+
const directory = sanitizeServiceDirectory(o.directory, 'frontend.directory')
|
|
662
681
|
return {
|
|
663
682
|
kind: 'frontend',
|
|
683
|
+
...(directory ? { directory } : {}),
|
|
664
684
|
...(packageManager ? { packageManager } : {}),
|
|
665
685
|
...(typeof o.install === 'string' && o.install ? { install: o.install } : {}),
|
|
666
686
|
...(typeof o.buildScript === 'string' && o.buildScript ? { buildScript: o.buildScript } : {}),
|
package/src/pi.ts
CHANGED
|
@@ -414,6 +414,38 @@ export interface RunDiagnostics {
|
|
|
414
414
|
finalAnswerEmpty: boolean
|
|
415
415
|
}
|
|
416
416
|
|
|
417
|
+
/**
|
|
418
|
+
* One model call captured from a subscription harness's CLI event stream, shaped so
|
|
419
|
+
* the backend can record it into the same `llm_call_metrics` telemetry the LLM proxy
|
|
420
|
+
* writes for the Pi harness. The subscription harnesses (Claude Code / Codex) talk
|
|
421
|
+
* DIRECT to the vendor and never touch the proxy, so this is the only place their
|
|
422
|
+
* per-call bodies are observable. Claude Code's `stream-json --verbose` is a near-
|
|
423
|
+
* verbatim Anthropic Messages stream, so its calls carry full request/response
|
|
424
|
+
* bodies; Codex's `exec --json` only surfaces flat assistant text + per-turn tokens,
|
|
425
|
+
* so its rows are honestly thinner (no request transcript, no tool/command bodies).
|
|
426
|
+
*/
|
|
427
|
+
export interface HarnessCallMetric {
|
|
428
|
+
/** The vendor model that served this call (from the CLI event), when reported. */
|
|
429
|
+
model?: string
|
|
430
|
+
/**
|
|
431
|
+
* The full request as an OpenAI-style chat array (`[{role, content}, …]`),
|
|
432
|
+
* JSON-stringified — the growing history as of this call. Matches the proxy's
|
|
433
|
+
* `promptText` shape so the telemetry chain delta-compresses + renders identically.
|
|
434
|
+
*/
|
|
435
|
+
promptText: string
|
|
436
|
+
/** Number of messages encoded in {@link promptText} (the telemetry chain messageCount). */
|
|
437
|
+
messageCount: number
|
|
438
|
+
/** The assistant's response text, as a plain string (`''` for a tool-only turn). */
|
|
439
|
+
responseText: string
|
|
440
|
+
/** The reasoning/thinking trace, as a plain string (`''` when none). */
|
|
441
|
+
reasoningText: string
|
|
442
|
+
inputTokens: number
|
|
443
|
+
cachedInputTokens: number
|
|
444
|
+
outputTokens: number
|
|
445
|
+
/** The provider finish/stop reason when the CLI reports one (else null). */
|
|
446
|
+
finishReason: string | null
|
|
447
|
+
}
|
|
448
|
+
|
|
417
449
|
/** Pi's assistant summary plus {@link PiRunStats} describing what it did. */
|
|
418
450
|
export interface PiRunOutcome {
|
|
419
451
|
summary: string
|
|
@@ -432,6 +464,14 @@ export interface PiRunOutcome {
|
|
|
432
464
|
* (usage-aware rotation) and telemetry. Absent for the proxy-metered Pi harness.
|
|
433
465
|
*/
|
|
434
466
|
usage?: { inputTokens: number; outputTokens: number }
|
|
467
|
+
/**
|
|
468
|
+
* Per-model-call telemetry lifted from a subscription harness's CLI event stream
|
|
469
|
+
* (Claude Code / Codex), which the backend records into `llm_call_metrics` — the
|
|
470
|
+
* proxy-bypassing analogue of the per-call rows the LLM proxy writes for Pi. Absent
|
|
471
|
+
* for the proxy-metered Pi harness (the proxy is its metering point). See
|
|
472
|
+
* {@link HarnessCallMetric}.
|
|
473
|
+
*/
|
|
474
|
+
callMetrics?: HarnessCallMetric[]
|
|
435
475
|
/** Output-quality signals (truncation / empty final answer); see {@link RunDiagnostics}. */
|
|
436
476
|
diagnostics?: RunDiagnostics
|
|
437
477
|
}
|