npm - @alis-build/harness-eval - Versions diffs - 0.1.1 → 0.1.3 - Mend

@alis-build/harness-eval 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

package/README.md +104 -10
package/dist/adapters/claude-code/index.d.ts +2 -2
package/dist/adapters/claude-code/index.js +2 -1
package/dist/adapters/codex/index.d.ts +68 -0
package/dist/adapters/codex/index.js +3 -0
package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} +37 -250
package/dist/claude-code-C_7hxC8z.js.map +1 -0
package/dist/cli/bin.js +204 -127
package/dist/cli/bin.js.map +1 -1
package/dist/codex-0cHO2te9.js +496 -0
package/dist/codex-0cHO2te9.js.map +1 -0
package/dist/config/loader.d.ts +2 -2
package/dist/config/loader.js +2 -2
package/dist/{index-6Z17eKZx.d.ts → index-DnvP1UBl.d.ts} +3 -2
package/dist/index.d.ts +397 -153
package/dist/index.js +125 -5
package/dist/index.js.map +1 -0
package/dist/loader-B1WmGGzf.d.ts +107 -0
package/dist/{loader-BCnFJ8rm.js → loader-DnQ6Jt0i.js} +707 -157
package/dist/loader-DnQ6Jt0i.js.map +1 -0
package/dist/reporter-Biy-5-9M.js +2216 -0
package/dist/reporter-Biy-5-9M.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-BoOvK_lq.d.ts → suite-BEShV0by.d.ts} +7 -2
package/dist/{suite-chj0j22j.js → suite-BcP64nlb.js} +72 -4
package/dist/suite-BcP64nlb.js.map +1 -0
package/dist/{types-BQol062t.d.ts → types-0QkNVyp9.d.ts} +152 -11
package/dist/types-Bac8_Ixb.js +246 -0
package/dist/types-Bac8_Ixb.js.map +1 -0
package/dist/types-Bu8uOZZN.d.ts +77 -0
package/dist/{types-B9H4IZtA.d.ts → types-C0gBkl0-.d.ts} +3 -2
package/package.json +7 -2
package/schemas/eval-interchange-instances.schema.json +196 -0
package/schemas/eval-interchange.schema.json +65 -52
package/schemas/eval-run-envelope.schema.json +182 -425
package/dist/build-DsVJ_UeU.js +0 -1396
package/dist/build-DsVJ_UeU.js.map +0 -1
package/dist/claude-code-ycT0JQZF.js.map +0 -1
package/dist/loader-BCnFJ8rm.js.map +0 -1
package/dist/loader-DTvoVfN0.d.ts +0 -33
package/dist/suite-chj0j22j.js.map +0 -1
package/schemas/eval-interchange-agent-trace.schema.json +0 -322
package/schemas/eval-interchange-proto-instance.schema.json +0 -106

package/README.md CHANGED Viewed

@@ -54,10 +54,11 @@ pnpm exec harness-eval --help
 Suites are YAML files. Committed examples:
-- [`examples/basic.yaml`](examples/basic.yaml) — smoke test using the built-in `Read` tool on this repo's README
+- [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
+- [`examples/basic.yaml`](examples/basic.yaml) — minimal smoke test using the built-in `Read` tool on this repo's README
 - [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
 - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
-- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config for `harness-eval grade`
+- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config (alternate to inline `judge:`)
 ```yaml
 adapter: claude-code
@@ -96,9 +97,15 @@ cases:
 Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
+**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
 ### 2. Run behavioral eval
 ```bash
+# Unified pipeline (run + optional grade + envelope when pipeline: is defined)
+npx @alis-build/harness-eval pipeline examples/pipeline/
+# Or run harness only
 npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
 ```
@@ -110,7 +117,14 @@ Exit code `0` = all cells passed all assertion thresholds.
 ### 3. Grade outcomes (optional)
-Judge model, timeout, env, and `claudeCode` flags live in a separate **`grading.yaml`** (not in the suite file). See [`examples/grading.yaml`](examples/grading.yaml).
+**Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
+```bash
+npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
+# or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
+```
+**Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
 ```bash
 npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
@@ -399,6 +413,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
 npx @alis-build/harness-eval run <suite.yaml> [options]
 npx @alis-build/harness-eval grade <report.json> [options]
 npx @alis-build/harness-eval envelope <report.json> [options]
+npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
 npx @alis-build/harness-eval format <report.json> [options]
 npx @alis-build/harness-eval --help
 ```
@@ -420,7 +435,9 @@ npx @alis-build/harness-eval --help
 ### `grade`
-Uses a standalone **`grading.yaml`** for judge model, timeout, env, and `claudeCode` flags (Option B — separate from the suite file).
+Uses **`grading.yaml`** or an inline **`judge:`** block in `suite.yaml` (`--suite`).
+**Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
 ```yaml
 # examples/grading.yaml
@@ -440,6 +457,7 @@ npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --
 | Option                                 | Description                                                       |
 | -------------------------------------- | ----------------------------------------------------------------- |
 | `--config <path>`                      | Grading YAML (`judge` block) — model, env, timeout, `claudeCode`  |
+| `--suite <path>`                       | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
 | `--output <path>`                      | Write grading JSON                                                |
 | `--expectations <path>`                | Sidecar YAML/JSON if report lacks expectations                    |
 | `--format console\|json`               | Output format                                                     |
@@ -467,7 +485,7 @@ npx @alis-build/harness-eval envelope report.json --suite examples/basic.yaml --
 # Interchange projections
 npx @alis-build/harness-eval envelope report.json --projection trajectory --output trajectory.jsonl
 npx @alis-build/harness-eval envelope report.json --projection instances --output instances.json
-npx @alis-build/harness-eval envelope report.json --projection agent-trace --output agent-traces.json
+npx @alis-build/harness-eval envelope report.json --projection instances --output instances.jsonl
 ```
 | Option                                                      | Description                                               |
@@ -475,12 +493,34 @@ npx @alis-build/harness-eval envelope report.json --projection agent-trace --out
 | `--output <path>`                                           | Write output (stdout if omitted)                          |
 | `--grading <path>`                                          | Merge `grading.json` outcome scores into the envelope     |
 | `--suite <path>`                                            | Suite YAML for provenance (`uri`, `contentHash`)          |
-| `--projection envelope\|trajectory\|instances\|agent-trace` | Output shape (default: `envelope`)                        |
+| `--projection envelope\|trajectory\|instances` | Output shape (default: `envelope`)                        |
 | `--include-raw-stream-events`                               | Include adapter raw stream events in repetition artifacts |
 | `--no-transcript`                                           | Omit judge transcript artifacts                           |
 Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
+### `pipeline`
+Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
+```bash
+npx @alis-build/harness-eval pipeline examples/pipeline/
+npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
+```
+| Option | Description |
+| ------ | ----------- |
+| `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
+| `--output <path>` | Override `pipeline.run.output` |
+| `--report <path>` | Override report input for grade/envelope |
+| `--grading <path>` | Override grading input for envelope |
+| `--grading-output <path>` | Override `pipeline.grade.output` |
+| `--envelope-output <path>` | Override `pipeline.envelope.output` |
+| `--projection envelope\|trajectory\|instances` | Envelope projection |
+| `--max-concurrent <n>` | Parallel harness/judge workers |
+Exit codes match the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
 ### `format`
 Re-render an existing `report.json` without re-running the harness.
@@ -509,6 +549,8 @@ See [Data contracts & schemas](#data-contracts--schemas) for type details.
 ## Suite concepts
+**Authoring reference:** [docs/suite-config.md](docs/suite-config.md) — complete field list for suite YAML, matrix cells, test cases, reference trajectories, and grading config.
 ### Test case
 One prompt + assertions + optional expectations, run N times per matrix cell.
@@ -533,13 +575,17 @@ assertions:
 Default threshold is `1.0` (every evaluated rep must pass). Reps where the harness crashes are excluded from the denominator and counted as `adapterErrors`.
+### Reference trajectory (optional)
+Define expected tool calls for Vertex trajectory metrics on the eval envelope. Use `tool_name_mode: bare` when reference steps use short tool names but the harness records MCP-prefixed names. See [docs/suite-config.md — Reference trajectory](docs/suite-config.md#reference-trajectory).
 **Full reference:** [docs/assertions.md](docs/assertions.md) — all assertion kinds, predicates, statistical model, and how to add new assertion types or harness adapters.
 ---
 ## Adding harness adapters
-Built-in adapters register at module load. Today only `claude-code` ships; additional harnesses (Codex, Gemini CLI, Antigravity CLI) plug in via the same pattern:
+Built-in adapters register at module load. **`claude-code`** and **`codex`** ship today; additional harnesses (Gemini CLI, Antigravity CLI) plug in via the same pattern:
 1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
 2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -554,7 +600,7 @@ import {
 } from "@alis-build/harness-eval";
 registerAdapter("my-harness", myAdapter);
-console.log(listAdapters()); // ["claude-code", "my-harness"]
+console.log(listAdapters()); // ["claude-code", "codex", "my-harness"]
 ```
 Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -610,12 +656,55 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
 ---
+## Codex CLI adapter
+Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
+The harness adapter invokes:
+```bash
+codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
+```
+`--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
+| Field | CLI flag | Notes |
+| ----- | -------- | ----- |
+| `binary` | — | Default `codex` |
+| `model` | `--model` | Also settable at top level |
+| `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
+| `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
+| `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
+| `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
+| `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
+| `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
+| `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
+| `ephemeral` | `--ephemeral` | No session rollout files |
+| `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
+| `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
+| `outputSchema` | `--output-schema` | JSON Schema for structured final output |
+| `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
+| `captureLastMessage` | — | Default `true`: auto `--output-last-message` and read into `finalResponse` if JSONL has no assistant text |
+| `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
+Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
+The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
+**Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
+**Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
+---
 ## Library API
 ```typescript
 import {
   loadSuite,
+  loadSuiteDocument,
   runSuite,
+  runPipeline,
   gradeReport,
   buildEvalRunEnvelope,
   trajectoryToTranscript,
@@ -625,6 +714,11 @@ import {
 } from "@alis-build/harness-eval";
 import { loadGradingConfig } from "@alis-build/harness-eval/config";
+// Unified pipeline
+const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
+const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
+// Or step-by-step
 const suite = await loadSuite("./examples/basic.yaml");
 const report = await runSuite(suite, { maxConcurrent: 2 });
@@ -649,7 +743,7 @@ const envelope = buildEvalRunEnvelope(report, {
 });
 ```
-Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
+Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
 ---
@@ -690,7 +784,7 @@ pnpm run typecheck
 pnpm run generate-schemas   # Zod → schemas/*.schema.json only
 ```
-**Docs:** [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
+**Docs:** [Suite & grading YAML](docs/suite-config.md) · [Assertion DSL & adapter extension](docs/assertions.md) · [Eval record contract (DB / CI)](docs/eval-record.md)
 ---

package/dist/adapters/claude-code/index.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-B9H4IZtA.js";
-import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-6Z17eKZx.js";
+import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-C0gBkl0-.js";
+import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-DnvP1UBl.js";
 export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };

package/dist/adapters/claude-code/index.js CHANGED Viewed

@@ -1,2 +1,3 @@
-import { a as AdapterError, r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-ycT0JQZF.js";
+import { t as AdapterError } from "../../types-Bac8_Ixb.js";
+import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
 export { AdapterError, claudeCodeAdapter, runClaudeCode };

package/dist/adapters/codex/index.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-C0gBkl0-.js";
+import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-Bu8uOZZN.js";
+//#region src/adapters/codex/map-events.d.ts
+/** Stateful mapper — tracks session id and pending tool calls across the stream. */
+declare class CodexEventMapper {
+  private sessionId;
+  private sawInit;
+  private startedItems;
+  private turnCount;
+  /** Map one parsed Codex JSON object to zero or more stream events. */
+  map(event: CodexJsonEvent): StreamEvent[];
+  private buildInit;
+  private ensureInit;
+  private mapItemStarted;
+  private mapItemCompleted;
+  private toolUseEvent;
+  private commandUseEvent;
+  private toolResultEvent;
+  private buildResult;
+}
+/** Map an entire fixture or stream of Codex events through a fresh mapper. */
+declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
+/** Build harness-qualified MCP tool name from Codex server + tool fields. */
+declare function mcpToolName(server: string, tool: string): string;
+//#endregion
+//#region src/adapters/codex/flags.d.ts
+/** Prepend global flags that must appear before the `exec` subcommand. */
+declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
+/** Append `codex exec` subcommand flags (after `exec`, before prompt). */
+declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): void;
+/** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
+declare function appendCodexFlags(args: string[], config: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): void;
+/**
+ * Ensure harness runs pass `--output-last-message` when capture is enabled.
+ * Returns the auto-generated path (for cleanup), or null if unchanged.
+ */
+declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
+/**
+ * Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
+ *
+ * Expects `config.outputLastMessage` to already be set if capture is desired;
+ * call {@link ensureHarnessOutputLastMessage} before this if spawning outside
+ * of {@link spawnCodex}.
+ */
+declare function buildArgs(config: CodexAdapterConfig): string[];
+/**
+ * Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
+ */
+declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): string[];
+//#endregion
+//#region src/adapters/codex/index.d.ts
+/** Run Codex in headless `exec --json` mode and return a trajectory. */
+declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
+/** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
+declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
+//#endregion
+export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
+//# sourceMappingURL=index.d.ts.map

package/dist/adapters/codex/index.js ADDED Viewed

@@ -0,0 +1,3 @@
+import { t as AdapterError } from "../../types-Bac8_Ixb.js";
+import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
+export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };

package/dist/{claude-code-ycT0JQZF.js → claude-code-C_7hxC8z.js} RENAMED Viewed

@@ -1,235 +1,9 @@
 import { t as __exportAll } from "./rolldown-runtime-D7D4PA-g.js";
+import { n as TrajectoryBuilder, t as AdapterError } from "./types-Bac8_Ixb.js";
 import { spawn } from "node:child_process";
 import { mkdtemp, rm } from "node:fs/promises";
 import { tmpdir } from "node:os";
 import { join } from "node:path";
-//#region src/types/stream.ts
-/** Type guards. Prefer these over manual `e.type === "..."` checks at call sites. */
-function isSystemInit(e) {
-	return e.type === "system" && e.subtype === "init";
-}
-function isSystemRetry(e) {
-	return e.type === "system" && e.subtype === "api_retry";
-}
-function isAssistantMessage(e) {
-	return e.type === "assistant";
-}
-function isUserMessage(e) {
-	return e.type === "user";
-}
-function isResult(e) {
-	return e.type === "result";
-}
-function isTextBlock(b) {
-	return b.type === "text";
-}
-function isToolUseBlock(b) {
-	return b.type === "tool_use";
-}
-function isToolResultBlock(b) {
-	return b.type === "tool_result";
-}
-//#endregion
-//#region src/types/trajectory.ts
-/**
-* Extract the MCP namespace prefix from a tool name.
-*
-* Claude Code formats MCP tool names as `mcp__<server>__<tool>`. The namespace
-* is the first two segments joined: `mcp__<server>`. Returns null for non-MCP
-* tool names (built-ins like `Bash`, `Read`, `Edit`).
-*
-* @example
-*   namespaceOf("mcp__api__search_skills") // "mcp__api"
-*   namespaceOf("Bash")                     // null
-*/
-function namespaceOf(toolName) {
-	if (!toolName.startsWith("mcp__")) return null;
-	const parts = toolName.split("__");
-	if (parts.length < 3) return null;
-	return `${parts[0]}__${parts[1]}`;
-}
-//#endregion
-//#region src/trajectory/builder.ts
-/**
-* TrajectoryBuilder — consumes a stream of {@link StreamEvent} values and
-* produces a {@link TrajectoryView}.
-*
-* State machine: the builder is a small, tolerant state machine. Invariants:
-*
-*   - Exactly one `system/init` event opens the session. The builder requires
-*     it to be present before `build()`.
-*   - Each `assistant` event begins a new turn. Text blocks accumulate into
-*     the turn's text; `tool_use` blocks become `ToolCall` records.
-*   - `user` events with `tool_result` blocks deliver tool results back. We
-*     match them to pending calls by `tool_use_id`.
-*   - One `result` event closes the session and carries aggregate usage.
-*
-* The builder is *tolerant of partial streams*: a process killed mid-run
-* produces a coherent (but flagged) view. Tool calls without matching results
-* keep `result: null`. The `success` flag reflects whether a successful result
-* event was actually observed.
-*
-* Why a class (not a reducer)?
-*   The internal `pendingCalls` map is mutable by design — we modify ToolCall
-*   objects in place when results arrive, so other parts of the view (which
-*   hold references to the same objects) see the update for free. A reducer
-*   would force a deep copy per result event, which is wasteful and would
-*   complicate identity-based queries.
-*/
-var TrajectoryBuilder = class {
-	meta = null;
-	sessionStartTs = null;
-	turns = [];
-	allToolCalls = [];
-	/**
-	* tool_use_id → ToolCall, for matching results back to calls.
-	* Entries are removed once a result is observed.
-	*/
-	pendingCalls = /* @__PURE__ */ new Map();
-	retries = [];
-	finalUsage = null;
-	finalCostUsd = 0;
-	finalDurationMs = 0;
-	finalNumTurns = 0;
-	finalResultText = "";
-	sawResultEvent = false;
-	resultIsError = false;
-	/**
-	* Consume one event. Safe to call with events in stream order.
-	*
-	* Unknown event types are silently ignored — the schema evolves and we
-	* don't want CI to break on a new event type we haven't modelled.
-	*/
-	consume(event) {
-		if (isSystemInit(event)) {
-			this.meta = {
-				sessionId: event.session_id,
-				model: event.model,
-				cwd: event.cwd,
-				permissionMode: event.permissionMode,
-				availableTools: event.tools ?? [],
-				mcpServers: (event.mcp_servers ?? []).map((s) => ({
-					name: s.name,
-					status: s.status
-				}))
-			};
-			this.sessionStartTs = Date.now();
-			return;
-		}
-		if (event.type === "system" && event.subtype === "api_retry") {
-			this.retries.push({
-				offsetMs: this.sessionStartTs ? Date.now() - this.sessionStartTs : 0,
-				raw: event
-			});
-			return;
-		}
-		if (isAssistantMessage(event)) {
-			this.handleAssistantMessage(event);
-			return;
-		}
-		if (isUserMessage(event)) {
-			this.handleUserMessage(event);
-			return;
-		}
-		if (isResult(event)) {
-			this.sawResultEvent = true;
-			this.resultIsError = event.is_error;
-			this.finalUsage = event.usage ?? null;
-			this.finalCostUsd = event.total_cost_usd ?? 0;
-			this.finalDurationMs = event.duration_ms ?? 0;
-			this.finalNumTurns = event.num_turns ?? 0;
-			this.finalResultText = event.result ?? "";
-			return;
-		}
-	}
-	/**
-	* Finalize the view. Call after consuming the last event from the stream.
-	*
-	* Throws if no `system/init` was observed — at that point we have no model,
-	* no session id, and no available-tools list, which means assertions like
-	* "called any mcp__api__* tool" can't even be evaluated meaningfully.
-	*/
-	build() {
-		if (this.meta === null) throw new Error("TrajectoryBuilder.build() called before any system/init event was observed. The harness may have failed to start, or the stream was truncated before init.");
-		const lastTurn = this.turns[this.turns.length - 1];
-		const accumulatedText = this.turns.map((t) => t.text).filter((t) => t.length > 0).join("\n\n").trim();
-		return {
-			meta: this.meta,
-			toolCalls: this.allToolCalls,
-			turns: this.turns,
-			finalResponse: accumulatedText || this.finalResultText,
-			finalStopReason: lastTurn?.stopReason ?? null,
-			usage: {
-				inputTokens: this.finalUsage?.input_tokens ?? 0,
-				outputTokens: this.finalUsage?.output_tokens ?? 0,
-				totalCostUsd: this.finalCostUsd,
-				durationMs: this.finalDurationMs,
-				numTurns: this.finalNumTurns || this.turns.length
-			},
-			retries: this.retries,
-			success: this.sawResultEvent && !this.resultIsError
-		};
-	}
-	handleAssistantMessage(event) {
-		const turnIndex = this.turns.length;
-		const textChunks = [];
-		const toolCallsThisTurn = [];
-		for (const block of event.message.content) {
-			if (isTextBlock(block)) {
-				textChunks.push(block.text);
-				continue;
-			}
-			if (isToolUseBlock(block)) {
-				const call = {
-					name: block.name,
-					namespace: namespaceOf(block.name),
-					callId: block.id,
-					args: block.input,
-					result: null,
-					isError: false,
-					turnIndex,
-					callIndex: this.allToolCalls.length
-				};
-				this.allToolCalls.push(call);
-				this.pendingCalls.set(block.id, call);
-				toolCallsThisTurn.push(call);
-				continue;
-			}
-		}
-		this.turns.push({
-			turnIndex,
-			text: textChunks.join("").trim(),
-			toolCalls: toolCallsThisTurn,
-			stopReason: event.message.stop_reason ?? null
-		});
-	}
-	handleUserMessage(event) {
-		const content = event.message.content;
-		if (typeof content === "string") return;
-		for (const block of content) {
-			if (!isToolResultBlock(block)) continue;
-			const call = this.pendingCalls.get(block.tool_use_id);
-			if (!call) continue;
-			call.result = block.content;
-			call.isError = block.is_error ?? false;
-			this.pendingCalls.delete(block.tool_use_id);
-		}
-	}
-};
-/**
-* Convenience: drain an async iterable of events through a fresh builder.
-*
-* Suitable when you have the full event stream and just want the view.
-* For interactive/incremental scenarios (e.g. surfacing partial state in a UI)
-* instantiate {@link TrajectoryBuilder} directly and call `consume()` /
-* `build()` yourself.
-*/
-async function buildTrajectory(events) {
-	const builder = new TrajectoryBuilder();
-	for await (const event of events) builder.consume(event);
-	return builder.build();
-}
-//#endregion
 //#region src/parsers/stream-json.ts
 /**
 * Parse a readable stream of NDJSON into a sequence of typed stream-json events.
@@ -281,27 +55,16 @@ function tryParseLine(line) {
 	}
 }
 //#endregion
-//#region src/adapters/types.ts
-/**
-* Thrown when the harness fails to produce a usable trajectory.
-*
-* Most commonly this means the process failed before emitting a usable
-* session init event. Inspect `diagnostics.stderr` for the cause.
-*/
-var AdapterError = class extends Error {
-	diagnostics;
-	constructor(message, diagnostics) {
-		super(message);
-		this.diagnostics = diagnostics;
-		this.name = "AdapterError";
-	}
-};
-//#endregion
 //#region src/adapters/claude-code/flags.ts
+/** Append repeated `--flag value` pairs for array config fields. */
 function pushRepeatableFlag(args, flag, values) {
 	if (!values) return;
 	for (const value of values) args.push(flag, value);
 }
+/**
+* Append an optional CLI flag. Boolean `true` emits the flag alone; other
+* scalars emit `--flag value`.
+*/
 function pushOptionalFlag(args, flag, value) {
 	if (value === void 0) return;
 	if (typeof value === "boolean") {
@@ -360,7 +123,12 @@ function buildArgs(config) {
 	appendClaudeCodeFlags(args, config);
 	return args;
 }
-/** Build args for an LLM judge subprocess (`--output-format json`). */
+/**
+* Build args for an LLM judge subprocess (`--output-format json`).
+*
+* Defaults permission mode to `bypassPermissions` so the judge does not
+* block on tool permission prompts during single-shot JSON grading.
+*/
 function buildJudgeArgs(prompt, config = {}) {
 	const args = [
 		"-p",
@@ -402,6 +170,14 @@ const KILL_GRACE_MS = 5e3;
 /**
 * Spawn `claude` in headless mode with isolated config and a process-group
 * lifecycle. See {@link SpawnedClaude} for how to consume the result.
+*
+* **Kill sequence:** timeout and abort both follow the same two-step path:
+* `SIGTERM` to the process group, then `SIGKILL` after {@link KILL_GRACE_MS}
+* if the group is still alive. This avoids leaving MCP/tool subprocesses
+* running while still giving claude a chance to flush stream-json output.
+*
+* @param config - Adapter options; `timeoutMs`, `signal`, and `isolateConfig`
+*   control lifecycle and config isolation.
 */
 async function spawnClaude(config) {
 	const binary = config.binary ?? "claude";
@@ -425,6 +201,10 @@ async function spawnClaude(config) {
 	let timedOut = false;
 	let killEscalation = null;
 	const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+	/**
+	* Arm (or re-arm) the SIGKILL fallback. Each SIGTERM attempt gets its own
+	* grace window so a slow shutdown doesn't leave orphaned MCP servers.
+	*/
 	const scheduleKillEscalation = () => {
 		if (killEscalation) clearTimeout(killEscalation);
 		killEscalation = setTimeout(() => killTree(child, "SIGKILL"), KILL_GRACE_MS);
@@ -487,10 +267,16 @@ async function spawnClaude(config) {
 * group is already gone. This catches MCP server subprocesses and tool
 * processes spawned by claude.
 *
-* Why both? On some platforms the process group dies before we get here
-* (the child itself already cleaned up), in which case `kill(-pid)` throws
-* ESRCH. The fallback handles that edge case without leaking zombies in
-* the common case.
+* **Signal escalation:** callers typically invoke this first with `SIGTERM`,
+* then again with `SIGKILL` after {@link KILL_GRACE_MS}. The group kill is
+* essential — a bare `child.kill()` would leave MCP servers running.
+*
+* **Platform edge case:** when the group leader exits first, `kill(-pid)`
+* throws `ESRCH`. The single-PID fallback covers that without failing the
+* adapter run.
+*
+* @param child - Spawned process handle from {@link spawn}.
+* @param signal - POSIX signal to deliver (`SIGTERM` or `SIGKILL` in practice).
 */
 function killTree(child, signal) {
 	if (child.pid === void 0) return;
@@ -553,11 +339,12 @@ async function runClaudeCode(config) {
 		await spawned.cleanup();
 	}
 }
+/** Registered {@link HarnessAdapter} for Claude Code headless runs. */
 const claudeCodeAdapter = {
 	id: "claude-code",
 	run: runClaudeCode
 };
 //#endregion
-export { isUserMessage as _, AdapterError as a, buildTrajectory as c, isResult as d, isSystemInit as f, isToolUseBlock as g, isToolResultBlock as h, buildJudgeArgs as i, namespaceOf as l, isTextBlock as m, claude_code_exports as n, parseStreamJson as o, isSystemRetry as p, runClaudeCode as r, TrajectoryBuilder as s, claudeCodeAdapter as t, isAssistantMessage as u };
+export { parseStreamJson as a, buildJudgeArgs as i, claude_code_exports as n, runClaudeCode as r, claudeCodeAdapter as t };
-//# sourceMappingURL=claude-code-ycT0JQZF.js.map
+//# sourceMappingURL=claude-code-C_7hxC8z.js.map