npm - @alis-build/harness-eval - Versions diffs - 0.1.2 → 0.1.4 - Mend

@alis-build/harness-eval 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39) hide show

package/README.md +187 -30
package/dist/adapters/claude-code/index.d.ts +2 -2
package/dist/adapters/claude-code/index.js +2 -1
package/dist/adapters/codex/index.d.ts +68 -0
package/dist/adapters/codex/index.js +3 -0
package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
package/dist/claude-code-C_7hxC8z.js.map +1 -0
package/dist/cli/bin.js +131 -151
package/dist/cli/bin.js.map +1 -1
package/dist/codex-0cHO2te9.js +496 -0
package/dist/codex-0cHO2te9.js.map +1 -0
package/dist/config/loader.d.ts +2 -2
package/dist/config/loader.js +2 -2
package/dist/{index-V22PrR0p.d.ts → index-C56AEDUr.d.ts} +2 -2
package/dist/index.d.ts +134 -6
package/dist/index.js +6 -5
package/dist/index.js.map +1 -1
package/dist/{loader-DcI0KfRX.js → loader-CiBm4Kf6.js} +491 -209
package/dist/loader-CiBm4Kf6.js.map +1 -0
package/dist/loader-CrmzNwkq.d.ts +107 -0
package/dist/{projections-BcX7w-f6.js → reporter-BKCJZRYr.js} +1475 -729
package/dist/reporter-BKCJZRYr.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-Dlzl-HI0.js → suite-C3-8EjUW.js} +558 -4
package/dist/suite-C3-8EjUW.js.map +1 -0
package/dist/{suite-DPJMIEbu.d.ts → suite-qyOGre2g.d.ts} +2 -2
package/dist/types-Bac8_Ixb.js +246 -0
package/dist/types-Bac8_Ixb.js.map +1 -0
package/dist/{types-CD3TwOtZ.d.ts → types-CLt4Yygc.d.ts} +2 -2
package/dist/{types-B9H4IZtA.d.ts → types-D0HR2WnP.d.ts} +9 -2
package/dist/types-DFMpv_HJ.d.ts +77 -0
package/package.json +11 -2
package/schemas/eval-run-envelope.schema.json +193 -183
package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
package/dist/loader-C9yQHUPC.d.ts +0 -50
package/dist/loader-DcI0KfRX.js.map +0 -1
package/dist/projections-BcX7w-f6.js.map +0 -1
package/dist/suite-Dlzl-HI0.js.map +0 -1

package/README.md CHANGED Viewed

@@ -1,6 +1,8 @@
 # @alis-build/harness-eval
-Statistical eval framework for **AI coding agent harnesses** (Claude Code today; Cursor and Gemini planned). Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
+Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
+**Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
 **Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
 ## Requirements
 - Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
-- `claude` on `PATH` (for the Claude Code adapter)
-- Authentication for Claude Code:
-  - **Option A:** `claude login` and set `isolateConfig: false` in your suite (uses your normal plugins/MCP setup)
-  - **Option B:** `ANTHROPIC_API_KEY` with isolated config per run (default adapter behavior)
+- A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
+  - **`claude-code`** — `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
+  - **`codex`** — `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
+  - **`gemini-cli`** — `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
+### Authentication (by adapter)
+| Adapter | Typical auth |
+| ------- | ------------ |
+| **Claude Code** | `claude login` with `isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
+| **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
+| **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
+Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
 ---
@@ -54,13 +66,18 @@ pnpm exec harness-eval --help
 Suites are YAML files. Committed examples:
-- [`examples/basic.yaml`](examples/basic.yaml) — smoke test using the built-in `Read` tool on this repo's README
-- [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
+- [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
+- [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
+- [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
+- [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
+- [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
 - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
-- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config for `harness-eval grade`
+- [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
+- [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
+- [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
 ```yaml
-adapter: claude-code
+adapter: claude-code   # or: codex | gemini-cli
 defaultConfig:
   model: claude-sonnet-4-6
@@ -94,17 +111,21 @@ cases:
       - "The summary is grounded in README content, not a generic refusal"
 ```
-Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
+Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
-**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, `reference_trajectory`, `human_ratings`, multi-file layout, and `grading.yaml` options.
+**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
 ### 2. Run behavioral eval
 ```bash
+# Unified pipeline (run + optional grade + envelope when pipeline: is defined)
+npx @alis-build/harness-eval pipeline examples/pipeline/
+# Or run harness only
 npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
 ```
-This spawns Claude Code headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
+This spawns the configured harness CLI headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
 **Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
@@ -112,13 +133,20 @@ Exit code `0` = all cells passed all assertion thresholds.
 ### 3. Grade outcomes (optional)
-Judge model, timeout, env, and `claudeCode` flags live in a separate **`grading.yaml`** (not in the suite file). See [`examples/grading.yaml`](examples/grading.yaml).
+**Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
+```bash
+npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
+# or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
+```
+**Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
 ```bash
 npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
 ```
-Runs a separate Claude subprocess as **judge** against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
+Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
 Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
@@ -126,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
 ## Data contracts & schemas
-harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not Claude `stream-json` or OTLP as your primary record.
+harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
 ### Layering
 | Layer           | Type                  | Where                     | Use for                                            |
 | --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
-| Vendor stream   | `StreamEvent`         | `src/types/stream.ts`     | Claude `stream-json` debug only                    |
+| Vendor stream   | `StreamEvent`         | `src/types/stream.ts`     | Adapter debug only (Claude/Codex/Gemini NDJSON)    |
 | Harness session | **`TrajectoryView`**  | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input        |
 | Run report      | **`SuiteReport`**     | `report.json` from `run`  | Runner output; full trajectories + assertion stats |
 | Eval record     | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()`  | CI gates, APIs, DB storage                         |
@@ -256,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
 | ------------------------ | ------------------------------ | ------------------------------------------ |
 | Headless harness runs    | `run` / `runSuite`             | —                                          |
 | Tool-call behavior       | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls`      |
-| Outcome / rubric scoring | `grade` (Claude judge)         | Your judge, eval platform, or human review |
+| Outcome / rubric scoring | `grade` (built-in judges)      | Your judge, eval platform, or human review |
 | Storage contract         | `EvalRunEnvelope`              | Same envelope; attach `externalScores`     |
 ### Pattern 1 — Behavioral only (no LLM judge)
@@ -293,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
 const grading = await gradeReport(report, { gradeFn: myJudge });
 ```
-Output is the same `SuiteGradingReport` shape as the built-in Claude grader — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
+Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
 ### Pattern 3 — Separate judge pipeline (any language)
@@ -327,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
 ];
 ```
-**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw Claude `stream-json` (Claude-only and verbose).
+**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
 ### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
@@ -389,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
 | Layer        | Command | What it checks                          | Mechanism                                    |
 | ------------ | ------- | --------------------------------------- | -------------------------------------------- |
 | **Behavior** | `run`   | Tool calls, order, args, efficiency     | Deterministic assertions on `TrajectoryView` |
-| **Outcome**  | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse`    |
+| **Outcome**  | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
 Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
@@ -401,6 +429,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
 npx @alis-build/harness-eval run <suite.yaml> [options]
 npx @alis-build/harness-eval grade <report.json> [options]
 npx @alis-build/harness-eval envelope <report.json> [options]
+npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
 npx @alis-build/harness-eval format <report.json> [options]
 npx @alis-build/harness-eval --help
 ```
@@ -422,12 +451,12 @@ npx @alis-build/harness-eval --help
 ### `grade`
-Uses a standalone **`grading.yaml`** for judge model, timeout, env, and `claudeCode` flags (Option B — separate from the suite file).
+Uses **`grading.yaml`**, an inline **`judge:`** block in `suite.yaml` (`--suite`), or adapter-specific grading files under `examples/`.
 **Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
 ```yaml
-# examples/grading.yaml
+# examples/grading.yaml (Claude Code judge)
 judge:
   adapter: claude-code
   model: claude-sonnet-4-6
@@ -437,25 +466,38 @@ judge:
     permissionMode: bypassPermissions
 ```
+Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
 ```bash
 npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
+npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
+npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
 ```
 | Option                                 | Description                                                       |
 | -------------------------------------- | ----------------------------------------------------------------- |
-| `--config <path>`                      | Grading YAML (`judge` block) — model, env, timeout, `claudeCode`  |
+| `--config <path>`                      | Grading YAML (`judge` block) — model, env, timeout, adapter options |
+| `--suite <path>`                       | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
 | `--output <path>`                      | Write grading JSON                                                |
 | `--expectations <path>`                | Sidecar YAML/JSON if report lacks expectations                    |
 | `--format console\|json`               | Output format                                                     |
 | `--model <id>`                         | Overrides `judge.model` in config                                 |
-| `--binary <path>`                      | Overrides `judge.claudeCode.binary`                               |
+| `--binary <path>`                      | Overrides judge binary for the selected adapter                   |
 | `--timeout-ms <n>`                     | Overrides `judge.timeoutMs`                                       |
 | `--max-concurrent <n>`                 | Overrides `judge.maxConcurrent` (default: 2 if unset)             |
 | `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
 CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
-The built-in judge spawns Claude with **`--output-format json`** (single-shot response, not `stream-json`). It applies **safe defaults** so Claude Code does not reload plugins/MCP during grading: `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, plus `permissionMode: bypassPermissions` on the judge subprocess. Override in `judge.claudeCode` only if you need a different judge setup.
+**Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
+| Adapter | Defaults (summary) |
+| ------- | ------------------ |
+| `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; JSON output |
+| `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
+| `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
+See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
 Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
@@ -485,6 +527,28 @@ npx @alis-build/harness-eval envelope report.json --projection instances --outpu
 Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
+### `pipeline`
+Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
+```bash
+npx @alis-build/harness-eval pipeline examples/pipeline/
+npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
+```
+| Option | Description |
+| ------ | ----------- |
+| `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
+| `--output <path>` | Override `pipeline.run.output` |
+| `--report <path>` | Override report input for grade/envelope |
+| `--grading <path>` | Override grading input for envelope |
+| `--grading-output <path>` | Override `pipeline.grade.output` |
+| `--envelope-output <path>` | Override `pipeline.envelope.output` |
+| `--projection envelope\|trajectory\|instances` | Envelope projection |
+| `--max-concurrent <n>` | Parallel harness/judge workers |
+Exit codes match the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
 ### `format`
 Re-render an existing `report.json` without re-running the harness.
@@ -547,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
 ---
-## Adding harness adapters
+## Harness adapters
+Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
-Built-in adapters register at module load. Today only `claude-code` ships; additional harnesses (Codex, Gemini CLI, Antigravity CLI) plug in via the same pattern:
+| Adapter | Suite key | Example suite | Example judge |
+| ------- | --------- | ------------- | ------------- |
+| Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
+| Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
+| Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
+Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
 1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
 2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -564,7 +636,7 @@ import {
 } from "@alis-build/harness-eval";
 registerAdapter("my-harness", myAdapter);
-console.log(listAdapters()); // ["claude-code", "my-harness"]
+console.log(listAdapters()); // ["claude-code", "codex", "gemini-cli", …]
 ```
 Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -620,12 +692,92 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
 ---
+## Codex CLI adapter
+Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
+The harness adapter invokes:
+```bash
+codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
+```
+`--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
+| Field | CLI flag | Notes |
+| ----- | -------- | ----- |
+| `binary` | — | Default `codex` |
+| `model` | `--model` | Also settable at top level |
+| `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
+| `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
+| `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
+| `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
+| `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
+| `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
+| `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
+| `ephemeral` | `--ephemeral` | No session rollout files |
+| `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
+| `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
+| `outputSchema` | `--output-schema` | JSON Schema for structured final output |
+| `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
+| `captureLastMessage` | — | Default `true`: auto `--output-last-message` and read into `finalResponse` if JSONL has no assistant text |
+| `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
+Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
+The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
+**Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
+**Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
+**Package export:** `@alis-build/harness-eval/adapters/codex`
+---
+## Gemini CLI adapter
+Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
+The harness adapter invokes:
+```bash
+gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
+```
+| Field | CLI flag | Notes |
+| ----- | -------- | ----- |
+| `binary` | — | Default `gemini` |
+| `model` | `--model` | Also settable at top level |
+| `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
+| `sandbox` | `--sandbox` | Sandboxed execution |
+| `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
+| `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
+| `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
+| `extensions` | `--extensions` | Extension allowlist |
+| `debug` | `--debug` | Verbose logging |
+| `isolateConfig` | — | `false` (default) = inherit caller config; `true` = temp config dir per run |
+MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
+The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
+**Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
+**Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
+**Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
+---
 ## Library API
 ```typescript
 import {
   loadSuite,
+  loadSuiteDocument,
   runSuite,
+  runPipeline,
   gradeReport,
   buildEvalRunEnvelope,
   trajectoryToTranscript,
@@ -635,6 +787,11 @@ import {
 } from "@alis-build/harness-eval";
 import { loadGradingConfig } from "@alis-build/harness-eval/config";
+// Unified pipeline
+const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
+const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
+// Or step-by-step
 const suite = await loadSuite("./examples/basic.yaml");
 const report = await runSuite(suite, { maxConcurrent: 2 });
@@ -659,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
 });
 ```
-Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
+Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
 ---
@@ -682,7 +839,7 @@ Suite YAML  →  runSuite  →  Harness adapter  →  TrajectoryView
                           EvalRunEnvelope  →  DB / CI / API
 ```
-- **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
+- **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
 - **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
 - **OTLP** — observability side export; not required for scoring.

package/dist/adapters/claude-code/index.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-B9H4IZtA.js";
-import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-V22PrR0p.js";
+import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
+import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
 export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };

package/dist/adapters/claude-code/index.js CHANGED Viewed

@@ -1,2 +1,3 @@
-import { a as AdapterError, r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-DZ4Vkgp6.js";
+import { t as AdapterError } from "../../types-Bac8_Ixb.js";
+import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
 export { AdapterError, claudeCodeAdapter, runClaudeCode };

package/dist/adapters/codex/index.d.ts ADDED Viewed

@@ -0,0 +1,68 @@
+import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
+import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
+//#region src/adapters/codex/map-events.d.ts
+/** Stateful mapper — tracks session id and pending tool calls across the stream. */
+declare class CodexEventMapper {
+  private sessionId;
+  private sawInit;
+  private startedItems;
+  private turnCount;
+  /** Map one parsed Codex JSON object to zero or more stream events. */
+  map(event: CodexJsonEvent): StreamEvent[];
+  private buildInit;
+  private ensureInit;
+  private mapItemStarted;
+  private mapItemCompleted;
+  private toolUseEvent;
+  private commandUseEvent;
+  private toolResultEvent;
+  private buildResult;
+}
+/** Map an entire fixture or stream of Codex events through a fresh mapper. */
+declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
+/** Build harness-qualified MCP tool name from Codex server + tool fields. */
+declare function mcpToolName(server: string, tool: string): string;
+//#endregion
+//#region src/adapters/codex/flags.d.ts
+/** Append global flags, which must appear before the `exec` subcommand. */
+declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
+/** Append `codex exec` subcommand flags (after `exec`, before prompt). */
+declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): void;
+/** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
+declare function appendCodexFlags(args: string[], config: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): void;
+/**
+ * Ensure harness runs pass `--output-last-message` when capture is enabled.
+ * Returns the auto-generated path (for cleanup), or null if unchanged.
+ */
+declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
+/**
+ * Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
+ *
+ * Expects `config.outputLastMessage` to already be set if capture is desired;
+ * call {@link ensureHarnessOutputLastMessage} before this if spawning outside
+ * of {@link spawnCodex}.
+ */
+declare function buildArgs(config: CodexAdapterConfig): string[];
+/**
+ * Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
+ */
+declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
+  model?: string;
+  cwd?: string;
+}): string[];
+//#endregion
+//#region src/adapters/codex/index.d.ts
+/** Run Codex in headless `exec --json` mode and return a trajectory. */
+declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
+/** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
+declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
+//#endregion
+export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
+//# sourceMappingURL=index.d.ts.map

package/dist/adapters/codex/index.js ADDED Viewed

@@ -0,0 +1,3 @@
+import { t as AdapterError } from "../../types-Bac8_Ixb.js";
+import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
+export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };