npm - @alis-build/harness-eval - Versions diffs - 0.1.3 → 0.1.4 - Mend

@alis-build/harness-eval 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/README.md +101 -28
package/dist/adapters/claude-code/index.d.ts +2 -2
package/dist/adapters/codex/index.d.ts +2 -2
package/dist/cli/bin.js +3 -3
package/dist/config/loader.d.ts +1 -1
package/dist/config/loader.js +1 -1
package/dist/{index-DnvP1UBl.d.ts → index-C56AEDUr.d.ts} +2 -2
package/dist/index.d.ts +9 -7
package/dist/index.js +3 -3
package/dist/{loader-DnQ6Jt0i.js → loader-CiBm4Kf6.js} +21 -2
package/dist/loader-CiBm4Kf6.js.map +1 -0
package/dist/{loader-B1WmGGzf.d.ts → loader-CrmzNwkq.d.ts} +3 -3
package/dist/{reporter-Biy-5-9M.js → reporter-BKCJZRYr.js} +186 -17
package/dist/reporter-BKCJZRYr.js.map +1 -0
package/dist/runner/suite.d.ts +1 -1
package/dist/runner/suite.js +1 -1
package/dist/{suite-BcP64nlb.js → suite-C3-8EjUW.js} +543 -3
package/dist/suite-C3-8EjUW.js.map +1 -0
package/dist/{suite-BEShV0by.d.ts → suite-qyOGre2g.d.ts} +2 -2
package/dist/types-Bac8_Ixb.js.map +1 -1
package/dist/{types-0QkNVyp9.d.ts → types-CLt4Yygc.d.ts} +2 -2
package/dist/{types-C0gBkl0-.d.ts → types-D0HR2WnP.d.ts} +8 -2
package/dist/{types-Bu8uOZZN.d.ts → types-DFMpv_HJ.d.ts} +2 -2
package/package.json +6 -1
package/schemas/eval-run-envelope.schema.json +193 -183
package/dist/loader-DnQ6Jt0i.js.map +0 -1
package/dist/reporter-Biy-5-9M.js.map +0 -1
package/dist/suite-BcP64nlb.js.map +0 -1

package/README.md CHANGED Viewed

@@ -1,6 +1,8 @@
 # @alis-build/harness-eval
-Statistical eval framework for **AI coding agent harnesses** (Claude Code today; Cursor and Gemini planned). Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
+Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
+**Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
 **Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
 ## Requirements
 - Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
-- `claude` on `PATH` (for the Claude Code adapter)
-- Authentication for Claude Code:
-  - **Option A:** `claude login` and set `isolateConfig: false` in your suite (uses your normal plugins/MCP setup)
-  - **Option B:** `ANTHROPIC_API_KEY` with isolated config per run (default adapter behavior)
+- A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
+  - **`claude-code`** — `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
+  - **`codex`** — `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
+  - **`gemini-cli`** — `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
+### Authentication (by adapter)
+| Adapter | Typical auth |
+| ------- | ------------ |
+| **Claude Code** | `claude login` with `isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
+| **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
+| **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
+Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
 ---
@@ -55,13 +67,17 @@ pnpm exec harness-eval --help
 Suites are YAML files. Committed examples:
 - [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
-- [`examples/basic.yaml`](examples/basic.yaml) — minimal smoke test using the built-in `Read` tool on this repo's README
-- [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
+- [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
+- [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
+- [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
+- [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
 - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
-- [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config (alternate to inline `judge:`)
+- [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
+- [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
+- [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
 ```yaml
-adapter: claude-code
+adapter: claude-code   # or: codex | gemini-cli
 defaultConfig:
   model: claude-sonnet-4-6
@@ -95,7 +111,7 @@ cases:
       - "The summary is grounded in README content, not a generic refusal"
 ```
-Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
+Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
 **Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
@@ -109,7 +125,7 @@ npx @alis-build/harness-eval pipeline examples/pipeline/
 npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
 ```
-This spawns Claude Code headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
+This spawns the configured harness CLI headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
 **Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
@@ -130,7 +146,7 @@ npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.y
 npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
 ```
-Runs a separate Claude subprocess as **judge** against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
+Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
 Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
@@ -138,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
 ## Data contracts & schemas
-harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not Claude `stream-json` or OTLP as your primary record.
+harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
 ### Layering
 | Layer           | Type                  | Where                     | Use for                                            |
 | --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
-| Vendor stream   | `StreamEvent`         | `src/types/stream.ts`     | Claude `stream-json` debug only                    |
+| Vendor stream   | `StreamEvent`         | `src/types/stream.ts`     | Adapter debug only (Claude/Codex/Gemini NDJSON)    |
 | Harness session | **`TrajectoryView`**  | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input        |
 | Run report      | **`SuiteReport`**     | `report.json` from `run`  | Runner output; full trajectories + assertion stats |
 | Eval record     | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()`  | CI gates, APIs, DB storage                         |
@@ -268,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
 | ------------------------ | ------------------------------ | ------------------------------------------ |
 | Headless harness runs    | `run` / `runSuite`             | —                                          |
 | Tool-call behavior       | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls`      |
-| Outcome / rubric scoring | `grade` (Claude judge)         | Your judge, eval platform, or human review |
+| Outcome / rubric scoring | `grade` (built-in judges)      | Your judge, eval platform, or human review |
 | Storage contract         | `EvalRunEnvelope`              | Same envelope; attach `externalScores`     |
 ### Pattern 1 — Behavioral only (no LLM judge)
@@ -305,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
 const grading = await gradeReport(report, { gradeFn: myJudge });
 ```
-Output is the same `SuiteGradingReport` shape as the built-in Claude grader — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
+Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
 ### Pattern 3 — Separate judge pipeline (any language)
@@ -339,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
 ];
 ```
-**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw Claude `stream-json` (Claude-only and verbose).
+**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
 ### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
@@ -401,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
 | Layer        | Command | What it checks                          | Mechanism                                    |
 | ------------ | ------- | --------------------------------------- | -------------------------------------------- |
 | **Behavior** | `run`   | Tool calls, order, args, efficiency     | Deterministic assertions on `TrajectoryView` |
-| **Outcome**  | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse`    |
+| **Outcome**  | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
 Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
@@ -435,12 +451,12 @@ npx @alis-build/harness-eval --help
 ### `grade`
-Uses **`grading.yaml`** or an inline **`judge:`** block in `suite.yaml` (`--suite`).
+Uses **`grading.yaml`**, an inline **`judge:`** block in `suite.yaml` (`--suite`), or adapter-specific grading files under `examples/`.
 **Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
 ```yaml
-# examples/grading.yaml
+# examples/grading.yaml (Claude Code judge)
 judge:
   adapter: claude-code
   model: claude-sonnet-4-6
@@ -450,26 +466,38 @@ judge:
     permissionMode: bypassPermissions
 ```
+Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
 ```bash
 npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
+npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
+npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
 ```
 | Option                                 | Description                                                       |
 | -------------------------------------- | ----------------------------------------------------------------- |
-| `--config <path>`                      | Grading YAML (`judge` block) — model, env, timeout, `claudeCode`  |
+| `--config <path>`                      | Grading YAML (`judge` block) — model, env, timeout, adapter options |
 | `--suite <path>`                       | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
 | `--output <path>`                      | Write grading JSON                                                |
 | `--expectations <path>`                | Sidecar YAML/JSON if report lacks expectations                    |
 | `--format console\|json`               | Output format                                                     |
 | `--model <id>`                         | Overrides `judge.model` in config                                 |
-| `--binary <path>`                      | Overrides `judge.claudeCode.binary`                               |
+| `--binary <path>`                      | Overrides judge binary for the selected adapter                   |
 | `--timeout-ms <n>`                     | Overrides `judge.timeoutMs`                                       |
 | `--max-concurrent <n>`                 | Overrides `judge.maxConcurrent` (default: 2 if unset)             |
 | `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
 CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
-The built-in judge spawns Claude with **`--output-format json`** (single-shot response, not `stream-json`). It applies **safe defaults** so Claude Code does not reload plugins/MCP during grading: `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, plus `permissionMode: bypassPermissions` on the judge subprocess. Override in `judge.claudeCode` only if you need a different judge setup.
+**Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
+| Adapter | Defaults (summary) |
+| ------- | ------------------ |
+| `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; `--output-format json` (single-shot, not `stream-json`) |
+| `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
+| `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
+See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
 Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
@@ -583,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
 ---
-## Adding harness adapters
+## Harness adapters
-Built-in adapters register at module load. **`claude-code`** and **`codex`** ship today; additional harnesses (Gemini CLI, Antigravity CLI) plug in via the same pattern:
+Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
+| Adapter | Suite key | Example suite | Example judge |
+| ------- | --------- | ------------- | ------------- |
+| Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
+| Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
+| Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
+Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
 1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
 2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -600,7 +636,7 @@ import {
 } from "@alis-build/harness-eval";
 registerAdapter("my-harness", myAdapter);
-console.log(listAdapters()); // ["claude-code", "codex", "my-harness"]
+console.log(listAdapters()); // ["claude-code", "codex", "gemini-cli", …]
 ```
 Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -695,6 +731,43 @@ The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feed
 **Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
+**Package export:** `@alis-build/harness-eval/adapters/codex`
+---
+## Gemini CLI adapter
+Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
+The harness adapter invokes:
+```bash
+gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
+```
+| Field | CLI flag | Notes |
+| ----- | -------- | ----- |
+| `binary` | — | Default `gemini` |
+| `model` | `--model` | Also settable at top level |
+| `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
+| `sandbox` | `--sandbox` | Sandboxed execution |
+| `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
+| `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
+| `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
+| `extensions` | `--extensions` | Extension allowlist |
+| `debug` | `--debug` | Verbose logging |
+| `isolateConfig` | — | `false` (default) = inherit caller config; `true` = temp config dir per run |
+MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
+The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
+**Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
+**Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
+**Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
 ---
 ## Library API
@@ -743,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
 });
 ```
-Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
+Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
 ---
@@ -766,7 +839,7 @@ Suite YAML  →  runSuite  →  Harness adapter  →  TrajectoryView
                           EvalRunEnvelope  →  DB / CI / API
 ```
-- **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
+- **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
 - **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
 - **OTLP** — observability side export; not required for scoring.

package/dist/adapters/claude-code/index.d.ts CHANGED Viewed

@@ -1,3 +1,3 @@
-import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-C0gBkl0-.js";
-import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-DnvP1UBl.js";
+import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
+import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
 export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };

package/dist/adapters/codex/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-C0gBkl0-.js";
-import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-Bu8uOZZN.js";
+import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
+import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
 //#region src/adapters/codex/map-events.d.ts
 /** Stateful mapper — tracks session id and pending tool calls across the stream. */

package/dist/cli/bin.js CHANGED Viewed

@@ -1,7 +1,7 @@
 #!/usr/bin/env node
-import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-Biy-5-9M.js";
-import { t as runSuite, u as getAdapter } from "../suite-BcP64nlb.js";
-import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-DnQ6Jt0i.js";
+import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-BKCJZRYr.js";
+import { t as runSuite, u as getAdapter } from "../suite-C3-8EjUW.js";
+import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-CiBm4Kf6.js";
 import { mkdir, readFile, writeFile } from "node:fs/promises";
 import { dirname, isAbsolute, join } from "node:path";
 import { fileURLToPath } from "node:url";

package/dist/config/loader.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-B1WmGGzf.js";
+import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-CrmzNwkq.js";
 export { ConfigError, type GradingConfig, type SuiteDocument, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };

package/dist/config/loader.js CHANGED Viewed

@@ -1,2 +1,2 @@
-import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-DnQ6Jt0i.js";
+import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-CiBm4Kf6.js";
 export { ConfigError, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };

package/dist/{index-DnvP1UBl.d.ts → index-C56AEDUr.d.ts} RENAMED Viewed

@@ -1,4 +1,4 @@
-import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-C0gBkl0-.js";
+import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-D0HR2WnP.js";
 //#region src/adapters/claude-code/types.d.ts
 /** Claude Code permission modes (`--permission-mode`). */
@@ -70,4 +70,4 @@ declare function runClaudeCode(config: ClaudeCodeAdapterConfig): Promise<ClaudeC
 declare const claudeCodeAdapter: HarnessAdapter<ClaudeCodeAdapterConfig>;
 //#endregion
 export { ClaudeCodeAdapterResult as a, ClaudeCodeAdapterConfig as i, index_d_exports as n, ClaudeCodeOptions as o, runClaudeCode as r, PermissionMode as s, claudeCodeAdapter as t };
-//# sourceMappingURL=index-DnvP1UBl.d.ts.map
+//# sourceMappingURL=index-C56AEDUr.d.ts.map

package/dist/index.d.ts CHANGED Viewed

@@ -1,9 +1,9 @@
-import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-C0gBkl0-.js";
-import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-DnvP1UBl.js";
-import { i as CodexOptions } from "./types-Bu8uOZZN.js";
-import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-0QkNVyp9.js";
-import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-B1WmGGzf.js";
-import { t as runSuite } from "./suite-BEShV0by.js";
+import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-D0HR2WnP.js";
+import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-C56AEDUr.js";
+import { i as CodexOptions } from "./types-DFMpv_HJ.js";
+import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-CLt4Yygc.js";
+import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-CrmzNwkq.js";
+import { t as runSuite } from "./suite-qyOGre2g.js";
 import { Readable } from "node:stream";
 //#region src/grader/types.d.ts
@@ -87,11 +87,13 @@ interface GradeReportOptions {
   /** Working directory for the judge subprocess. */
   cwd?: string;
   /** Grading adapter id. Default: `claude-code`. */
-  judgeAdapter?: "claude-code" | "codex";
+  judgeAdapter?: "claude-code" | "codex" | "gemini-cli";
   /** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
   claudeCode?: Record<string, unknown>;
   /** Codex CLI options for the judge (nested in grading YAML under `codex`). */
   codex?: Record<string, unknown>;
+  /** Gemini CLI options for the judge (nested in grading YAML under `geminiCli`). */
+  geminiCli?: Record<string, unknown>;
   /** Path to grading YAML when `--config` was used. */
   gradingConfigPath?: string;
   /** Inject a custom grader (for tests). */

package/dist/index.js CHANGED Viewed

@@ -1,8 +1,8 @@
 import { a as isAssistantMessage, c as isSystemRetry, d as isToolUseBlock, f as isUserMessage, i as namespaceOf, l as isTextBlock, n as TrajectoryBuilder, o as isResult, r as buildTrajectory, s as isSystemInit, t as AdapterError, u as isToolResultBlock } from "./types-Bac8_Ixb.js";
-import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-Biy-5-9M.js";
+import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-BKCJZRYr.js";
 import { a as parseStreamJson, n as claude_code_exports } from "./claude-code-C_7hxC8z.js";
-import { a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters, h as evaluateAll, i as DEFAULT_THRESHOLD, l as DEFAULT_ADAPTER_ID, m as evaluate, n as createLimit, o as getRepetitions, p as registerAdapter, r as DEFAULT_REPETITIONS, s as mergeConfig, t as runSuite, u as getAdapter } from "./suite-BcP64nlb.js";
-import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-DnQ6Jt0i.js";
+import { _ as evaluateAll, a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters, g as evaluate, i as DEFAULT_THRESHOLD, l as DEFAULT_ADAPTER_ID, n as createLimit, o as getRepetitions, p as registerAdapter, r as DEFAULT_REPETITIONS, s as mergeConfig, t as runSuite, u as getAdapter } from "./suite-C3-8EjUW.js";
+import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-CiBm4Kf6.js";
 //#region src/metrics/tool-calls.ts
 /**
 * Tool-call-level metrics operating on prediction/reference tool-call pairs.

package/dist/{loader-DnQ6Jt0i.js → loader-CiBm4Kf6.js} RENAMED Viewed

@@ -58,6 +58,24 @@ const ClaudeCodeConfigSchema = z.object({
 	maxTurns: z.number().int().positive(),
 	isolateConfig: z.boolean()
 }).partial();
+/** Gemini CLI adapter-specific options (nested under `geminiCli` in suite YAML). */
+const GeminiCliConfigSchema = z.object({
+	binary: z.string(),
+	approvalMode: z.enum([
+		"default",
+		"auto_edit",
+		"yolo",
+		"plan"
+	]),
+	sandbox: z.string(),
+	skipTrust: z.boolean(),
+	includeDirectories: z.array(z.string()),
+	allowedMcpServerNames: z.array(z.string()),
+	extensions: z.array(z.string()),
+	debug: z.boolean(),
+	/** Fresh temp `GEMINI_CONFIG_DIR` per run when true. */
+	isolateConfig: z.boolean()
+}).partial();
 /** Codex CLI adapter-specific options (nested under `codex`). */
 const CodexConfigSchema = z.object({
 	binary: z.string(),
@@ -91,7 +109,8 @@ const ConfigPartialSchema = z.object({
 	timeoutMs: z.number().int().positive(),
 	env: z.record(z.string(), z.string()),
 	claudeCode: ClaudeCodeConfigSchema,
-	codex: CodexConfigSchema
+	codex: CodexConfigSchema,
+	geminiCli: GeminiCliConfigSchema
 }).partial();
 /** A matrix cell — one point in the configuration matrix. */
 const MatrixCellSchema = z.object({
@@ -1264,4 +1283,4 @@ function formatZodError(err, sourcePath) {
 //#endregion
 export { parseGradingConfig as a, parseCasesFile as c, loadGradingConfig as i, ConfigError as l, parseSuite as n, loadSuiteDocument as o, parseSuiteDirectory as r, DEFAULT_PIPELINE_OUTPUTS as s, loadSuite as t };
-//# sourceMappingURL=loader-DnQ6Jt0i.js.map
+//# sourceMappingURL=loader-CiBm4Kf6.js.map