@alis-build/harness-eval 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # @alis-build/harness-eval
2
2
 
3
- Statistical eval framework for **AI coding agent harnesses** (Claude Code today; Cursor and Gemini planned). Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
3
+ Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
4
+
5
+ **Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
4
6
 
5
7
  **Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
6
8
 
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
9
11
  ## Requirements
10
12
 
11
13
  - Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
12
- - `claude` on `PATH` (for the Claude Code adapter)
13
- - Authentication for Claude Code:
14
- - **Option A:** `claude login` and set `isolateConfig: false` in your suite (uses your normal plugins/MCP setup)
15
- - **Option B:** `ANTHROPIC_API_KEY` with isolated config per run (default adapter behavior)
14
+ - A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
15
+ - **`claude-code`** `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
16
+ - **`codex`** `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
17
+ - **`gemini-cli`** `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
18
+
19
+ ### Authentication (by adapter)
20
+
21
+ | Adapter | Typical auth |
22
+ | ------- | ------------ |
23
+ | **Claude Code** | `claude login` with `isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
24
+ | **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
25
+ | **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
26
+
27
+ Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
16
28
 
17
29
  ---
18
30
 
@@ -55,13 +67,17 @@ pnpm exec harness-eval --help
55
67
  Suites are YAML files. Committed examples:
56
68
 
57
69
  - [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
58
- - [`examples/basic.yaml`](examples/basic.yaml) — minimal smoke test using the built-in `Read` tool on this repo's README
59
- - [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
70
+ - [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
71
+ - [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
72
+ - [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
73
+ - [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
60
74
  - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
61
- - [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config (alternate to inline `judge:`)
75
+ - [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
76
+ - [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
77
+ - [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
62
78
 
63
79
  ```yaml
64
- adapter: claude-code
80
+ adapter: claude-code # or: codex | gemini-cli
65
81
 
66
82
  defaultConfig:
67
83
  model: claude-sonnet-4-6
@@ -95,7 +111,7 @@ cases:
95
111
  - "The summary is grounded in README content, not a generic refusal"
96
112
  ```
97
113
 
98
- Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
114
+ Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
99
115
 
100
116
  **Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
101
117
 
@@ -109,7 +125,7 @@ npx @alis-build/harness-eval pipeline examples/pipeline/
109
125
  npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
110
126
  ```
111
127
 
112
- This spawns Claude Code headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
128
+ This spawns the configured harness CLI in headless mode for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
113
129
 
114
130
  **Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
115
131
 
@@ -130,7 +146,7 @@ npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.y
130
146
  npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
131
147
  ```
132
148
 
133
- Runs a separate Claude subprocess as **judge** against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
149
+ Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
134
150
 
135
151
  Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
136
152
 
@@ -138,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
138
154
 
139
155
  ## Data contracts & schemas
140
156
 
141
- harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not Claude `stream-json` or OTLP as your primary record.
157
+ harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
142
158
 
143
159
  ### Layering
144
160
 
145
161
  | Layer | Type | Where | Use for |
146
162
  | --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
147
- | Vendor stream | `StreamEvent` | `src/types/stream.ts` | Claude `stream-json` debug only |
163
+ | Vendor stream | `StreamEvent` | `src/types/stream.ts` | Adapter debug only (Claude/Codex/Gemini NDJSON) |
148
164
  | Harness session | **`TrajectoryView`** | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input |
149
165
  | Run report | **`SuiteReport`** | `report.json` from `run` | Runner output; full trajectories + assertion stats |
150
166
  | Eval record | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()` | CI gates, APIs, DB storage |
@@ -268,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
268
284
  | ------------------------ | ------------------------------ | ------------------------------------------ |
269
285
  | Headless harness runs | `run` / `runSuite` | — |
270
286
  | Tool-call behavior | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls` |
271
- | Outcome / rubric scoring | `grade` (Claude judge) | Your judge, eval platform, or human review |
287
+ | Outcome / rubric scoring | `grade` (built-in judges) | Your judge, eval platform, or human review |
272
288
  | Storage contract | `EvalRunEnvelope` | Same envelope; attach `externalScores` |
273
289
 
274
290
  ### Pattern 1 — Behavioral only (no LLM judge)
@@ -305,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
305
321
  const grading = await gradeReport(report, { gradeFn: myJudge });
306
322
  ```
307
323
 
308
- Output is the same `SuiteGradingReport` shape as the built-in Claude grader — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
324
+ Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
309
325
 
310
326
  ### Pattern 3 — Separate judge pipeline (any language)
311
327
 
@@ -339,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
339
355
  ];
340
356
  ```
341
357
 
342
- **Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw Claude `stream-json` (Claude-only and verbose).
358
+ **Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
343
359
 
344
360
  ### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
345
361
 
@@ -401,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
401
417
  | Layer | Command | What it checks | Mechanism |
402
418
  | ------------ | ------- | --------------------------------------- | -------------------------------------------- |
403
419
  | **Behavior** | `run` | Tool calls, order, args, efficiency | Deterministic assertions on `TrajectoryView` |
404
- | **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse` |
420
+ | **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
405
421
 
406
422
  Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
407
423
 
@@ -435,12 +451,12 @@ npx @alis-build/harness-eval --help
435
451
 
436
452
  ### `grade`
437
453
 
438
- Uses **`grading.yaml`** or an inline **`judge:`** block in `suite.yaml` (`--suite`).
454
+ Uses a standalone grading YAML passed with `--config` (**`grading.yaml`**, or one of the adapter-specific judge configs under `examples/`) or an inline **`judge:`** block in `suite.yaml` (`--suite`).
439
455
 
440
456
  **Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
441
457
 
442
458
  ```yaml
443
- # examples/grading.yaml
459
+ # examples/grading.yaml (Claude Code judge)
444
460
  judge:
445
461
  adapter: claude-code
446
462
  model: claude-sonnet-4-6
@@ -450,26 +466,38 @@ judge:
450
466
  permissionMode: bypassPermissions
451
467
  ```
452
468
 
469
+ Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
470
+
453
471
  ```bash
454
472
  npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
473
+ npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
474
+ npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
455
475
  ```
456
476
 
457
477
  | Option | Description |
458
478
  | -------------------------------------- | ----------------------------------------------------------------- |
459
- | `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, `claudeCode` |
479
+ | `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, adapter options |
460
480
  | `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
461
481
  | `--output <path>` | Write grading JSON |
462
482
  | `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
463
483
  | `--format console\|json` | Output format |
464
484
  | `--model <id>` | Overrides `judge.model` in config |
465
- | `--binary <path>` | Overrides `judge.claudeCode.binary` |
485
+ | `--binary <path>` | Overrides judge binary for the selected adapter |
466
486
  | `--timeout-ms <n>` | Overrides `judge.timeoutMs` |
467
487
  | `--max-concurrent <n>` | Overrides `judge.maxConcurrent` (default: 2 if unset) |
468
488
  | `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
469
489
 
470
490
  CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
471
491
 
472
- The built-in judge spawns Claude with **`--output-format json`** (single-shot response, not `stream-json`). It applies **safe defaults** so Claude Code does not reload plugins/MCP during grading: `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, plus `permissionMode: bypassPermissions` on the judge subprocess. Override in `judge.claudeCode` only if you need a different judge setup.
492
+ **Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
493
+
494
+ | Adapter | Defaults (summary) |
495
+ | ------- | ------------------ |
496
+ | `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; JSON output |
497
+ | `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
498
+ | `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
499
+
500
+ See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
473
501
 
474
502
  Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
475
503
 
@@ -583,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
583
611
 
584
612
  ---
585
613
 
586
- ## Adding harness adapters
614
+ ## Harness adapters
587
615
 
588
- Built-in adapters register at module load. **`claude-code`** and **`codex`** ship today; additional harnesses (Gemini CLI, Antigravity CLI) plug in via the same pattern:
616
+ Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
617
+
618
+ | Adapter | Suite key | Example suite | Example judge |
619
+ | ------- | --------- | ------------- | ------------- |
620
+ | Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
621
+ | Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
622
+ | Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
623
+
624
+ Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
589
625
 
590
626
  1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
591
627
  2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -600,7 +636,7 @@ import {
600
636
  } from "@alis-build/harness-eval";
601
637
 
602
638
  registerAdapter("my-harness", myAdapter);
603
- console.log(listAdapters()); // ["claude-code", "codex", "my-harness"]
639
+ console.log(listAdapters()); // e.g. ["claude-code", "codex", "gemini-cli", "my-harness"]
604
640
  ```
605
641
 
606
642
  Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -695,6 +731,43 @@ The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feed
695
731
 
696
732
  **Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
697
733
 
734
+ **Package export:** `@alis-build/harness-eval/adapters/codex`
735
+
736
+ ---
737
+
738
+ ## Gemini CLI adapter
739
+
740
+ Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
741
+
742
+ The harness adapter invokes:
743
+
744
+ ```bash
745
+ gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
746
+ ```
747
+
748
+ | Field | CLI flag | Notes |
749
+ | ----- | -------- | ----- |
750
+ | `binary` | — | Default `gemini` |
751
+ | `model` | `--model` | Also settable at top level |
752
+ | `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
753
+ | `sandbox` | `--sandbox` | Sandboxed execution |
754
+ | `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
755
+ | `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
756
+ | `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
757
+ | `extensions` | `--extensions` | Extension allowlist |
758
+ | `debug` | `--debug` | Verbose logging |
759
+ | `isolateConfig` | — | `false` (harness default) = inherit caller config; `true` (built-in judge default) = temp config dir per run |
760
+
761
+ MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
762
+
763
+ The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
764
+
765
+ **Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
766
+
767
+ **Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
768
+
769
+ **Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
770
+
698
771
  ---
699
772
 
700
773
  ## Library API
@@ -743,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
743
816
  });
744
817
  ```
745
818
 
746
- Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
819
+ Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
747
820
 
748
821
  ---
749
822
 
@@ -766,7 +839,7 @@ Suite YAML → runSuite → Harness adapter → TrajectoryView
766
839
  EvalRunEnvelope → DB / CI / API
767
840
  ```
768
841
 
769
- - **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
842
+ - **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
770
843
  - **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
771
844
  - **OTLP** — observability side export; not required for scoring.
772
845
 
@@ -1,3 +1,3 @@
1
- import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-C0gBkl0-.js";
2
- import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-DnvP1UBl.js";
1
+ import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
2
+ import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
3
3
  export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
@@ -1,5 +1,5 @@
1
- import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-C0gBkl0-.js";
2
- import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-Bu8uOZZN.js";
1
+ import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
2
+ import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
3
3
 
4
4
  //#region src/adapters/codex/map-events.d.ts
5
5
  /** Stateful mapper — tracks session id and pending tool calls across the stream. */
package/dist/cli/bin.js CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
- import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-Biy-5-9M.js";
3
- import { t as runSuite, u as getAdapter } from "../suite-BcP64nlb.js";
4
- import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-DnQ6Jt0i.js";
2
+ import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-BKCJZRYr.js";
3
+ import { t as runSuite, u as getAdapter } from "../suite-C3-8EjUW.js";
4
+ import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-CiBm4Kf6.js";
5
5
  import { mkdir, readFile, writeFile } from "node:fs/promises";
6
6
  import { dirname, isAbsolute, join } from "node:path";
7
7
  import { fileURLToPath } from "node:url";
@@ -1,2 +1,2 @@
1
- import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-B1WmGGzf.js";
1
+ import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-CrmzNwkq.js";
2
2
  export { ConfigError, type GradingConfig, type SuiteDocument, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };
@@ -1,2 +1,2 @@
1
- import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-DnQ6Jt0i.js";
1
+ import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-CiBm4Kf6.js";
2
2
  export { ConfigError, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };
@@ -1,4 +1,4 @@
1
- import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-C0gBkl0-.js";
1
+ import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-D0HR2WnP.js";
2
2
 
3
3
  //#region src/adapters/claude-code/types.d.ts
4
4
  /** Claude Code permission modes (`--permission-mode`). */
@@ -70,4 +70,4 @@ declare function runClaudeCode(config: ClaudeCodeAdapterConfig): Promise<ClaudeC
70
70
  declare const claudeCodeAdapter: HarnessAdapter<ClaudeCodeAdapterConfig>;
71
71
  //#endregion
72
72
  export { ClaudeCodeAdapterResult as a, ClaudeCodeAdapterConfig as i, index_d_exports as n, ClaudeCodeOptions as o, runClaudeCode as r, PermissionMode as s, claudeCodeAdapter as t };
73
- //# sourceMappingURL=index-DnvP1UBl.d.ts.map
73
+ //# sourceMappingURL=index-C56AEDUr.d.ts.map
package/dist/index.d.ts CHANGED
@@ -1,9 +1,9 @@
1
- import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-C0gBkl0-.js";
2
- import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-DnvP1UBl.js";
3
- import { i as CodexOptions } from "./types-Bu8uOZZN.js";
4
- import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-0QkNVyp9.js";
5
- import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-B1WmGGzf.js";
6
- import { t as runSuite } from "./suite-BEShV0by.js";
1
+ import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-D0HR2WnP.js";
2
+ import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-C56AEDUr.js";
3
+ import { i as CodexOptions } from "./types-DFMpv_HJ.js";
4
+ import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-CLt4Yygc.js";
5
+ import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-CrmzNwkq.js";
6
+ import { t as runSuite } from "./suite-qyOGre2g.js";
7
7
  import { Readable } from "node:stream";
8
8
 
9
9
  //#region src/grader/types.d.ts
@@ -87,11 +87,13 @@ interface GradeReportOptions {
87
87
  /** Working directory for the judge subprocess. */
88
88
  cwd?: string;
89
89
  /** Grading adapter id. Default: `claude-code`. */
90
- judgeAdapter?: "claude-code" | "codex";
90
+ judgeAdapter?: "claude-code" | "codex" | "gemini-cli";
91
91
  /** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
92
92
  claudeCode?: Record<string, unknown>;
93
93
  /** Codex CLI options for the judge (nested in grading YAML under `codex`). */
94
94
  codex?: Record<string, unknown>;
95
+ /** Gemini CLI options for the judge (nested in grading YAML under `geminiCli`). */
96
+ geminiCli?: Record<string, unknown>;
95
97
  /** Path to grading YAML when `--config` was used. */
96
98
  gradingConfigPath?: string;
97
99
  /** Inject a custom grader (for tests). */
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
1
1
  import { a as isAssistantMessage, c as isSystemRetry, d as isToolUseBlock, f as isUserMessage, i as namespaceOf, l as isTextBlock, n as TrajectoryBuilder, o as isResult, r as buildTrajectory, s as isSystemInit, t as AdapterError, u as isToolResultBlock } from "./types-Bac8_Ixb.js";
2
- import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-Biy-5-9M.js";
2
+ import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-BKCJZRYr.js";
3
3
  import { a as parseStreamJson, n as claude_code_exports } from "./claude-code-C_7hxC8z.js";
4
- import { a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters, h as evaluateAll, i as DEFAULT_THRESHOLD, l as DEFAULT_ADAPTER_ID, m as evaluate, n as createLimit, o as getRepetitions, p as registerAdapter, r as DEFAULT_REPETITIONS, s as mergeConfig, t as runSuite, u as getAdapter } from "./suite-BcP64nlb.js";
5
- import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-DnQ6Jt0i.js";
4
+ import { _ as evaluateAll, a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters, g as evaluate, i as DEFAULT_THRESHOLD, l as DEFAULT_ADAPTER_ID, n as createLimit, o as getRepetitions, p as registerAdapter, r as DEFAULT_REPETITIONS, s as mergeConfig, t as runSuite, u as getAdapter } from "./suite-C3-8EjUW.js";
5
+ import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-CiBm4Kf6.js";
6
6
  //#region src/metrics/tool-calls.ts
7
7
  /**
8
8
  * Tool-call-level metrics operating on prediction/reference tool-call pairs.
@@ -58,6 +58,24 @@ const ClaudeCodeConfigSchema = z.object({
58
58
  maxTurns: z.number().int().positive(),
59
59
  isolateConfig: z.boolean()
60
60
  }).partial();
61
+ /** Gemini CLI adapter-specific options (nested under `geminiCli` in suite YAML). */
62
+ const GeminiCliConfigSchema = z.object({
63
+ binary: z.string(),
64
+ approvalMode: z.enum([
65
+ "default",
66
+ "auto_edit",
67
+ "yolo",
68
+ "plan"
69
+ ]),
70
+ sandbox: z.string(),
71
+ skipTrust: z.boolean(),
72
+ includeDirectories: z.array(z.string()),
73
+ allowedMcpServerNames: z.array(z.string()),
74
+ extensions: z.array(z.string()),
75
+ debug: z.boolean(),
76
+ /** Fresh temp `GEMINI_CONFIG_DIR` per run when true. */
77
+ isolateConfig: z.boolean()
78
+ }).partial();
61
79
  /** Codex CLI adapter-specific options (nested under `codex`). */
62
80
  const CodexConfigSchema = z.object({
63
81
  binary: z.string(),
@@ -91,7 +109,8 @@ const ConfigPartialSchema = z.object({
91
109
  timeoutMs: z.number().int().positive(),
92
110
  env: z.record(z.string(), z.string()),
93
111
  claudeCode: ClaudeCodeConfigSchema,
94
- codex: CodexConfigSchema
112
+ codex: CodexConfigSchema,
113
+ geminiCli: GeminiCliConfigSchema
95
114
  }).partial();
96
115
  /** A matrix cell — one point in the configuration matrix. */
97
116
  const MatrixCellSchema = z.object({
@@ -1264,4 +1283,4 @@ function formatZodError(err, sourcePath) {
1264
1283
  //#endregion
1265
1284
  export { parseGradingConfig as a, parseCasesFile as c, loadGradingConfig as i, ConfigError as l, parseSuite as n, loadSuiteDocument as o, parseSuiteDirectory as r, DEFAULT_PIPELINE_OUTPUTS as s, loadSuite as t };
1266
1285
 
1267
- //# sourceMappingURL=loader-DnQ6Jt0i.js.map
1286
+ //# sourceMappingURL=loader-CiBm4Kf6.js.map