@alis-build/harness-eval 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +187 -30
  2. package/dist/adapters/claude-code/index.d.ts +2 -2
  3. package/dist/adapters/claude-code/index.js +2 -1
  4. package/dist/adapters/codex/index.d.ts +68 -0
  5. package/dist/adapters/codex/index.js +3 -0
  6. package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
  7. package/dist/claude-code-C_7hxC8z.js.map +1 -0
  8. package/dist/cli/bin.js +131 -151
  9. package/dist/cli/bin.js.map +1 -1
  10. package/dist/codex-0cHO2te9.js +496 -0
  11. package/dist/codex-0cHO2te9.js.map +1 -0
  12. package/dist/config/loader.d.ts +2 -2
  13. package/dist/config/loader.js +2 -2
  14. package/dist/{index-V22PrR0p.d.ts → index-C56AEDUr.d.ts} +2 -2
  15. package/dist/index.d.ts +134 -6
  16. package/dist/index.js +6 -5
  17. package/dist/index.js.map +1 -1
  18. package/dist/{loader-DcI0KfRX.js → loader-CiBm4Kf6.js} +491 -209
  19. package/dist/loader-CiBm4Kf6.js.map +1 -0
  20. package/dist/loader-CrmzNwkq.d.ts +107 -0
  21. package/dist/{projections-BcX7w-f6.js → reporter-BKCJZRYr.js} +1475 -729
  22. package/dist/reporter-BKCJZRYr.js.map +1 -0
  23. package/dist/runner/suite.d.ts +1 -1
  24. package/dist/runner/suite.js +1 -1
  25. package/dist/{suite-Dlzl-HI0.js → suite-C3-8EjUW.js} +558 -4
  26. package/dist/suite-C3-8EjUW.js.map +1 -0
  27. package/dist/{suite-DPJMIEbu.d.ts → suite-qyOGre2g.d.ts} +2 -2
  28. package/dist/types-Bac8_Ixb.js +246 -0
  29. package/dist/types-Bac8_Ixb.js.map +1 -0
  30. package/dist/{types-CD3TwOtZ.d.ts → types-CLt4Yygc.d.ts} +2 -2
  31. package/dist/{types-B9H4IZtA.d.ts → types-D0HR2WnP.d.ts} +9 -2
  32. package/dist/types-DFMpv_HJ.d.ts +77 -0
  33. package/package.json +11 -2
  34. package/schemas/eval-run-envelope.schema.json +193 -183
  35. package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
  36. package/dist/loader-C9yQHUPC.d.ts +0 -50
  37. package/dist/loader-DcI0KfRX.js.map +0 -1
  38. package/dist/projections-BcX7w-f6.js.map +0 -1
  39. package/dist/suite-Dlzl-HI0.js.map +0 -1
package/README.md CHANGED
@@ -1,6 +1,8 @@
1
1
  # @alis-build/harness-eval
2
2
 
3
- Statistical eval framework for **AI coding agent harnesses** (Claude Code today; Cursor and Gemini planned). Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
3
+ Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
4
+
5
+ **Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
4
6
 
5
7
  **Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
6
8
 
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
9
11
  ## Requirements
10
12
 
11
13
  - Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
12
- - `claude` on `PATH` (for the Claude Code adapter)
13
- - Authentication for Claude Code:
14
- - **Option A:** `claude login` and set `isolateConfig: false` in your suite (uses your normal plugins/MCP setup)
15
- - **Option B:** `ANTHROPIC_API_KEY` with isolated config per run (default adapter behavior)
14
+ - A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
15
+ - **`claude-code`** `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
16
+ - **`codex`** `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
17
+ - **`gemini-cli`** `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
18
+
19
+ ### Authentication (by adapter)
20
+
21
+ | Adapter | Typical auth |
22
+ | ------- | ------------ |
23
+ | **Claude Code** | `claude login` with `isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
24
+ | **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
25
+ | **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
26
+
27
+ Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
16
28
 
17
29
  ---
18
30
 
@@ -54,13 +66,18 @@ pnpm exec harness-eval --help
54
66
 
55
67
  Suites are YAML files. Committed examples:
56
68
 
57
- - [`examples/basic.yaml`](examples/basic.yaml) — smoke test using the built-in `Read` tool on this repo's README
58
- - [`examples/matrix.yaml`](examples/matrix.yaml) — same idea with a model matrix (sonnet vs opus)
69
+ - [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
70
+ - [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
71
+ - [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
72
+ - [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
73
+ - [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
59
74
  - [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
60
- - [`examples/grading.yaml`](examples/grading.yaml) — standalone judge config for `harness-eval grade`
75
+ - [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
76
+ - [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
77
+ - [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
61
78
 
62
79
  ```yaml
63
- adapter: claude-code
80
+ adapter: claude-code # or: codex | gemini-cli
64
81
 
65
82
  defaultConfig:
66
83
  model: claude-sonnet-4-6
@@ -94,17 +111,21 @@ cases:
94
111
  - "The summary is grounded in README content, not a generic refusal"
95
112
  ```
96
113
 
97
- Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Claude-specific options go under `claudeCode`.
114
+ Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
98
115
 
99
- **Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, `reference_trajectory`, `human_ratings`, multi-file layout, and `grading.yaml` options.
116
+ **Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
100
117
 
101
118
  ### 2. Run behavioral eval
102
119
 
103
120
  ```bash
121
+ # Unified pipeline (run + optional grade + envelope when pipeline: is defined)
122
+ npx @alis-build/harness-eval pipeline examples/pipeline/
123
+
124
+ # Or run harness only
104
125
  npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
105
126
  ```
106
127
 
107
- This spawns Claude Code headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
128
+ This spawns the configured harness CLI headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
108
129
 
109
130
  **Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
110
131
 
@@ -112,13 +133,20 @@ Exit code `0` = all cells passed all assertion thresholds.
112
133
 
113
134
  ### 3. Grade outcomes (optional)
114
135
 
115
- Judge model, timeout, env, and `claudeCode` flags live in a separate **`grading.yaml`** (not in the suite file). See [`examples/grading.yaml`](examples/grading.yaml).
136
+ **Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
137
+
138
+ ```bash
139
+ npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
140
+ # or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
141
+ ```
142
+
143
+ **Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
116
144
 
117
145
  ```bash
118
146
  npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
119
147
  ```
120
148
 
121
- Runs a separate Claude subprocess as **judge** against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
149
+ Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
122
150
 
123
151
  Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
124
152
 
@@ -126,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
126
154
 
127
155
  ## Data contracts & schemas
128
156
 
129
- harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not Claude `stream-json` or OTLP as your primary record.
157
+ harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
130
158
 
131
159
  ### Layering
132
160
 
133
161
  | Layer | Type | Where | Use for |
134
162
  | --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
135
- | Vendor stream | `StreamEvent` | `src/types/stream.ts` | Claude `stream-json` debug only |
163
+ | Vendor stream | `StreamEvent` | `src/types/stream.ts` | Adapter debug only (Claude/Codex/Gemini NDJSON) |
136
164
  | Harness session | **`TrajectoryView`** | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input |
137
165
  | Run report | **`SuiteReport`** | `report.json` from `run` | Runner output; full trajectories + assertion stats |
138
166
  | Eval record | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()` | CI gates, APIs, DB storage |
@@ -256,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
256
284
  | ------------------------ | ------------------------------ | ------------------------------------------ |
257
285
  | Headless harness runs | `run` / `runSuite` | — |
258
286
  | Tool-call behavior | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls` |
259
- | Outcome / rubric scoring | `grade` (Claude judge) | Your judge, eval platform, or human review |
287
+ | Outcome / rubric scoring | `grade` (built-in judges) | Your judge, eval platform, or human review |
260
288
  | Storage contract | `EvalRunEnvelope` | Same envelope; attach `externalScores` |
261
289
 
262
290
  ### Pattern 1 — Behavioral only (no LLM judge)
@@ -293,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
293
321
  const grading = await gradeReport(report, { gradeFn: myJudge });
294
322
  ```
295
323
 
296
- Output is the same `SuiteGradingReport` shape as the built-in Claude grader — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
324
+ Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
297
325
 
298
326
  ### Pattern 3 — Separate judge pipeline (any language)
299
327
 
@@ -327,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
327
355
  ];
328
356
  ```
329
357
 
330
- **Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw Claude `stream-json` (Claude-only and verbose).
358
+ **Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
331
359
 
332
360
  ### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
333
361
 
@@ -389,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
389
417
  | Layer | Command | What it checks | Mechanism |
390
418
  | ------------ | ------- | --------------------------------------- | -------------------------------------------- |
391
419
  | **Behavior** | `run` | Tool calls, order, args, efficiency | Deterministic assertions on `TrajectoryView` |
392
- | **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse` |
420
+ | **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
393
421
 
394
422
  Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
395
423
 
@@ -401,6 +429,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
401
429
  npx @alis-build/harness-eval run <suite.yaml> [options]
402
430
  npx @alis-build/harness-eval grade <report.json> [options]
403
431
  npx @alis-build/harness-eval envelope <report.json> [options]
432
+ npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
404
433
  npx @alis-build/harness-eval format <report.json> [options]
405
434
  npx @alis-build/harness-eval --help
406
435
  ```
@@ -422,12 +451,12 @@ npx @alis-build/harness-eval --help
422
451
 
423
452
  ### `grade`
424
453
 
425
- Uses a standalone **`grading.yaml`** for judge model, timeout, env, and `claudeCode` flags (Option B separate from the suite file).
454
+ Uses **`grading.yaml`**, an inline **`judge:`** block in `suite.yaml` (`--suite`), or adapter-specific grading files under `examples/`.
426
455
 
427
456
  **Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
428
457
 
429
458
  ```yaml
430
- # examples/grading.yaml
459
+ # examples/grading.yaml (Claude Code judge)
431
460
  judge:
432
461
  adapter: claude-code
433
462
  model: claude-sonnet-4-6
@@ -437,25 +466,38 @@ judge:
437
466
  permissionMode: bypassPermissions
438
467
  ```
439
468
 
469
+ Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
470
+
440
471
  ```bash
441
472
  npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
473
+ npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
474
+ npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
442
475
  ```
443
476
 
444
477
  | Option | Description |
445
478
  | -------------------------------------- | ----------------------------------------------------------------- |
446
- | `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, `claudeCode` |
479
+ | `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, adapter options |
480
+ | `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
447
481
  | `--output <path>` | Write grading JSON |
448
482
  | `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
449
483
  | `--format console\|json` | Output format |
450
484
  | `--model <id>` | Overrides `judge.model` in config |
451
- | `--binary <path>` | Overrides `judge.claudeCode.binary` |
485
+ | `--binary <path>` | Overrides judge binary for the selected adapter |
452
486
  | `--timeout-ms <n>` | Overrides `judge.timeoutMs` |
453
487
  | `--max-concurrent <n>` | Overrides `judge.maxConcurrent` (default: 2 if unset) |
454
488
  | `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
455
489
 
456
490
  CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
457
491
 
458
- The built-in judge spawns Claude with **`--output-format json`** (single-shot response, not `stream-json`). It applies **safe defaults** so Claude Code does not reload plugins/MCP during grading: `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, plus `permissionMode: bypassPermissions` on the judge subprocess. Override in `judge.claudeCode` only if you need a different judge setup.
492
+ **Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
493
+
494
+ | Adapter | Defaults (summary) |
495
+ | ------- | ------------------ |
496
+ | `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; JSON output |
497
+ | `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
498
+ | `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
499
+
500
+ See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
459
501
 
460
502
  Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
461
503
 
@@ -485,6 +527,28 @@ npx @alis-build/harness-eval envelope report.json --projection instances --outpu
485
527
 
486
528
  Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
487
529
 
530
+ ### `pipeline`
531
+
532
+ Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
533
+
534
+ ```bash
535
+ npx @alis-build/harness-eval pipeline examples/pipeline/
536
+ npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
537
+ ```
538
+
539
+ | Option | Description |
540
+ | ------ | ----------- |
541
+ | `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
542
+ | `--output <path>` | Override `pipeline.run.output` |
543
+ | `--report <path>` | Override report input for grade/envelope |
544
+ | `--grading <path>` | Override grading input for envelope |
545
+ | `--grading-output <path>` | Override `pipeline.grade.output` |
546
+ | `--envelope-output <path>` | Override `pipeline.envelope.output` |
547
+ | `--projection envelope\|trajectory\|instances` | Envelope projection |
548
+ | `--max-concurrent <n>` | Parallel harness/judge workers |
549
+
550
+ The pipeline's exit code is that of the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
551
+
488
552
  ### `format`
489
553
 
490
554
  Re-render an existing `report.json` without re-running the harness.
@@ -547,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
547
611
 
548
612
  ---
549
613
 
550
- ## Adding harness adapters
614
+ ## Harness adapters
615
+
616
+ Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
551
617
 
552
- Built-in adapters register at module load. Today only `claude-code` ships; additional harnesses (Codex, Gemini CLI, Antigravity CLI) plug in via the same pattern:
618
+ | Adapter | Suite key | Example suite | Example judge |
619
+ | ------- | --------- | ------------- | ------------- |
620
+ | Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
621
+ | Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
622
+ | Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
623
+
624
+ Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
553
625
 
554
626
  1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
555
627
  2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
@@ -564,7 +636,7 @@ import {
564
636
  } from "@alis-build/harness-eval";
565
637
 
566
638
  registerAdapter("my-harness", myAdapter);
567
- console.log(listAdapters()); // ["claude-code", "my-harness"]
639
+ console.log(listAdapters()); // ["claude-code", "codex", "gemini-cli", …]
568
640
  ```
569
641
 
570
642
  Duplicate registration throws so accidental overrides fail fast during startup or tests.
@@ -620,12 +692,92 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
620
692
 
621
693
  ---
622
694
 
695
+ ## Codex CLI adapter
696
+
697
+ Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
698
+
699
+ The harness adapter invokes:
700
+
701
+ ```bash
702
+ codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
703
+ ```
704
+
705
+ `--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
706
+
707
+ | Field | CLI flag | Notes |
708
+ | ----- | -------- | ----- |
709
+ | `binary` | — | Default `codex` |
710
+ | `model` | `--model` | Also settable at top level |
711
+ | `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
712
+ | `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
713
+ | `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
714
+ | `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
715
+ | `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
716
+ | `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
717
+ | `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
718
+ | `ephemeral` | `--ephemeral` | No session rollout files |
719
+ | `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
720
+ | `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
721
+ | `outputSchema` | `--output-schema` | JSON Schema for structured final output |
722
+ | `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
723
+ | `captureLastMessage` | — | Default `true`: automatically passes `--output-last-message` (auto temp path) and reads that file into `finalResponse` if the JSONL has no assistant text |
724
+ | `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
725
+
726
+ Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
727
+
728
+ The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
729
+
730
+ **Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
731
+
732
+ **Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
733
+
734
+ **Package export:** `@alis-build/harness-eval/adapters/codex`
735
+
736
+ ---
737
+
738
+ ## Gemini CLI adapter
739
+
740
+ Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
741
+
742
+ The harness adapter invokes:
743
+
744
+ ```bash
745
+ gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
746
+ ```
747
+
748
+ | Field | CLI flag | Notes |
749
+ | ----- | -------- | ----- |
750
+ | `binary` | — | Default `gemini` |
751
+ | `model` | `--model` | Also settable at top level |
752
+ | `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
753
+ | `sandbox` | `--sandbox` | Sandboxed execution |
754
+ | `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
755
+ | `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
756
+ | `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
757
+ | `extensions` | `--extensions` | Extension allowlist |
758
+ | `debug` | `--debug` | Verbose logging |
759
+ | `isolateConfig` | — | `false` (default) = inherit caller config; `true` = temp config dir per run |
760
+
761
+ MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
762
+
763
+ The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
764
+
765
+ **Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
766
+
767
+ **Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
768
+
769
+ **Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
770
+
771
+ ---
772
+
623
773
  ## Library API
624
774
 
625
775
  ```typescript
626
776
  import {
627
777
  loadSuite,
778
+ loadSuiteDocument,
628
779
  runSuite,
780
+ runPipeline,
629
781
  gradeReport,
630
782
  buildEvalRunEnvelope,
631
783
  trajectoryToTranscript,
@@ -635,6 +787,11 @@ import {
635
787
  } from "@alis-build/harness-eval";
636
788
  import { loadGradingConfig } from "@alis-build/harness-eval/config";
637
789
 
790
+ // Unified pipeline
791
+ const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
792
+ const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
793
+
794
+ // Or step-by-step
638
795
  const suite = await loadSuite("./examples/basic.yaml");
639
796
  const report = await runSuite(suite, { maxConcurrent: 2 });
640
797
 
@@ -659,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
659
816
  });
660
817
  ```
661
818
 
662
- Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
819
+ Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
663
820
 
664
821
  ---
665
822
 
@@ -682,7 +839,7 @@ Suite YAML → runSuite → Harness adapter → TrajectoryView
682
839
  EvalRunEnvelope → DB / CI / API
683
840
  ```
684
841
 
685
- - **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
842
+ - **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
686
843
  - **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
687
844
  - **OTLP** — observability side export; not required for scoring.
688
845
 
@@ -1,3 +1,3 @@
1
- import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-B9H4IZtA.js";
2
- import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-V22PrR0p.js";
1
+ import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
2
+ import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
3
3
  export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
@@ -1,2 +1,3 @@
1
- import { a as AdapterError, r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-DZ4Vkgp6.js";
1
+ import { t as AdapterError } from "../../types-Bac8_Ixb.js";
2
+ import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
2
3
  export { AdapterError, claudeCodeAdapter, runClaudeCode };
@@ -0,0 +1,68 @@
1
+ import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
2
+ import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
3
+
4
+ //#region src/adapters/codex/map-events.d.ts
5
+ /** Stateful mapper — tracks session id and pending tool calls across the stream. */
6
+ declare class CodexEventMapper {
7
+ private sessionId;
8
+ private sawInit;
9
+ private startedItems;
10
+ private turnCount;
11
+ /** Map one parsed Codex JSON object to zero or more stream events. */
12
+ map(event: CodexJsonEvent): StreamEvent[];
13
+ private buildInit;
14
+ private ensureInit;
15
+ private mapItemStarted;
16
+ private mapItemCompleted;
17
+ private toolUseEvent;
18
+ private commandUseEvent;
19
+ private toolResultEvent;
20
+ private buildResult;
21
+ }
22
+ /** Map an entire fixture or stream of Codex events through a fresh mapper. */
23
+ declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
24
+ /** Build harness-qualified MCP tool name from Codex server + tool fields. */
25
+ declare function mcpToolName(server: string, tool: string): string;
26
+ //#endregion
27
+ //#region src/adapters/codex/flags.d.ts
28
+ /** Prepend global flags that must appear before the `exec` subcommand. */
29
+ declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
30
+ /** Append `codex exec` subcommand flags (after `exec`, before prompt). */
31
+ declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
32
+ model?: string;
33
+ cwd?: string;
34
+ }): void;
35
+ /** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
36
+ declare function appendCodexFlags(args: string[], config: CodexOptions & {
37
+ model?: string;
38
+ cwd?: string;
39
+ }): void;
40
+ /**
41
+ * Ensure harness runs pass `--output-last-message` when capture is enabled.
42
+ * Returns the auto-generated path (for cleanup), or null if unchanged.
43
+ */
44
+ declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
45
+ /**
46
+ * Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
47
+ *
48
+ * Expects `config.outputLastMessage` to already be set if capture is desired;
49
+ * call {@link ensureHarnessOutputLastMessage} before this if spawning outside
50
+ * of {@link spawnCodex}.
51
+ */
52
+ declare function buildArgs(config: CodexAdapterConfig): string[];
53
+ /**
54
+ * Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
55
+ */
56
+ declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
57
+ model?: string;
58
+ cwd?: string;
59
+ }): string[];
60
+ //#endregion
61
+ //#region src/adapters/codex/index.d.ts
62
+ /** Run Codex in headless `exec --json` mode and return a trajectory. */
63
+ declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
64
+ /** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
65
+ declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
66
+ //#endregion
67
+ export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
68
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,3 @@
1
+ import { t as AdapterError } from "../../types-Bac8_Ixb.js";
2
+ import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
3
+ export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };