@alis-build/harness-eval 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +187 -30
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/claude-code/index.js +2 -1
- package/dist/adapters/codex/index.d.ts +68 -0
- package/dist/adapters/codex/index.js +3 -0
- package/dist/{claude-code-DZ4Vkgp6.js → claude-code-C_7hxC8z.js} +3 -245
- package/dist/claude-code-C_7hxC8z.js.map +1 -0
- package/dist/cli/bin.js +131 -151
- package/dist/cli/bin.js.map +1 -1
- package/dist/codex-0cHO2te9.js +496 -0
- package/dist/codex-0cHO2te9.js.map +1 -0
- package/dist/config/loader.d.ts +2 -2
- package/dist/config/loader.js +2 -2
- package/dist/{index-V22PrR0p.d.ts → index-C56AEDUr.d.ts} +2 -2
- package/dist/index.d.ts +134 -6
- package/dist/index.js +6 -5
- package/dist/index.js.map +1 -1
- package/dist/{loader-DcI0KfRX.js → loader-CiBm4Kf6.js} +491 -209
- package/dist/loader-CiBm4Kf6.js.map +1 -0
- package/dist/loader-CrmzNwkq.d.ts +107 -0
- package/dist/{projections-BcX7w-f6.js → reporter-BKCJZRYr.js} +1475 -729
- package/dist/reporter-BKCJZRYr.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-Dlzl-HI0.js → suite-C3-8EjUW.js} +558 -4
- package/dist/suite-C3-8EjUW.js.map +1 -0
- package/dist/{suite-DPJMIEbu.d.ts → suite-qyOGre2g.d.ts} +2 -2
- package/dist/types-Bac8_Ixb.js +246 -0
- package/dist/types-Bac8_Ixb.js.map +1 -0
- package/dist/{types-CD3TwOtZ.d.ts → types-CLt4Yygc.d.ts} +2 -2
- package/dist/{types-B9H4IZtA.d.ts → types-D0HR2WnP.d.ts} +9 -2
- package/dist/types-DFMpv_HJ.d.ts +77 -0
- package/package.json +11 -2
- package/schemas/eval-run-envelope.schema.json +193 -183
- package/dist/claude-code-DZ4Vkgp6.js.map +0 -1
- package/dist/loader-C9yQHUPC.d.ts +0 -50
- package/dist/loader-DcI0KfRX.js.map +0 -1
- package/dist/projections-BcX7w-f6.js.map +0 -1
- package/dist/suite-Dlzl-HI0.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# @alis-build/harness-eval
|
|
2
2
|
|
|
3
|
-
Statistical eval framework for **AI coding agent harnesses
|
|
3
|
+
Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
|
|
4
|
+
|
|
5
|
+
**Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
|
|
4
6
|
|
|
5
7
|
**Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
|
|
6
8
|
|
|
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
|
|
|
9
11
|
## Requirements
|
|
10
12
|
|
|
11
13
|
- Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
14
|
+
- A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
|
|
15
|
+
- **`claude-code`** — `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
|
|
16
|
+
- **`codex`** — `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
|
|
17
|
+
- **`gemini-cli`** — `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
|
|
18
|
+
|
|
19
|
+
### Authentication (by adapter)
|
|
20
|
+
|
|
21
|
+
| Adapter | Typical auth |
|
|
22
|
+
| ------- | ------------ |
|
|
23
|
+
| **Claude Code** | `claude login` with `claudeCode.isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
|
|
24
|
+
| **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
|
|
25
|
+
| **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
|
|
26
|
+
|
|
27
|
+
Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
|
|
16
28
|
|
|
17
29
|
---
|
|
18
30
|
|
|
@@ -54,13 +66,18 @@ pnpm exec harness-eval --help
|
|
|
54
66
|
|
|
55
67
|
Suites are YAML files. Committed examples:
|
|
56
68
|
|
|
57
|
-
- [`examples/
|
|
58
|
-
- [`examples/
|
|
69
|
+
- [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
|
|
70
|
+
- [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
|
|
71
|
+
- [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
|
|
72
|
+
- [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
|
|
73
|
+
- [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
|
|
59
74
|
- [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
|
|
60
|
-
- [`examples/grading.yaml`](examples/grading.yaml) —
|
|
75
|
+
- [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
|
|
76
|
+
- [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
|
|
77
|
+
- [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
|
|
61
78
|
|
|
62
79
|
```yaml
|
|
63
|
-
adapter: claude-code
|
|
80
|
+
adapter: claude-code # or: codex | gemini-cli
|
|
64
81
|
|
|
65
82
|
defaultConfig:
|
|
66
83
|
model: claude-sonnet-4-6
|
|
@@ -94,17 +111,21 @@ cases:
|
|
|
94
111
|
- "The summary is grounded in README content, not a generic refusal"
|
|
95
112
|
```
|
|
96
113
|
|
|
97
|
-
Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level.
|
|
114
|
+
Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
|
|
98
115
|
|
|
99
|
-
**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, `
|
|
116
|
+
**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
|
|
100
117
|
|
|
101
118
|
### 2. Run behavioral eval
|
|
102
119
|
|
|
103
120
|
```bash
|
|
121
|
+
# Unified pipeline (run + optional grade + envelope when pipeline: is defined)
|
|
122
|
+
npx @alis-build/harness-eval pipeline examples/pipeline/
|
|
123
|
+
|
|
124
|
+
# Or run harness only
|
|
104
125
|
npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
|
|
105
126
|
```
|
|
106
127
|
|
|
107
|
-
This spawns
|
|
128
|
+
This spawns the configured harness CLI headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
|
|
108
129
|
|
|
109
130
|
**Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
|
|
110
131
|
|
|
@@ -112,13 +133,20 @@ Exit code `0` = all cells passed all assertion thresholds.
|
|
|
112
133
|
|
|
113
134
|
### 3. Grade outcomes (optional)
|
|
114
135
|
|
|
115
|
-
|
|
136
|
+
**Unified suite:** add a top-level `judge:` block in `suite.yaml` (see [`examples/pipeline/suite.yaml`](examples/pipeline/suite.yaml)), then:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.yaml --output grading.json --max-concurrent 1 --format console
|
|
140
|
+
# or: npx @alis-build/harness-eval pipeline examples/pipeline/ --steps grade
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
**Standalone grading file:** judge config in a separate **`grading.yaml`** (still supported). See [`examples/grading.yaml`](examples/grading.yaml).
|
|
116
144
|
|
|
117
145
|
```bash
|
|
118
146
|
npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
|
|
119
147
|
```
|
|
120
148
|
|
|
121
|
-
Runs a separate
|
|
149
|
+
Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
|
|
122
150
|
|
|
123
151
|
Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
|
|
124
152
|
|
|
@@ -126,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
|
|
|
126
154
|
|
|
127
155
|
## Data contracts & schemas
|
|
128
156
|
|
|
129
|
-
harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not
|
|
157
|
+
harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
|
|
130
158
|
|
|
131
159
|
### Layering
|
|
132
160
|
|
|
133
161
|
| Layer | Type | Where | Use for |
|
|
134
162
|
| --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
|
|
135
|
-
| Vendor stream | `StreamEvent` | `src/types/stream.ts` |
|
|
163
|
+
| Vendor stream | `StreamEvent` | `src/types/stream.ts` | Adapter debug only (Claude/Codex/Gemini NDJSON) |
|
|
136
164
|
| Harness session | **`TrajectoryView`** | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input |
|
|
137
165
|
| Run report | **`SuiteReport`** | `report.json` from `run` | Runner output; full trajectories + assertion stats |
|
|
138
166
|
| Eval record | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()` | CI gates, APIs, DB storage |
|
|
@@ -256,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
|
|
|
256
284
|
| ------------------------ | ------------------------------ | ------------------------------------------ |
|
|
257
285
|
| Headless harness runs | `run` / `runSuite` | — |
|
|
258
286
|
| Tool-call behavior | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls` |
|
|
259
|
-
| Outcome / rubric scoring | `grade` (
|
|
287
|
+
| Outcome / rubric scoring | `grade` (built-in judges) | Your judge, eval platform, or human review |
|
|
260
288
|
| Storage contract | `EvalRunEnvelope` | Same envelope; attach `externalScores` |
|
|
261
289
|
|
|
262
290
|
### Pattern 1 — Behavioral only (no LLM judge)
|
|
@@ -293,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
|
|
|
293
321
|
const grading = await gradeReport(report, { gradeFn: myJudge });
|
|
294
322
|
```
|
|
295
323
|
|
|
296
|
-
Output is the same `SuiteGradingReport` shape as the built-in
|
|
324
|
+
Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
|
|
297
325
|
|
|
298
326
|
### Pattern 3 — Separate judge pipeline (any language)
|
|
299
327
|
|
|
@@ -327,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
|
|
|
327
355
|
];
|
|
328
356
|
```
|
|
329
357
|
|
|
330
|
-
**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw
|
|
358
|
+
**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
|
|
331
359
|
|
|
332
360
|
### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
|
|
333
361
|
|
|
@@ -389,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
|
|
|
389
417
|
| Layer | Command | What it checks | Mechanism |
|
|
390
418
|
| ------------ | ------- | --------------------------------------- | -------------------------------------------- |
|
|
391
419
|
| **Behavior** | `run` | Tool calls, order, args, efficiency | Deterministic assertions on `TrajectoryView` |
|
|
392
|
-
| **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse`
|
|
420
|
+
| **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
|
|
393
421
|
|
|
394
422
|
Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
|
|
395
423
|
|
|
@@ -401,6 +429,7 @@ Both layers use statistical thresholds: a case runs `repetitions` times per matr
|
|
|
401
429
|
npx @alis-build/harness-eval run <suite.yaml> [options]
|
|
402
430
|
npx @alis-build/harness-eval grade <report.json> [options]
|
|
403
431
|
npx @alis-build/harness-eval envelope <report.json> [options]
|
|
432
|
+
npx @alis-build/harness-eval pipeline <suite.yaml|dir> [options]
|
|
404
433
|
npx @alis-build/harness-eval format <report.json> [options]
|
|
405
434
|
npx @alis-build/harness-eval --help
|
|
406
435
|
```
|
|
@@ -422,12 +451,12 @@ npx @alis-build/harness-eval --help
|
|
|
422
451
|
|
|
423
452
|
### `grade`
|
|
424
453
|
|
|
425
|
-
Uses
|
|
454
|
+
Uses **`grading.yaml`**, an inline **`judge:`** block in `suite.yaml` (`--suite`), or adapter-specific grading files under `examples/`.
|
|
426
455
|
|
|
427
456
|
**Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
|
|
428
457
|
|
|
429
458
|
```yaml
|
|
430
|
-
# examples/grading.yaml
|
|
459
|
+
# examples/grading.yaml (Claude Code judge)
|
|
431
460
|
judge:
|
|
432
461
|
adapter: claude-code
|
|
433
462
|
model: claude-sonnet-4-6
|
|
@@ -437,25 +466,38 @@ judge:
|
|
|
437
466
|
permissionMode: bypassPermissions
|
|
438
467
|
```
|
|
439
468
|
|
|
469
|
+
Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
|
|
470
|
+
|
|
440
471
|
```bash
|
|
441
472
|
npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
|
|
473
|
+
npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
|
|
474
|
+
npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
|
|
442
475
|
```
|
|
443
476
|
|
|
444
477
|
| Option | Description |
|
|
445
478
|
| -------------------------------------- | ----------------------------------------------------------------- |
|
|
446
|
-
| `--config <path>` | Grading YAML (`judge` block) — model, env, timeout,
|
|
479
|
+
| `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, adapter options |
|
|
480
|
+
| `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
|
|
447
481
|
| `--output <path>` | Write grading JSON |
|
|
448
482
|
| `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
|
|
449
483
|
| `--format console\|json` | Output format |
|
|
450
484
|
| `--model <id>` | Overrides `judge.model` in config |
|
|
451
|
-
| `--binary <path>` | Overrides
|
|
485
|
+
| `--binary <path>` | Overrides judge binary for the selected adapter |
|
|
452
486
|
| `--timeout-ms <n>` | Overrides `judge.timeoutMs` |
|
|
453
487
|
| `--max-concurrent <n>` | Overrides `judge.maxConcurrent` (default: 2 if unset) |
|
|
454
488
|
| `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
|
|
455
489
|
|
|
456
490
|
CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
|
|
457
491
|
|
|
458
|
-
|
|
492
|
+
**Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
|
|
493
|
+
|
|
494
|
+
| Adapter | Defaults (summary) |
|
|
495
|
+
| ------- | ------------------ |
|
|
496
|
+
| `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; JSON output |
|
|
497
|
+
| `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
|
|
498
|
+
| `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
|
|
499
|
+
|
|
500
|
+
See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
|
|
459
501
|
|
|
460
502
|
Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
|
|
461
503
|
|
|
@@ -485,6 +527,28 @@ npx @alis-build/harness-eval envelope report.json --projection instances --outpu
|
|
|
485
527
|
|
|
486
528
|
Exit codes: `0` = envelope built and behavioral pass; `1` = built but behavioral failures; `2` = usage or file errors.
|
|
487
529
|
|
|
530
|
+
### `pipeline`
|
|
531
|
+
|
|
532
|
+
Orchestrate **run → grade → envelope** from a unified `suite.yaml` when a `pipeline:` block is present. See [docs/suite-config.md — Pipeline orchestration](docs/suite-config.md#pipeline-orchestration-pipeline).
|
|
533
|
+
|
|
534
|
+
```bash
|
|
535
|
+
npx @alis-build/harness-eval pipeline examples/pipeline/
|
|
536
|
+
npx @alis-build/harness-eval pipeline my-suite/ --steps run,grade
|
|
537
|
+
```
|
|
538
|
+
|
|
539
|
+
| Option | Description |
|
|
540
|
+
| ------ | ----------- |
|
|
541
|
+
| `--steps run,grade,envelope` | Subset of configured steps (default: all configured) |
|
|
542
|
+
| `--output <path>` | Override `pipeline.run.output` |
|
|
543
|
+
| `--report <path>` | Override report input for grade/envelope |
|
|
544
|
+
| `--grading <path>` | Override grading input for envelope |
|
|
545
|
+
| `--grading-output <path>` | Override `pipeline.grade.output` |
|
|
546
|
+
| `--envelope-output <path>` | Override `pipeline.envelope.output` |
|
|
547
|
+
| `--projection envelope\|trajectory\|instances` | Envelope projection |
|
|
548
|
+
| `--max-concurrent <n>` | Parallel harness/judge workers |
|
|
549
|
+
|
|
550
|
+
Exit codes match the first failing step (`run`, `grade`, or `envelope`). Returns `2` when no `pipeline:` block exists.
|
|
551
|
+
|
|
488
552
|
### `format`
|
|
489
553
|
|
|
490
554
|
Re-render an existing `report.json` without re-running the harness.
|
|
@@ -547,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
|
|
|
547
611
|
|
|
548
612
|
---
|
|
549
613
|
|
|
550
|
-
##
|
|
614
|
+
## Harness adapters
|
|
615
|
+
|
|
616
|
+
Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
|
|
551
617
|
|
|
552
|
-
|
|
618
|
+
| Adapter | Suite key | Example suite | Example judge |
|
|
619
|
+
| ------- | --------- | ------------- | ------------- |
|
|
620
|
+
| Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
|
|
621
|
+
| Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
|
|
622
|
+
| Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
|
|
623
|
+
|
|
624
|
+
Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
|
|
553
625
|
|
|
554
626
|
1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
|
|
555
627
|
2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
|
|
@@ -564,7 +636,7 @@ import {
|
|
|
564
636
|
} from "@alis-build/harness-eval";
|
|
565
637
|
|
|
566
638
|
registerAdapter("my-harness", myAdapter);
|
|
567
|
-
console.log(listAdapters()); // ["claude-code", "
|
|
639
|
+
console.log(listAdapters()); // ["claude-code", "codex", "gemini-cli", …]
|
|
568
640
|
```
|
|
569
641
|
|
|
570
642
|
Duplicate registration throws so accidental overrides fail fast during startup or tests.
|
|
@@ -620,12 +692,92 @@ The adapter captures Claude’s stream-json output and builds a `TrajectoryView`
|
|
|
620
692
|
|
|
621
693
|
---
|
|
622
694
|
|
|
695
|
+
## Codex CLI adapter
|
|
696
|
+
|
|
697
|
+
Nested under `codex` in YAML (or flat in programmatic config). Maps to [Codex CLI reference](https://developers.openai.com/codex/cli/reference) (`codex exec` flags).
|
|
698
|
+
|
|
699
|
+
The harness adapter invokes:
|
|
700
|
+
|
|
701
|
+
```bash
|
|
702
|
+
codex --ask-for-approval never exec --json [exec flags…] "<prompt>"
|
|
703
|
+
```
|
|
704
|
+
|
|
705
|
+
`--ask-for-approval` is a **global** flag (before `exec`); other options attach to the `exec` subcommand.
|
|
706
|
+
|
|
707
|
+
| Field | CLI flag | Notes |
|
|
708
|
+
| ----- | -------- | ----- |
|
|
709
|
+
| `binary` | — | Default `codex` |
|
|
710
|
+
| `model` | `--model` | Also settable at top level |
|
|
711
|
+
| `profile` | `--profile` | Layer `$CODEX_HOME/<profile>.config.toml` |
|
|
712
|
+
| `sandbox` | `--sandbox` | `read-only`, `workspace-write`, `danger-full-access` |
|
|
713
|
+
| `addDirs` | `--add-dir` | Extra writable dirs (repeatable) |
|
|
714
|
+
| `configOverrides` | `-c key=value` | Inline TOML overrides (repeatable) |
|
|
715
|
+
| `askForApproval` | `--ask-for-approval` | Default `never` for non-interactive eval |
|
|
716
|
+
| `dangerouslyBypassApprovalsAndSandbox` | `--yolo` | Hardened CI only |
|
|
717
|
+
| `dangerouslyBypassHookTrust` | `--dangerously-bypass-hook-trust` | Automation with vetted hooks |
|
|
718
|
+
| `ephemeral` | `--ephemeral` | No session rollout files |
|
|
719
|
+
| `ignoreUserConfig` | `--ignore-user-config` | Skip `$CODEX_HOME/config.toml` |
|
|
720
|
+
| `skipGitRepoCheck` | `--skip-git-repo-check` | Allow runs outside git repos |
|
|
721
|
+
| `outputSchema` | `--output-schema` | JSON Schema for structured final output |
|
|
722
|
+
| `outputLastMessage` | `--output-last-message` | Write final assistant message to file (auto temp path when `captureLastMessage` is true) |
|
|
723
|
+
| `captureLastMessage` | — | Default `true`: auto `--output-last-message` and read into `finalResponse` if JSONL has no assistant text |
|
|
724
|
+
| `isolateConfig` | — | `false` (default) = inherit `~/.codex`; `true` = temp `$CODEX_HOME` per run |
|
|
725
|
+
|
|
726
|
+
Generic `cwd` sets the child process working directory (`--cd`). MCP tool calls in Codex `--json` output map to harness names `mcp__<server>__<tool>`; shell commands map to `Bash`.
|
|
727
|
+
|
|
728
|
+
The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/codex/` — CI does not require `codex` on `PATH`.
|
|
729
|
+
|
|
730
|
+
**Example suite:** [examples/codex-basic.yaml](examples/codex-basic.yaml)
|
|
731
|
+
|
|
732
|
+
**Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
|
|
733
|
+
|
|
734
|
+
**Package export:** `@alis-build/harness-eval/adapters/codex`
|
|
735
|
+
|
|
736
|
+
---
|
|
737
|
+
|
|
738
|
+
## Gemini CLI adapter
|
|
739
|
+
|
|
740
|
+
Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
|
|
741
|
+
|
|
742
|
+
The harness adapter invokes:
|
|
743
|
+
|
|
744
|
+
```bash
|
|
745
|
+
gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
|
|
746
|
+
```
|
|
747
|
+
|
|
748
|
+
| Field | CLI flag | Notes |
|
|
749
|
+
| ----- | -------- | ----- |
|
|
750
|
+
| `binary` | — | Default `gemini` |
|
|
751
|
+
| `model` | `--model` | Also settable at top level |
|
|
752
|
+
| `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
|
|
753
|
+
| `sandbox` | `--sandbox` | Sandboxed execution |
|
|
754
|
+
| `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
|
|
755
|
+
| `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
|
|
756
|
+
| `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
|
|
757
|
+
| `extensions` | `--extensions` | Extension allowlist |
|
|
758
|
+
| `debug` | `--debug` | Verbose logging |
|
|
759
|
+
| `isolateConfig` | — | `false` (default) = inherit caller config; `true` = temp config dir per run |
|
|
760
|
+
|
|
761
|
+
MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
|
|
762
|
+
|
|
763
|
+
The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
|
|
764
|
+
|
|
765
|
+
**Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
|
|
766
|
+
|
|
767
|
+
**Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
|
|
768
|
+
|
|
769
|
+
**Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
|
|
770
|
+
|
|
771
|
+
---
|
|
772
|
+
|
|
623
773
|
## Library API
|
|
624
774
|
|
|
625
775
|
```typescript
|
|
626
776
|
import {
|
|
627
777
|
loadSuite,
|
|
778
|
+
loadSuiteDocument,
|
|
628
779
|
runSuite,
|
|
780
|
+
runPipeline,
|
|
629
781
|
gradeReport,
|
|
630
782
|
buildEvalRunEnvelope,
|
|
631
783
|
trajectoryToTranscript,
|
|
@@ -635,6 +787,11 @@ import {
|
|
|
635
787
|
} from "@alis-build/harness-eval";
|
|
636
788
|
import { loadGradingConfig } from "@alis-build/harness-eval/config";
|
|
637
789
|
|
|
790
|
+
// Unified pipeline
|
|
791
|
+
const doc = await loadSuiteDocument("./examples/pipeline/suite.yaml");
|
|
792
|
+
const { exitCode } = await runPipeline(doc, { maxConcurrent: 2 });
|
|
793
|
+
|
|
794
|
+
// Or step-by-step
|
|
638
795
|
const suite = await loadSuite("./examples/basic.yaml");
|
|
639
796
|
const report = await runSuite(suite, { maxConcurrent: 2 });
|
|
640
797
|
|
|
@@ -659,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
|
|
|
659
816
|
});
|
|
660
817
|
```
|
|
661
818
|
|
|
662
|
-
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`.
|
|
819
|
+
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
|
|
663
820
|
|
|
664
821
|
---
|
|
665
822
|
|
|
@@ -682,7 +839,7 @@ Suite YAML → runSuite → Harness adapter → TrajectoryView
|
|
|
682
839
|
EvalRunEnvelope → DB / CI / API
|
|
683
840
|
```
|
|
684
841
|
|
|
685
|
-
- **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
|
|
842
|
+
- **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
|
|
686
843
|
- **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
|
|
687
844
|
- **OTLP** — observability side export; not required for scoring.
|
|
688
845
|
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-
|
|
2
|
-
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-
|
|
1
|
+
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
|
|
2
|
+
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
|
|
3
3
|
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
|
|
@@ -1,2 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { t as AdapterError } from "../../types-Bac8_Ixb.js";
|
|
2
|
+
import { r as runClaudeCode, t as claudeCodeAdapter } from "../../claude-code-C_7hxC8z.js";
|
|
2
3
|
export { AdapterError, claudeCodeAdapter, runClaudeCode };
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
|
|
2
|
+
import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
|
|
3
|
+
|
|
4
|
+
//#region src/adapters/codex/map-events.d.ts
|
|
5
|
+
/** Stateful mapper — tracks session id and pending tool calls across the stream. */
|
|
6
|
+
declare class CodexEventMapper {
|
|
7
|
+
private sessionId;
|
|
8
|
+
private sawInit;
|
|
9
|
+
private startedItems;
|
|
10
|
+
private turnCount;
|
|
11
|
+
/** Map one parsed Codex JSON object to zero or more stream events. */
|
|
12
|
+
map(event: CodexJsonEvent): StreamEvent[];
|
|
13
|
+
private buildInit;
|
|
14
|
+
private ensureInit;
|
|
15
|
+
private mapItemStarted;
|
|
16
|
+
private mapItemCompleted;
|
|
17
|
+
private toolUseEvent;
|
|
18
|
+
private commandUseEvent;
|
|
19
|
+
private toolResultEvent;
|
|
20
|
+
private buildResult;
|
|
21
|
+
}
|
|
22
|
+
/** Map an entire fixture or stream of Codex events through a fresh mapper. */
|
|
23
|
+
declare function mapCodexEvents(events: CodexJsonEvent[]): StreamEvent[];
|
|
24
|
+
/** Build harness-qualified MCP tool name from Codex server + tool fields. */
|
|
25
|
+
declare function mcpToolName(server: string, tool: string): string;
|
|
26
|
+
//#endregion
|
|
27
|
+
//#region src/adapters/codex/flags.d.ts
|
|
28
|
+
/** Add global flags to `args`; these must appear before the `exec` subcommand. */
|
|
29
|
+
declare function appendGlobalCodexFlags(args: string[], config: CodexOptions): void;
|
|
30
|
+
/** Append `codex exec` subcommand flags (after `exec`, before prompt). */
|
|
31
|
+
declare function appendExecCodexFlags(args: string[], config: CodexOptions & {
|
|
32
|
+
model?: string;
|
|
33
|
+
cwd?: string;
|
|
34
|
+
}): void;
|
|
35
|
+
/** @deprecated Use appendGlobalCodexFlags + appendExecCodexFlags */
|
|
36
|
+
declare function appendCodexFlags(args: string[], config: CodexOptions & {
|
|
37
|
+
model?: string;
|
|
38
|
+
cwd?: string;
|
|
39
|
+
}): void;
|
|
40
|
+
/**
|
|
41
|
+
* Ensure harness runs pass `--output-last-message` when capture is enabled.
|
|
42
|
+
* Returns the auto-generated path (for cleanup), or null if unchanged.
|
|
43
|
+
*/
|
|
44
|
+
declare function ensureHarnessOutputLastMessage(config: CodexAdapterConfig): string | null;
|
|
45
|
+
/**
|
|
46
|
+
* Build argv for `codex --ask-for-approval never exec --json … "<prompt>"`.
|
|
47
|
+
*
|
|
48
|
+
* Expects `config.outputLastMessage` to already be set if capture is desired;
|
|
49
|
+
* call {@link ensureHarnessOutputLastMessage} before this if spawning outside
|
|
50
|
+
* of {@link spawnCodex}.
|
|
51
|
+
*/
|
|
52
|
+
declare function buildArgs(config: CodexAdapterConfig): string[];
|
|
53
|
+
/**
|
|
54
|
+
* Build argv for `codex --ask-for-approval never exec … "<prompt>"` (no `--json`).
|
|
55
|
+
*/
|
|
56
|
+
declare function buildJudgeArgs(prompt: string, config?: CodexOptions & {
|
|
57
|
+
model?: string;
|
|
58
|
+
cwd?: string;
|
|
59
|
+
}): string[];
|
|
60
|
+
//#endregion
|
|
61
|
+
//#region src/adapters/codex/index.d.ts
|
|
62
|
+
/** Run Codex in headless `exec --json` mode and return a trajectory. */
|
|
63
|
+
declare function runCodex(config: CodexAdapterConfig): Promise<CodexAdapterResult>;
|
|
64
|
+
/** Registered {@link HarnessAdapter} for Codex CLI headless runs. */
|
|
65
|
+
declare const codexAdapter: HarnessAdapter<CodexAdapterConfig>;
|
|
66
|
+
//#endregion
|
|
67
|
+
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type CodexAdapterConfig, type CodexAdapterResult, CodexEventMapper, type CodexOptions, type ParseErrorRecord, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
|
|
68
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
import { t as AdapterError } from "../../types-Bac8_Ixb.js";
|
|
2
|
+
import { a as appendGlobalCodexFlags, c as ensureHarnessOutputLastMessage, d as mcpToolName, i as appendExecCodexFlags, l as CodexEventMapper, n as runCodex, o as buildArgs, r as appendCodexFlags, s as buildJudgeArgs, t as codexAdapter, u as mapCodexEvents } from "../../codex-0cHO2te9.js";
|
|
3
|
+
export { AdapterError, CodexEventMapper, appendCodexFlags, appendExecCodexFlags, appendGlobalCodexFlags, buildArgs, buildJudgeArgs, codexAdapter, ensureHarnessOutputLastMessage, mapCodexEvents, mcpToolName, runCodex };
|