@alis-build/harness-eval 0.1.3 → 0.1.4
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +101 -28
- package/dist/adapters/claude-code/index.d.ts +2 -2
- package/dist/adapters/codex/index.d.ts +2 -2
- package/dist/cli/bin.js +3 -3
- package/dist/config/loader.d.ts +1 -1
- package/dist/config/loader.js +1 -1
- package/dist/{index-DnvP1UBl.d.ts → index-C56AEDUr.d.ts} +2 -2
- package/dist/index.d.ts +9 -7
- package/dist/index.js +3 -3
- package/dist/{loader-DnQ6Jt0i.js → loader-CiBm4Kf6.js} +21 -2
- package/dist/loader-CiBm4Kf6.js.map +1 -0
- package/dist/{loader-B1WmGGzf.d.ts → loader-CrmzNwkq.d.ts} +3 -3
- package/dist/{reporter-Biy-5-9M.js → reporter-BKCJZRYr.js} +186 -17
- package/dist/reporter-BKCJZRYr.js.map +1 -0
- package/dist/runner/suite.d.ts +1 -1
- package/dist/runner/suite.js +1 -1
- package/dist/{suite-BcP64nlb.js → suite-C3-8EjUW.js} +543 -3
- package/dist/suite-C3-8EjUW.js.map +1 -0
- package/dist/{suite-BEShV0by.d.ts → suite-qyOGre2g.d.ts} +2 -2
- package/dist/types-Bac8_Ixb.js.map +1 -1
- package/dist/{types-0QkNVyp9.d.ts → types-CLt4Yygc.d.ts} +2 -2
- package/dist/{types-C0gBkl0-.d.ts → types-D0HR2WnP.d.ts} +8 -2
- package/dist/{types-Bu8uOZZN.d.ts → types-DFMpv_HJ.d.ts} +2 -2
- package/package.json +6 -1
- package/schemas/eval-run-envelope.schema.json +193 -183
- package/dist/loader-DnQ6Jt0i.js.map +0 -1
- package/dist/reporter-Biy-5-9M.js.map +0 -1
- package/dist/suite-BcP64nlb.js.map +0 -1
package/README.md
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# @alis-build/harness-eval
|
|
2
2
|
|
|
3
|
-
Statistical eval framework for **AI coding agent harnesses
|
|
3
|
+
Statistical eval framework for **AI coding agent harnesses**. Run real headless harness sessions, capture tool trajectories, and score behavior and outcomes across many repetitions and configurations.
|
|
4
|
+
|
|
5
|
+
**Built-in harness adapters:** `claude-code`, `codex`, and `gemini-cli`. Set `adapter:` in suite YAML; the runner, assertions, and eval interchange stay the same regardless of vendor.
|
|
4
6
|
|
|
5
7
|
**Use it to answer:** “When users ask X, does this harness actually call our MCP tools — reliably, in this plugin/model setup?”
|
|
6
8
|
|
|
@@ -9,10 +11,20 @@ Statistical eval framework for **AI coding agent harnesses** (Claude Code today;
|
|
|
9
11
|
## Requirements
|
|
10
12
|
|
|
11
13
|
- Node.js ≥ 22.12 required; Node 24 LTS recommended for development and CI
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
14
|
+
- A harness CLI on `PATH` for the adapter you use (see [Harness adapters](#harness-adapters)):
|
|
15
|
+
- **`claude-code`** — `claude` ([Claude Code CLI](https://code.claude.com/docs/en/cli-reference))
|
|
16
|
+
- **`codex`** — `codex` ([Codex CLI](https://developers.openai.com/codex/cli/reference))
|
|
17
|
+
- **`gemini-cli`** — `gemini` ([Gemini CLI](https://geminicli.com/docs/cli/cli-reference/))
|
|
18
|
+
|
|
19
|
+
### Authentication (by adapter)
|
|
20
|
+
|
|
21
|
+
| Adapter | Typical auth |
|
|
22
|
+
| ------- | ------------ |
|
|
23
|
+
| **Claude Code** | `claude login` with `isolateConfig: false`, or `ANTHROPIC_API_KEY` with isolated config (default harness behavior) |
|
|
24
|
+
| **Codex** | Logged-in `~/.codex`, or `OPENAI_API_KEY` when `codex.isolateConfig: true` |
|
|
25
|
+
| **Gemini CLI** | Logged-in Gemini CLI config with `geminiCli.isolateConfig: false`, or Vertex/API key env vars (`GOOGLE_APPLICATION_CREDENTIALS`, `GEMINI_API_KEY`, etc.) when isolated |
|
|
26
|
+
|
|
27
|
+
Each adapter section below documents `isolateConfig`, MCP setup, and headless flags in detail.
|
|
16
28
|
|
|
17
29
|
---
|
|
18
30
|
|
|
@@ -55,13 +67,17 @@ pnpm exec harness-eval --help
|
|
|
55
67
|
Suites are YAML files. Committed examples:
|
|
56
68
|
|
|
57
69
|
- [`examples/pipeline/`](examples/pipeline/) — **recommended** unified layout with inline `judge:` + `pipeline:` orchestration
|
|
58
|
-
- [`examples/basic.yaml`](examples/basic.yaml) —
|
|
59
|
-
- [`examples/
|
|
70
|
+
- [`examples/basic.yaml`](examples/basic.yaml) — Claude Code smoke test (`Read` on this repo's README)
|
|
71
|
+
- [`examples/codex-basic.yaml`](examples/codex-basic.yaml) — Codex CLI smoke test
|
|
72
|
+
- [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) — Gemini CLI smoke test
|
|
73
|
+
- [`examples/matrix.yaml`](examples/matrix.yaml) — Claude Code with a model matrix (sonnet vs opus)
|
|
60
74
|
- [`examples/multi-file/`](examples/multi-file/) — directory layout with `suite.yaml` plus cases under `cases/`
|
|
61
|
-
- [`examples/grading.yaml`](examples/grading.yaml) —
|
|
75
|
+
- [`examples/grading.yaml`](examples/grading.yaml) — Claude Code judge config (standalone)
|
|
76
|
+
- [`examples/codex-grading.yaml`](examples/codex-grading.yaml) — Codex judge config
|
|
77
|
+
- [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) — Gemini CLI judge config
|
|
62
78
|
|
|
63
79
|
```yaml
|
|
64
|
-
adapter: claude-code
|
|
80
|
+
adapter: claude-code # or: codex | gemini-cli
|
|
65
81
|
|
|
66
82
|
defaultConfig:
|
|
67
83
|
model: claude-sonnet-4-6
|
|
@@ -95,7 +111,7 @@ cases:
|
|
|
95
111
|
- "The summary is grounded in README content, not a generic refusal"
|
|
96
112
|
```
|
|
97
113
|
|
|
98
|
-
Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level.
|
|
114
|
+
Generic fields (`model`, `cwd`, `timeoutMs`, `env`) sit at the top level. Harness-specific options nest under `claudeCode`, `codex`, or `geminiCli` depending on `adapter`.
|
|
99
115
|
|
|
100
116
|
**Full suite & grading YAML reference:** [docs/suite-config.md](docs/suite-config.md) — all case/matrix fields, inline `judge:` / `pipeline:`, multi-file layout, and standalone `grading.yaml`.
|
|
101
117
|
|
|
@@ -109,7 +125,7 @@ npx @alis-build/harness-eval pipeline examples/pipeline/
|
|
|
109
125
|
npx @alis-build/harness-eval run examples/basic.yaml --output report.json --max-concurrent 1 --format console
|
|
110
126
|
```
|
|
111
127
|
|
|
112
|
-
This spawns
|
|
128
|
+
This spawns the configured harness CLI headless for each (case × matrix cell × repetition), evaluates **assertions** on the captured trajectory, and prints pass rates.
|
|
113
129
|
|
|
114
130
|
**Progress (stderr):** one line per repetition with ETA by default; use `--quiet` for dots or `--verbose` for tool/assertion detail.
|
|
115
131
|
|
|
@@ -130,7 +146,7 @@ npx @alis-build/harness-eval grade report.json --suite examples/pipeline/suite.y
|
|
|
130
146
|
npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json --max-concurrent 1 --format console
|
|
131
147
|
```
|
|
132
148
|
|
|
133
|
-
Runs a separate
|
|
149
|
+
Runs a separate harness subprocess as **judge** (`judge.adapter`: `claude-code`, `codex`, or `gemini-cli`) against the `expectations` in your suite (copied into `report.json`). Produces per-expectation PASS/FAIL with cited evidence.
|
|
134
150
|
|
|
135
151
|
Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2` = no expectations or no gradable repetitions.
|
|
136
152
|
|
|
@@ -138,13 +154,13 @@ Exit codes: `0` = all graded expectations passed; `1` = at least one failed; `2`
|
|
|
138
154
|
|
|
139
155
|
## Data contracts & schemas
|
|
140
156
|
|
|
141
|
-
harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not
|
|
157
|
+
harness-eval separates **vendor output** from **eval interchange**. Use the types below when wiring CI, a database, or an external judge — not raw adapter NDJSON or OTLP as your primary record.
|
|
142
158
|
|
|
143
159
|
### Layering
|
|
144
160
|
|
|
145
161
|
| Layer | Type | Where | Use for |
|
|
146
162
|
| --------------- | --------------------- | ------------------------- | -------------------------------------------------- |
|
|
147
|
-
| Vendor stream | `StreamEvent` | `src/types/stream.ts` |
|
|
163
|
+
| Vendor stream | `StreamEvent` | `src/types/stream.ts` | Adapter debug only (Claude/Codex/Gemini NDJSON) |
|
|
148
164
|
| Harness session | **`TrajectoryView`** | `src/types/trajectory.ts` | Assertions, trajectory queries, judge input |
|
|
149
165
|
| Run report | **`SuiteReport`** | `report.json` from `run` | Runner output; full trajectories + assertion stats |
|
|
150
166
|
| Eval record | **`EvalRunEnvelope`** | `buildEvalRunEnvelope()` | CI gates, APIs, DB storage |
|
|
@@ -268,7 +284,7 @@ You do not need `harness-eval grade` if you already have LangSmith, Braintrust,
|
|
|
268
284
|
| ------------------------ | ------------------------------ | ------------------------------------------ |
|
|
269
285
|
| Headless harness runs | `run` / `runSuite` | — |
|
|
270
286
|
| Tool-call behavior | Assertions on `TrajectoryView` | Optional: re-implement on `toolCalls` |
|
|
271
|
-
| Outcome / rubric scoring | `grade` (
|
|
287
|
+
| Outcome / rubric scoring | `grade` (built-in judges) | Your judge, eval platform, or human review |
|
|
272
288
|
| Storage contract | `EvalRunEnvelope` | Same envelope; attach `externalScores` |
|
|
273
289
|
|
|
274
290
|
### Pattern 1 — Behavioral only (no LLM judge)
|
|
@@ -305,7 +321,7 @@ const myJudge: GraderFn = async ({ prompt, transcript, expectations }) => {
|
|
|
305
321
|
const grading = await gradeReport(report, { gradeFn: myJudge });
|
|
306
322
|
```
|
|
307
323
|
|
|
308
|
-
Output is the same `SuiteGradingReport` shape as the built-in
|
|
324
|
+
Output is the same `SuiteGradingReport` shape as the built-in judges — merge into `EvalRunEnvelope` via `buildEvalRunEnvelope(report, { grading })`.
|
|
309
325
|
|
|
310
326
|
### Pattern 3 — Separate judge pipeline (any language)
|
|
311
327
|
|
|
@@ -339,7 +355,7 @@ envelope.cells[0].repetitions[0].externalScores = [
|
|
|
339
355
|
];
|
|
340
356
|
```
|
|
341
357
|
|
|
342
|
-
**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw
|
|
358
|
+
**Judges should use `trajectoryToTranscript(view, prompt)` or structured `toolCalls`** — not raw vendor NDJSON (adapter-specific and verbose).
|
|
343
359
|
|
|
344
360
|
### Pattern 4 — LangSmith, Braintrust, OpenAI Evals, etc.
|
|
345
361
|
|
|
@@ -401,7 +417,7 @@ Map your framework's output into these shapes (or use `externalScores`) so CI an
|
|
|
401
417
|
| Layer | Command | What it checks | Mechanism |
|
|
402
418
|
| ------------ | ------- | --------------------------------------- | -------------------------------------------- |
|
|
403
419
|
| **Behavior** | `run` | Tool calls, order, args, efficiency | Deterministic assertions on `TrajectoryView` |
|
|
404
|
-
| **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge on transcript + `finalResponse`
|
|
420
|
+
| **Outcome** | `grade` | Answer quality, grounding, completeness | LLM judge (`claude-code`, `codex`, or `gemini-cli`) on transcript + `finalResponse` |
|
|
405
421
|
|
|
406
422
|
Both layers use statistical thresholds: a case runs `repetitions` times per matrix cell, and each assertion/expectation has a pass-rate threshold (default `1.0`).
|
|
407
423
|
|
|
@@ -435,12 +451,12 @@ npx @alis-build/harness-eval --help
|
|
|
435
451
|
|
|
436
452
|
### `grade`
|
|
437
453
|
|
|
438
|
-
Uses **`grading.yaml
|
|
454
|
+
Uses **`grading.yaml`**, an inline **`judge:`** block in `suite.yaml` (`--suite`), or adapter-specific grading files under `examples/`.
|
|
439
455
|
|
|
440
456
|
**Field reference:** [docs/suite-config.md — Grading config](docs/suite-config.md#grading-config-gradingyaml)
|
|
441
457
|
|
|
442
458
|
```yaml
|
|
443
|
-
# examples/grading.yaml
|
|
459
|
+
# examples/grading.yaml (Claude Code judge)
|
|
444
460
|
judge:
|
|
445
461
|
adapter: claude-code
|
|
446
462
|
model: claude-sonnet-4-6
|
|
@@ -450,26 +466,38 @@ judge:
|
|
|
450
466
|
permissionMode: bypassPermissions
|
|
451
467
|
```
|
|
452
468
|
|
|
469
|
+
Other committed judge configs: [`examples/codex-grading.yaml`](examples/codex-grading.yaml) (`adapter: codex`), [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) (`adapter: gemini-cli`).
|
|
470
|
+
|
|
453
471
|
```bash
|
|
454
472
|
npx @alis-build/harness-eval grade report.json --config examples/grading.yaml --output grading.json
|
|
473
|
+
npx @alis-build/harness-eval grade report.json --config examples/codex-grading.yaml --output grading.json
|
|
474
|
+
npx @alis-build/harness-eval grade report.json --config examples/gemini-grading.yaml --output grading.json
|
|
455
475
|
```
|
|
456
476
|
|
|
457
477
|
| Option | Description |
|
|
458
478
|
| -------------------------------------- | ----------------------------------------------------------------- |
|
|
459
|
-
| `--config <path>` | Grading YAML (`judge` block) — model, env, timeout,
|
|
479
|
+
| `--config <path>` | Grading YAML (`judge` block) — model, env, timeout, adapter options |
|
|
460
480
|
| `--suite <path>` | Unified `suite.yaml` with inline `judge:` (alternative to `--config`) |
|
|
461
481
|
| `--output <path>` | Write grading JSON |
|
|
462
482
|
| `--expectations <path>` | Sidecar YAML/JSON if report lacks expectations |
|
|
463
483
|
| `--format console\|json` | Output format |
|
|
464
484
|
| `--model <id>` | Overrides `judge.model` in config |
|
|
465
|
-
| `--binary <path>` | Overrides
|
|
485
|
+
| `--binary <path>` | Overrides judge binary for the selected adapter |
|
|
466
486
|
| `--timeout-ms <n>` | Overrides `judge.timeoutMs` |
|
|
467
487
|
| `--max-concurrent <n>` | Overrides `judge.maxConcurrent` (default: 2 if unset) |
|
|
468
488
|
| `--quiet` / `--verbose` / `--progress` | Same progress modes as `run` (including `--color` / `--no-color`) |
|
|
469
489
|
|
|
470
490
|
CLI flags override the YAML file. Expectations still come from `report.json` (copied from the suite at `run` time) unless `--expectations` is set. The grading report may include `gradingConfigPath` when `--config` was used.
|
|
471
491
|
|
|
472
|
-
|
|
492
|
+
**Built-in judge defaults** (override under `judge.claudeCode`, `judge.codex`, or `judge.geminiCli`):
|
|
493
|
+
|
|
494
|
+
| Adapter | Defaults (summary) |
|
|
495
|
+
| ------- | ------------------ |
|
|
496
|
+
| `claude-code` | `maxTurns: 1`, `bare: true`, `disableSlashCommands: true`, `noSessionPersistence: true`, `permissionMode: bypassPermissions`; JSON output |
|
|
497
|
+
| `codex` | `ephemeral: true`, `ignoreUserConfig: true`, `skipGitRepoCheck: true`, `askForApproval: never` |
|
|
498
|
+
| `gemini-cli` | `approvalMode: yolo`, `isolateConfig: true`, `skipTrust: true`; `--output-format json` |
|
|
499
|
+
|
|
500
|
+
See [docs/suite-config.md](docs/suite-config.md) and each adapter section below for full flag tables.
|
|
473
501
|
|
|
474
502
|
Exit codes: `0` = all expectations passed; `1` = failures; `2` = no expectations or no gradable repetitions (harness failures without trajectories are skipped).
|
|
475
503
|
|
|
@@ -583,9 +611,17 @@ Define expected tool calls for Vertex trajectory metrics on the eval envelope. U
|
|
|
583
611
|
|
|
584
612
|
---
|
|
585
613
|
|
|
586
|
-
##
|
|
614
|
+
## Harness adapters
|
|
587
615
|
|
|
588
|
-
Built-in adapters register at module load.
|
|
616
|
+
Built-in adapters register at module load. Each has a dedicated section below with CLI flag mapping, examples, and judge configuration.
|
|
617
|
+
|
|
618
|
+
| Adapter | Suite key | Example suite | Example judge |
|
|
619
|
+
| ------- | --------- | ------------- | ------------- |
|
|
620
|
+
| Claude Code | `claudeCode` | [`examples/basic.yaml`](examples/basic.yaml) | [`examples/grading.yaml`](examples/grading.yaml) |
|
|
621
|
+
| Codex CLI | `codex` | [`examples/codex-basic.yaml`](examples/codex-basic.yaml) | [`examples/codex-grading.yaml`](examples/codex-grading.yaml) |
|
|
622
|
+
| Gemini CLI | `geminiCli` | [`examples/gemini-cli-basic.yaml`](examples/gemini-cli-basic.yaml) | [`examples/gemini-grading.yaml`](examples/gemini-grading.yaml) |
|
|
623
|
+
|
|
624
|
+
Additional harnesses (e.g. Antigravity CLI) plug in via the same pattern:
|
|
589
625
|
|
|
590
626
|
1. Implement `HarnessAdapter` under `src/adapters/<id>/` with a `run(config)` that returns a `TrajectoryView`.
|
|
591
627
|
2. Add a nested config key on `SuiteConfig` (e.g. `codex: { ... }`) for harness-specific options.
|
|
@@ -600,7 +636,7 @@ import {
|
|
|
600
636
|
} from "@alis-build/harness-eval";
|
|
601
637
|
|
|
602
638
|
registerAdapter("my-harness", myAdapter);
|
|
603
|
-
console.log(listAdapters()); // ["claude-code", "codex", "
|
|
639
|
+
console.log(listAdapters()); // ["claude-code", "codex", "gemini-cli", …]
|
|
604
640
|
```
|
|
605
641
|
|
|
606
642
|
Duplicate registration throws so accidental overrides fail fast during startup or tests.
|
|
@@ -695,6 +731,43 @@ The adapter maps Codex JSONL events into the shared `StreamEvent` shape and feed
|
|
|
695
731
|
|
|
696
732
|
**Codex judge:** set `judge.adapter: codex` and nest options under `judge.codex` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)).
|
|
697
733
|
|
|
734
|
+
**Package export:** `@alis-build/harness-eval/adapters/codex`
|
|
735
|
+
|
|
736
|
+
---
|
|
737
|
+
|
|
738
|
+
## Gemini CLI adapter
|
|
739
|
+
|
|
740
|
+
Nested under `geminiCli` in YAML (or flat in programmatic config). Maps to [Gemini CLI reference](https://geminicli.com/docs/cli/cli-reference/).
|
|
741
|
+
|
|
742
|
+
The harness adapter invokes:
|
|
743
|
+
|
|
744
|
+
```bash
|
|
745
|
+
gemini -p "<prompt>" --output-format stream-json --approval-mode yolo [flags…]
|
|
746
|
+
```
|
|
747
|
+
|
|
748
|
+
| Field | CLI flag | Notes |
|
|
749
|
+
| ----- | -------- | ----- |
|
|
750
|
+
| `binary` | — | Default `gemini` |
|
|
751
|
+
| `model` | `--model` | Also settable at top level |
|
|
752
|
+
| `approvalMode` | `--approval-mode` | Default `yolo`; overridable: `default`, `auto_edit`, `plan` |
|
|
753
|
+
| `sandbox` | `--sandbox` | Sandboxed execution |
|
|
754
|
+
| `skipTrust` | `--skip-trust` | Default `true` for harness and judge — skips folder trust in headless runs |
|
|
755
|
+
| `includeDirectories` | `--include-directories` | Extra workspace dirs (repeatable) |
|
|
756
|
+
| `allowedMcpServerNames` | `--allowed-mcp-server-names` | MCP server allowlist |
|
|
757
|
+
| `extensions` | `--extensions` | Extension allowlist |
|
|
758
|
+
| `debug` | `--debug` | Verbose logging |
|
|
759
|
+
| `isolateConfig` | — | `false` (harness default) = inherit caller config; `true` = temp config dir per run (the built-in judge defaults to `true`) |
|
|
760
|
+
|
|
761
|
+
MCP tool calls map to harness names `mcp__<server>__<tool>`; built-in Gemini tools keep native names (e.g. `Bash`, `read_file`).
|
|
762
|
+
|
|
763
|
+
The adapter maps Gemini stream-json events into the shared `StreamEvent` shape and feeds `TrajectoryBuilder`. Fixture-driven tests use committed recordings under `tests/fixtures/gemini-cli/` — CI does not require `gemini` on `PATH`.
|
|
764
|
+
|
|
765
|
+
**Example suite:** [examples/gemini-cli-basic.yaml](examples/gemini-cli-basic.yaml)
|
|
766
|
+
|
|
767
|
+
**Gemini CLI judge:** set `judge.adapter: gemini-cli` and nest options under `judge.geminiCli` in grading YAML (see [docs/suite-config.md](docs/suite-config.md)). Example: [examples/gemini-grading.yaml](examples/gemini-grading.yaml).
|
|
768
|
+
|
|
769
|
+
**Package export:** `@alis-build/harness-eval/adapters/gemini-cli`
|
|
770
|
+
|
|
698
771
|
---
|
|
699
772
|
|
|
700
773
|
## Library API
|
|
@@ -743,7 +816,7 @@ const envelope = buildEvalRunEnvelope(report, {
|
|
|
743
816
|
});
|
|
744
817
|
```
|
|
745
818
|
|
|
746
|
-
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`.
|
|
819
|
+
Subpath exports: `@alis-build/harness-eval/runner`, `@alis-build/harness-eval/config`, `@alis-build/harness-eval/adapters/claude-code`, `@alis-build/harness-eval/adapters/codex`, `@alis-build/harness-eval/adapters/gemini-cli`.
|
|
747
820
|
|
|
748
821
|
---
|
|
749
822
|
|
|
@@ -766,7 +839,7 @@ Suite YAML → runSuite → Harness adapter → TrajectoryView
|
|
|
766
839
|
EvalRunEnvelope → DB / CI / API
|
|
767
840
|
```
|
|
768
841
|
|
|
769
|
-
- **Pluggable harness adapters** — runner and assertions depend only on `TrajectoryView`.
|
|
842
|
+
- **Pluggable harness adapters** — `claude-code`, `codex`, and `gemini-cli` today; runner and assertions depend only on `TrajectoryView`.
|
|
770
843
|
- **Pluggable outcome layer** — built-in `grade`, custom `gradeFn`, or any external workflow.
|
|
771
844
|
- **OTLP** — observability side export; not required for scoring.
|
|
772
845
|
|
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-
|
|
2
|
-
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-
|
|
1
|
+
import { n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics } from "../../types-D0HR2WnP.js";
|
|
2
|
+
import { a as ClaudeCodeAdapterResult, i as ClaudeCodeAdapterConfig, o as ClaudeCodeOptions, r as runClaudeCode, s as PermissionMode, t as claudeCodeAdapter } from "../../index-C56AEDUr.js";
|
|
3
3
|
export { type AdapterDiagnostics, AdapterError, type AdapterResult, type ClaudeCodeAdapterConfig, type ClaudeCodeAdapterResult, type ClaudeCodeOptions, type ParseErrorRecord, type PermissionMode, claudeCodeAdapter, runClaudeCode };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-
|
|
2
|
-
import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-
|
|
1
|
+
import { a as HarnessAdapter, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "../../types-D0HR2WnP.js";
|
|
2
|
+
import { i as CodexOptions, n as CodexAdapterResult, r as CodexJsonEvent, t as CodexAdapterConfig } from "../../types-DFMpv_HJ.js";
|
|
3
3
|
|
|
4
4
|
//#region src/adapters/codex/map-events.d.ts
|
|
5
5
|
/** Stateful mapper — tracks session id and pending tool calls across the stream. */
|
package/dist/cli/bin.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-
|
|
3
|
-
import { t as runSuite, u as getAdapter } from "../suite-
|
|
4
|
-
import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-
|
|
2
|
+
import { F as loadSuiteReport, M as gradingReportPassed, N as resolveGradeOptions, P as gradeReport, a as envelopeCommand, c as getOptionInt, i as runPipeline, j as formatGradingConsole, l as hasOption, o as parseEnvelopeProjection, p as suiteDirectoryFromPath, r as trajectoryToOtlp, s as getOption, t as formatReport, u as parseArgs } from "../reporter-BKCJZRYr.js";
|
|
3
|
+
import { t as runSuite, u as getAdapter } from "../suite-C3-8EjUW.js";
|
|
4
|
+
import { i as loadGradingConfig, o as loadSuiteDocument, t as loadSuite } from "../loader-CiBm4Kf6.js";
|
|
5
5
|
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
6
6
|
import { dirname, isAbsolute, join } from "node:path";
|
|
7
7
|
import { fileURLToPath } from "node:url";
|
package/dist/config/loader.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-
|
|
1
|
+
import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, i as parseCasesFile, l as loadGradingConfig, n as parseSuite, o as SuiteDocument, r as parseSuiteDirectory, t as loadSuite, u as parseGradingConfig } from "../loader-CrmzNwkq.js";
|
|
2
2
|
export { ConfigError, type GradingConfig, type SuiteDocument, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };
|
package/dist/config/loader.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-
|
|
1
|
+
import { a as parseGradingConfig, c as parseCasesFile, i as loadGradingConfig, l as ConfigError, n as parseSuite, o as loadSuiteDocument, r as parseSuiteDirectory, t as loadSuite } from "../loader-CiBm4Kf6.js";
|
|
2
2
|
export { ConfigError, loadGradingConfig, loadSuite, loadSuiteDocument, parseCasesFile, parseGradingConfig, parseSuite, parseSuiteDirectory };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-
|
|
1
|
+
import { a as HarnessAdapter, i as BaseAdapterConfig, n as AdapterError, o as ParseErrorRecord, r as AdapterResult, t as AdapterDiagnostics, x as StreamEvent } from "./types-D0HR2WnP.js";
|
|
2
2
|
|
|
3
3
|
//#region src/adapters/claude-code/types.d.ts
|
|
4
4
|
/** Claude Code permission modes (`--permission-mode`). */
|
|
@@ -70,4 +70,4 @@ declare function runClaudeCode(config: ClaudeCodeAdapterConfig): Promise<ClaudeC
|
|
|
70
70
|
declare const claudeCodeAdapter: HarnessAdapter<ClaudeCodeAdapterConfig>;
|
|
71
71
|
//#endregion
|
|
72
72
|
export { ClaudeCodeAdapterResult as a, ClaudeCodeAdapterConfig as i, index_d_exports as n, ClaudeCodeOptions as o, runClaudeCode as r, PermissionMode as s, claudeCodeAdapter as t };
|
|
73
|
-
//# sourceMappingURL=index-
|
|
73
|
+
//# sourceMappingURL=index-C56AEDUr.d.ts.map
|
package/dist/index.d.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-
|
|
2
|
-
import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-
|
|
3
|
-
import { i as CodexOptions } from "./types-
|
|
4
|
-
import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-
|
|
5
|
-
import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-
|
|
6
|
-
import { t as runSuite } from "./suite-
|
|
1
|
+
import { A as Usage, B as isUserMessage, C as SystemInitEvent, D as TextBlock, E as SystemUnknownEvent, F as isSystemInit, I as isSystemRetry, L as isTextBlock, M as UserMessageEvent, N as isAssistantMessage, O as ToolResultBlock, P as isResult, R as isToolResultBlock, S as SystemCompactBoundaryEvent, T as SystemRetryEvent, _ as ContentBlock, a as HarnessAdapter, b as StopReason, c as AssistantTurn, d as ToolCall, f as TrajectoryView, g as AssistantMessageEvent, h as AssistantMessage, i as BaseAdapterConfig, j as UserMessage, k as ToolUseBlock, l as RetryRecord, m as namespaceOf, n as AdapterError, o as ParseErrorRecord, p as UsageSummary, r as AdapterResult, s as SuiteConfig, t as AdapterDiagnostics, u as SessionMeta, v as McpServerStatus, w as SystemPluginInstallEvent, x as StreamEvent, y as ResultEvent, z as isToolUseBlock } from "./types-D0HR2WnP.js";
|
|
2
|
+
import { n as index_d_exports, o as ClaudeCodeOptions } from "./index-C56AEDUr.js";
|
|
3
|
+
import { i as CodexOptions } from "./types-DFMpv_HJ.js";
|
|
4
|
+
import { A as ObjectPredicate, C as TrajectoryPairInstanceJson, D as Cardinality, E as AssertionResult, M as ThresholdedAssertion, N as ToolPattern, O as CompoundPredicate, S as TrajectoryInstancesJson, T as Assertion, _ as ProtojsonToolCall, a as ProgressEvent, b as ReferenceTrajectoryConfig, c as RunSuiteOptions, d as TestSuite, f as EvalDatasetRow, g as InstancesJsonlRow, h as InstanceData, i as ProgressCallback, j as Predicate, k as LeafPredicate, l as SuiteReport, m as HarnessMetrics, n as CellReport, o as RepetitionError, p as EvaluationInstanceJson, r as MatrixCell, s as RepetitionResult, t as AssertionStat, u as TestCase, v as ProtojsonTrajectory, w as TrajectorySingleToolUseInstanceJson, x as TrajectoryInstanceMetricKey, y as ReferenceToolNameMode } from "./types-CLt4Yygc.js";
|
|
5
|
+
import { a as loadSuiteDocument, c as GradingConfig, d as ConfigError, n as parseSuite, o as SuiteDocument, s as PipelineConfig, t as loadSuite } from "./loader-CrmzNwkq.js";
|
|
6
|
+
import { t as runSuite } from "./suite-qyOGre2g.js";
|
|
7
7
|
import { Readable } from "node:stream";
|
|
8
8
|
|
|
9
9
|
//#region src/grader/types.d.ts
|
|
@@ -87,11 +87,13 @@ interface GradeReportOptions {
|
|
|
87
87
|
/** Working directory for the judge subprocess. */
|
|
88
88
|
cwd?: string;
|
|
89
89
|
/** Grading adapter id. Default: `claude-code`. */
|
|
90
|
-
judgeAdapter?: "claude-code" | "codex";
|
|
90
|
+
judgeAdapter?: "claude-code" | "codex" | "gemini-cli";
|
|
91
91
|
/** Claude Code options for the judge (nested in grading YAML under `claudeCode`). */
|
|
92
92
|
claudeCode?: Record<string, unknown>;
|
|
93
93
|
/** Codex CLI options for the judge (nested in grading YAML under `codex`). */
|
|
94
94
|
codex?: Record<string, unknown>;
|
|
95
|
+
/** Gemini CLI options for the judge (nested in grading YAML under `geminiCli`). */
|
|
96
|
+
geminiCli?: Record<string, unknown>;
|
|
95
97
|
/** Path to grading YAML when `--config` was used. */
|
|
96
98
|
gradingConfigPath?: string;
|
|
97
99
|
/** Inject a custom grader (for tests). */
|
package/dist/index.js
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { a as isAssistantMessage, c as isSystemRetry, d as isToolUseBlock, f as isUserMessage, i as namespaceOf, l as isTextBlock, n as TrajectoryBuilder, o as isResult, r as buildTrajectory, s as isSystemInit, t as AdapterError, u as isToolResultBlock } from "./types-Bac8_Ixb.js";
|
|
2
|
-
import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-
|
|
2
|
+
import { A as serializeToolInput, B as TRAJECTORY_SCHEMA_VERSION, C as trajectoryExactMatch, D as trajectorySingleToolUse, E as trajectoryRecall, I as trajectoryToTranscript, L as createCodexGrader, M as gradingReportPassed, N as resolveGradeOptions, O as toEvaluationInstance, P as gradeReport, R as createClaudeGrader, S as trajectoryAnyOrderMatch, T as trajectoryPrecision, _ as buildEvalRunEnvelopeFromFiles, b as computeTrajectoryMetrics, d as resolveGradingArtifactFromSuite, f as resolvePipelineInputs, g as buildEvalRunEnvelope, h as toTrajectory, i as runPipeline, j as formatGradingConsole, k as toTrajectoryInstances, m as toInstancesJsonl, n as emitOtel, r as trajectoryToOtlp, t as formatReport, v as enrichRepetitionWithProtojson, w as trajectoryInOrderMatch, x as parseToolInput, y as toHarnessMetrics, z as EVAL_RUN_SCHEMA_VERSION } from "./reporter-BKCJZRYr.js";
|
|
3
3
|
import { a as parseStreamJson, n as claude_code_exports } from "./claude-code-C_7hxC8z.js";
|
|
4
|
-
import { a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters,
|
|
5
|
-
import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-
|
|
4
|
+
import { _ as evaluateAll, a as aggregateCell, c as runRepetition, d as getDefaultAdapter, f as listAdapters, g as evaluate, i as DEFAULT_THRESHOLD, l as DEFAULT_ADAPTER_ID, n as createLimit, o as getRepetitions, p as registerAdapter, r as DEFAULT_REPETITIONS, s as mergeConfig, t as runSuite, u as getAdapter } from "./suite-C3-8EjUW.js";
|
|
5
|
+
import { l as ConfigError, n as parseSuite, o as loadSuiteDocument, t as loadSuite } from "./loader-CiBm4Kf6.js";
|
|
6
6
|
//#region src/metrics/tool-calls.ts
|
|
7
7
|
/**
|
|
8
8
|
* Tool-call-level metrics operating on prediction/reference tool-call pairs.
|
|
@@ -58,6 +58,24 @@ const ClaudeCodeConfigSchema = z.object({
|
|
|
58
58
|
maxTurns: z.number().int().positive(),
|
|
59
59
|
isolateConfig: z.boolean()
|
|
60
60
|
}).partial();
|
|
61
|
+
/** Gemini CLI adapter-specific options (nested under `geminiCli` in suite YAML). */
|
|
62
|
+
const GeminiCliConfigSchema = z.object({
|
|
63
|
+
binary: z.string(),
|
|
64
|
+
approvalMode: z.enum([
|
|
65
|
+
"default",
|
|
66
|
+
"auto_edit",
|
|
67
|
+
"yolo",
|
|
68
|
+
"plan"
|
|
69
|
+
]),
|
|
70
|
+
sandbox: z.string(),
|
|
71
|
+
skipTrust: z.boolean(),
|
|
72
|
+
includeDirectories: z.array(z.string()),
|
|
73
|
+
allowedMcpServerNames: z.array(z.string()),
|
|
74
|
+
extensions: z.array(z.string()),
|
|
75
|
+
debug: z.boolean(),
|
|
76
|
+
/** Fresh temp `GEMINI_CONFIG_DIR` per run when true. */
|
|
77
|
+
isolateConfig: z.boolean()
|
|
78
|
+
}).partial();
|
|
61
79
|
/** Codex CLI adapter-specific options (nested under `codex`). */
|
|
62
80
|
const CodexConfigSchema = z.object({
|
|
63
81
|
binary: z.string(),
|
|
@@ -91,7 +109,8 @@ const ConfigPartialSchema = z.object({
|
|
|
91
109
|
timeoutMs: z.number().int().positive(),
|
|
92
110
|
env: z.record(z.string(), z.string()),
|
|
93
111
|
claudeCode: ClaudeCodeConfigSchema,
|
|
94
|
-
codex: CodexConfigSchema
|
|
112
|
+
codex: CodexConfigSchema,
|
|
113
|
+
geminiCli: GeminiCliConfigSchema
|
|
95
114
|
}).partial();
|
|
96
115
|
/** A matrix cell — one point in the configuration matrix. */
|
|
97
116
|
const MatrixCellSchema = z.object({
|
|
@@ -1264,4 +1283,4 @@ function formatZodError(err, sourcePath) {
|
|
|
1264
1283
|
//#endregion
|
|
1265
1284
|
export { parseGradingConfig as a, parseCasesFile as c, loadGradingConfig as i, ConfigError as l, parseSuite as n, loadSuiteDocument as o, parseSuiteDirectory as r, DEFAULT_PIPELINE_OUTPUTS as s, loadSuite as t };
|
|
1266
1285
|
|
|
1267
|
-
//# sourceMappingURL=loader-
|
|
1286
|
+
//# sourceMappingURL=loader-CiBm4Kf6.js.map
|